Index: libcfa/src/collections/list2.hfa
===================================================================
--- libcfa/src/collections/list2.hfa	(revision 5a955608021bdca9ca9543f62d1b85c72eded1fb)
+++ libcfa/src/collections/list2.hfa	(revision 8eb85de6eca63d11bcae7804339296c22bbdba35)
@@ -108,4 +108,7 @@
     #define ORIGIN_TAG_NEQ(v1, v2) 0
 
+    #define TAGSONLY(...)
+    #define NOTAGS(...) __VA_ARGS__
+
 #else // Normal
 
@@ -146,4 +149,7 @@
     )
 
+    #define TAGSONLY(...) __VA_ARGS__
+    #define NOTAGS(...)
+
 #endif
 
@@ -270,4 +276,32 @@
 }
 
+// Compile-time memory (cmem) barrier
+// Prevents the compiler from reordering (or caching in registers) memory accesses across it; emits no hardware fence
+// Originally included for correctness, though a broken state is not known to be reproducible.
+// Found to have a critical impact on performance:
+// - in the positions given by default: generally optimal
+// - absent: sometimes much slower, depending on the test harness
+// - in positions (that may be influenced by a principle but) that are arbitrary wrt microarchitecture: typically, much slower
+#ifdef __EXPERIMENTAL_DISABLE_CMEM_BARRIER__
+// upon request, disable cmem barriers
+#define MAYBE_CMEM_BARRIER
+#else
+// by default, enable cmem barriers
+#define MAYBE_CMEM_BARRIER asm( "" : : : "memory" )
+#endif
+
+// Insert read (location)
+// One of the read operations that occurs during an insert operation was found to be performance-critical under certain harnesses.
+// Arguably, the position should not matter if cmem barriers are off.  Treating the factors as independent allows for measuring this idea.
+#ifdef __EXPERIMENTAL_DELAY_INSERT_READ__
+// upon request: do the read late (between the cmem barriers); this location is where the read was originally found when this insert read first became a performance-perturbing hypothesis
+#define MAYBE_INSERT_READ_EARLY(...)
+#define MAYBE_INSERT_READ_LATE(...) __VA_ARGS__
+#else
+// by default: do the read early (before the first cmem barrier); better performance has been seen here
+#define MAYBE_INSERT_READ_EARLY(...) __VA_ARGS__
+#define MAYBE_INSERT_READ_LATE(...)
+#endif
+
 forall( tE &, tLinks & | embedded( tE, tLinks, dlink(tE) ) ) {
 
@@ -283,11 +317,16 @@
         dlink(tE) & linkToInsert = to_insert`inner;
       NOLOOSE(
+       TAGSONLY(
 		verify(ORIGIN_TAG_QUERY((size_t)linkToInsert.prev));
 		verify(ORIGIN_TAG_QUERY((size_t)linkToInsert.next));
+       )
 		verify(ORIGIN_TAG_CLEAR((size_t)linkToInsert.prev) == (size_t)&linkToInsert);
 		verify(ORIGIN_TAG_CLEAR((size_t)linkToInsert.next) == (size_t)&linkToInsert);
       )
         dlink(tE) & list_pos_links = list_pos_real`inner;
-        asm( "" : : : "memory" );
+      MAYBE_INSERT_READ_EARLY(
+        dlink(tE) & afterLinks = * (dlink(tE) *) ORIGIN_TAG_CLEAR( (size_t) list_pos_links.next );
+      )
+        MAYBE_CMEM_BARRIER;
         size_t list_pos_links_num = (size_t)(& list_pos_links);
         size_t to_insert_prev_num = ORIGIN_TAG_ASGN(list_pos_links_num, list_pos_tag);
@@ -295,5 +334,7 @@
 		linkToInsert.prev = to_insert_prev;
 		linkToInsert.next = list_pos_links.next;
+      MAYBE_INSERT_READ_LATE(
         dlink(tE) & afterLinks = * (dlink(tE) *) ORIGIN_TAG_CLEAR( (size_t) list_pos_links.next );
+      )
         size_t afterLinks_prev_tag = ORIGIN_TAG_QUERY((size_t)afterLinks.prev);
         size_t linkToInsert_num = (size_t)(& linkToInsert);
@@ -301,5 +342,5 @@
         afterLinks.prev = (dlink(tE)*)(afterLinks_prev_num);
 		list_pos_links.next = &linkToInsert;
-        asm( "" : : : "memory" );
+        MAYBE_CMEM_BARRIER;
 	}
 
@@ -315,11 +356,16 @@
         dlink(tE) & linkToInsert = to_insert`inner;
       NOLOOSE(
+       TAGSONLY(
 		verify(ORIGIN_TAG_QUERY((size_t)linkToInsert.prev));
 		verify(ORIGIN_TAG_QUERY((size_t)linkToInsert.next));
+       )
 		verify(ORIGIN_TAG_CLEAR((size_t)linkToInsert.prev) == (size_t)&linkToInsert);
 		verify(ORIGIN_TAG_CLEAR((size_t)linkToInsert.next) == (size_t)&linkToInsert);
       )
         dlink(tE) & list_pos_links = list_pos_real`inner;
-        asm( "" : : : "memory" );
+      MAYBE_INSERT_READ_EARLY(
+        dlink(tE) & beforeLinks = * (dlink(tE) *) ORIGIN_TAG_CLEAR( (size_t) list_pos_links.prev );
+      )
+        MAYBE_CMEM_BARRIER;
         size_t list_pos_links_num = (size_t)(& list_pos_links);
         size_t to_insert_next_num = ORIGIN_TAG_ASGN(list_pos_links_num, list_pos_tag);
@@ -327,5 +373,7 @@
 		linkToInsert.next = to_insert_next;
 		linkToInsert.prev = list_pos_links.prev;
+      MAYBE_INSERT_READ_LATE(
         dlink(tE) & beforeLinks = * (dlink(tE) *) ORIGIN_TAG_CLEAR( (size_t) list_pos_links.prev );
+      )
         size_t beforeLinks_next_tag = ORIGIN_TAG_QUERY((size_t)beforeLinks.next);
         size_t linkToInsert_num = (size_t)(& linkToInsert);
@@ -333,5 +381,5 @@
         beforeLinks.next = (dlink(tE)*)(beforeLinks_next_num);
 		list_pos_links.prev = &linkToInsert;
-        asm( "" : : : "memory" );
+        MAYBE_CMEM_BARRIER;
 	}
 
@@ -355,9 +403,9 @@
 
       NOLOOSE(
-        asm( "" : : : "memory" );
+        MAYBE_CMEM_BARRIER;
         size_t list_pos_links_num = (size_t) &list_pos_links;
         size_t list_pos_links_tagged_num = ORIGIN_TAG_ENABL( list_pos_links_num );
 		list_pos_links.next = list_pos_links.prev = (dlink(tE)*) list_pos_links_tagged_num;
-        asm( "" : : : "memory" );
+        MAYBE_CMEM_BARRIER;
       )
         return list_pos;
@@ -482,10 +530,17 @@
     static inline void insert_first( dlist(tE, tLinks) &lst, tE & e ) {
         dlink(tE) & linkToInsert = e`inner;
+      NOLOOSE(
+       TAGSONLY(
 		verify(ORIGIN_TAG_QUERY((size_t)linkToInsert.prev));
 		verify(ORIGIN_TAG_QUERY((size_t)linkToInsert.next));
+       )
 		verify(ORIGIN_TAG_CLEAR((size_t)linkToInsert.prev) == (size_t)&linkToInsert);
 		verify(ORIGIN_TAG_CLEAR((size_t)linkToInsert.next) == (size_t)&linkToInsert);
+      )
         dlink(tE) & list_pos_links = lst;
-        asm( "" : : : "memory" );
+      MAYBE_INSERT_READ_EARLY(
+        dlink(tE) & afterLinks = * (dlink(tE) *) ORIGIN_TAG_CLEAR( (size_t) list_pos_links.next );
+      )
+        MAYBE_CMEM_BARRIER;
         size_t list_pos_links_num = (size_t)(& list_pos_links);
         size_t to_insert_prev_num = ORIGIN_TAG_ENABL(list_pos_links_num);
@@ -493,21 +548,29 @@
 		linkToInsert.prev = to_insert_prev;
 		linkToInsert.next = list_pos_links.next;
+      MAYBE_INSERT_READ_LATE(
         dlink(tE) & afterLinks = * (dlink(tE) *) ORIGIN_TAG_CLEAR( (size_t) list_pos_links.next );
+      )
         size_t linkToInsert_num = (size_t)(& linkToInsert);
         size_t afterLinks_prev_num = linkToInsert_num;
         afterLinks.prev = (dlink(tE)*)(afterLinks_prev_num);
 		list_pos_links.next = &linkToInsert;
-        asm( "" : : : "memory" );
+        MAYBE_CMEM_BARRIER;
     }
 
     static inline void insert_last( dlist(tE, tLinks) &lst, tE & e ) {
-        // insert_before(iter(lst), e);
         dlink(tE) & linkToInsert = e`inner;
+      NOLOOSE(
+       TAGSONLY(
 		verify(ORIGIN_TAG_QUERY((size_t)linkToInsert.next));
 		verify(ORIGIN_TAG_QUERY((size_t)linkToInsert.prev));
+       )
 		verify(ORIGIN_TAG_CLEAR((size_t)linkToInsert.next) == (size_t)&linkToInsert);
 		verify(ORIGIN_TAG_CLEAR((size_t)linkToInsert.prev) == (size_t)&linkToInsert);
+      )
         dlink(tE) & list_pos_links = lst;
-        asm( "" : : : "memory" );
+      MAYBE_INSERT_READ_EARLY(
+        dlink(tE) & beforeLinks = * (dlink(tE) *) ORIGIN_TAG_CLEAR( (size_t) list_pos_links.prev );
+      )
+        MAYBE_CMEM_BARRIER;
         size_t list_pos_links_num = (size_t)(& list_pos_links);
         size_t to_insert_next_num = ORIGIN_TAG_ENABL(list_pos_links_num);
@@ -515,10 +578,12 @@
 		linkToInsert.next = to_insert_next;
 		linkToInsert.prev = list_pos_links.prev;
+      MAYBE_INSERT_READ_LATE(
         dlink(tE) & beforeLinks = * (dlink(tE) *) ORIGIN_TAG_CLEAR( (size_t) list_pos_links.prev );
+      )
         size_t linkToInsert_num = (size_t)(& linkToInsert);
         size_t beforeLinks_next_num = linkToInsert_num;
         beforeLinks.next = (dlink(tE)*)(beforeLinks_next_num);
 		list_pos_links.prev = &linkToInsert;
-        asm( "" : : : "memory" );
+        MAYBE_CMEM_BARRIER;
     }
 
@@ -526,5 +591,4 @@
 		verify (&lst != 0p);
         dlink(tE) & list_links = lst;
-        verify (! ORIGIN_TAG_QUERY( (size_t) (& list_links) ) );
         // call is valid on empty list; when so, list_links.next and after_links.prev have otags set
 
@@ -541,9 +605,9 @@
         after_links.prev = (dlink(tE) *) after_links_prev_rslt;
 
-        asm( "" : : : "memory" );
+        MAYBE_CMEM_BARRIER;
         size_t list_pos_links_num = (size_t) &fst_links;
         size_t list_pos_links_tagged_num = ORIGIN_TAG_ENABL( list_pos_links_num );
 		fst_links.next = fst_links.prev = (dlink(tE)*) list_pos_links_tagged_num;
-        asm( "" : : : "memory" );
+        MAYBE_CMEM_BARRIER;
 
         tytagref( tLinks, dlink(tE) ) retExt = { fst_links };
@@ -568,9 +632,9 @@
         before_links.next = (dlink(tE) *) before_links_next_rslt;
 
-        asm( "" : : : "memory" );
+        MAYBE_CMEM_BARRIER;
         size_t list_pos_links_num = (size_t) &last_links;
         size_t list_pos_links_tagged_num = ORIGIN_TAG_ENABL( list_pos_links_num );
 		last_links.prev = last_links.next = (dlink(tE)*) list_pos_links_tagged_num;
-        asm( "" : : : "memory" );
+        MAYBE_CMEM_BARRIER;
 
         tytagref( tLinks, dlink(tE) ) lpLnkTagged = { last_links };
@@ -630,3 +694,2 @@
 
 }
-
