Index: doc/theses/andrew_beach_MMath/cfalab.sty
===================================================================
--- doc/theses/andrew_beach_MMath/cfalab.sty	(revision 692f0c895772f0dbe0cfe4f849661f820c7bc5eb)
+++ doc/theses/andrew_beach_MMath/cfalab.sty	(revision c0c940adccee8b334b58652795a5f0f352fc519f)
@@ -1,5 +1,6 @@
 % Package for CFA Research Lab.
 %
-% Made by combining and updating various macro files people had made.
+% This is a collection of commands that everyone working on CFA-related
+% documents should find useful; mostly programming-language-related tools.
 %
 % Internal commands are prefixed with "\cfalab@".
@@ -17,22 +18,13 @@
 % Automatically adds spaces.
 \RequirePackage{xspace}
-% Improved reference tools.
-\RequirePackage[nospace]{varioref}
 
-% Symbols: All symbols are zero argument robust commands with special rules
-% about the space following the c.s. token. Normally the space might be
-% re-added according to the rules of the xspace package. They may be followed
-% by a star (which the command will consume) to disable this behaviour.
-
-% \newsymbolcmd{<command>}{<replacement text>}
-% Defines <command> to be a symbol that has the given <replacement text>.
-\newrobustcmd*\newsymbolcmd[2]{\newrobustcmd{#1}{\cfalab@symbol{#2}}}
-\def\cfalab@symbol#1{\@ifnextchar*{#1\cfalab@eatstar}{#1\xspace}}
-\def\cfalab@eatstar*{}
-
+% Tip for commands that end with \xspace: if the default is not correct then
+% follow the command with {} to disable \xspace, use '{} ' to force add a
+% space and '{}<whatever-follows>' to force remove one.
+%
 % Cforall with the forall symbol.
-\newsymbolcmd\CFA{\textsf{C}\raisebox{\depth}{\rotatebox{180}{\textsf{A}}}}
-% C++ with kerning. (No standard number support.)
-\newsymbolcmd\Cpp{\textrm{C}\kern-.1em\hbox{+\kern-.25em+}}
+\newrobustcmd\CFA{\textsf{C\raisebox{\depth}{\rotatebox{180}{A}}}\xspace}
+% C++. You may optionally append a standard number, e.g. \Cpp[11].
+\newrobustcmd\Cpp[1][\xspace]{C++#1}
 
 % This is executed very early in the \begin{document} code, before the
Index: doc/theses/andrew_beach_MMath/uw-ethesis.tex
===================================================================
--- doc/theses/andrew_beach_MMath/uw-ethesis.tex	(revision 692f0c895772f0dbe0cfe4f849661f820c7bc5eb)
+++ doc/theses/andrew_beach_MMath/uw-ethesis.tex	(revision c0c940adccee8b334b58652795a5f0f352fc519f)
@@ -99,4 +99,6 @@
 % allow global and individual modification of spacing
 \usepackage{enumitem}
+% Improved reference tools.
+\usepackage[nospace]{varioref}
 
 % Hyperlinks make it very easy to navigate an electronic document.
Index: libcfa/src/concurrency/invoke.h
===================================================================
--- libcfa/src/concurrency/invoke.h	(revision 692f0c895772f0dbe0cfe4f849661f820c7bc5eb)
+++ libcfa/src/concurrency/invoke.h	(revision c0c940adccee8b334b58652795a5f0f352fc519f)
@@ -146,7 +146,5 @@
 	struct __thread_desc_link {
 		struct $thread * next;
-		struct $thread * prev;
 		volatile unsigned long long ts;
-		unsigned preferred;
 	};
 
@@ -155,4 +153,8 @@
 		// context that is switch during a __cfactx_switch
 		struct __stack_context_t context;
+
+		// Linked-list fields
+		// intrusive link field for threads
+		struct __thread_desc_link link;
 
 		// current execution status for coroutine
@@ -170,7 +172,6 @@
 		struct cluster * curr_cluster;
 
-		// Link lists fields
-		// instrusive link field for threads
-		struct __thread_desc_link link;
+		// preferred ready-queue
+		unsigned preferred;
 
 		// coroutine body used to store context
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 692f0c895772f0dbe0cfe4f849661f820c7bc5eb)
+++ libcfa/src/concurrency/kernel.cfa	(revision c0c940adccee8b334b58652795a5f0f352fc519f)
@@ -184,4 +184,5 @@
 		MAIN_LOOP:
 		for() {
+			#if 1
 			// Check if there is pending io
 			__maybe_io_drain( this );
@@ -270,116 +271,113 @@
 			}
 
-		// 	SEARCH: {
-		// 		/* paranoid */ verify( ! __preemption_enabled() );
-		// 		/* paranoid */ verify( kernelTLS().this_proc_id );
-
-		// 		// First, lock the scheduler since we are searching for a thread
-
-		// 		// Try to get the next thread
-		// 		ready_schedule_lock();
-		// 		readyThread = pop_fast( this->cltr );
-		// 		ready_schedule_unlock();
-		// 		if(readyThread) {  break SEARCH; }
-
-		// 		// If we can't find a thread, might as well flush any outstanding I/O
-		// 		if(this->io.pending) { __cfa_io_flush( this ); }
-
-		// 		// Spin a little on I/O, just in case
-		// 		for(25) {
-		// 			__maybe_io_drain( this );
-		// 			ready_schedule_lock();
-		// 			readyThread = pop_fast( this->cltr );
-		// 			ready_schedule_unlock();
-		// 			if(readyThread) {  break SEARCH; }
-		// 		}
-
-		// 		// no luck, try stealing a few times
-		// 		for(25) {
-		// 			if( __maybe_io_drain( this ) ) {
-		// 				ready_schedule_lock();
-		// 				readyThread = pop_fast( this->cltr );
-		// 			} else {
-		// 				ready_schedule_lock();
-		// 				readyThread = pop_slow( this->cltr );
-		// 			}
-		// 			ready_schedule_unlock();
-		// 			if(readyThread) {  break SEARCH; }
-		// 		}
-
-		// 		// still no luck, search for a thread
-		// 		ready_schedule_lock();
-		// 		readyThread = pop_search( this->cltr );
-		// 		ready_schedule_unlock();
-		// 		if(readyThread) { break SEARCH; }
-
-		// 		// Don't block if we are done
-		// 		if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) break MAIN_LOOP;
-
-		// 		__STATS( __tls_stats()->ready.sleep.halts++; )
-
-		// 		// Push self to idle stack
-		// 		mark_idle(this->cltr->procs, * this);
-
-		// 		// Confirm the ready-queue is empty
-		// 		__maybe_io_drain( this );
-		// 		ready_schedule_lock();
-		// 		readyThread = pop_search( this->cltr );
-		// 		ready_schedule_unlock();
-
-		// 		if( readyThread ) {
-		// 			// A thread was found, cancel the halt
-		// 			mark_awake(this->cltr->procs, * this);
-
-		// 			__STATS( __tls_stats()->ready.sleep.cancels++; )
-
-		// 			// continue the main loop
-		// 			break SEARCH;
-		// 		}
-
-		// 		__STATS( if(this->print_halts) __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->id, rdtscl()); )
-		// 		__cfadbg_print_safe(runtime_core, "Kernel : core %p waiting on eventfd %d\n", this, this->idle);
-
-		// 		// __disable_interrupts_hard();
-		// 		eventfd_t val;
-		// 		eventfd_read( this->idle, &val );
-		// 		// __enable_interrupts_hard();
-
-		// 		__STATS( if(this->print_halts) __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->id, rdtscl()); )
-
-		// 		// We were woken up, remove self from idle
-		// 		mark_awake(this->cltr->procs, * this);
-
-		// 		// DON'T just proceed, start looking again
-		// 		continue MAIN_LOOP;
-		// 	}
-
-		// RUN_THREAD:
-		// 	/* paranoid */ verify( kernelTLS().this_proc_id );
-		// 	/* paranoid */ verify( ! __preemption_enabled() );
-		// 	/* paranoid */ verify( readyThread );
-
-		// 	// Reset io dirty bit
-		// 	this->io.dirty = false;
-
-		// 	// We found a thread run it
-		// 	__run_thread(this, readyThread);
-
-		// 	// Are we done?
-		// 	if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) break MAIN_LOOP;
-
-		// 	#if !defined(__CFA_NO_STATISTICS__)
-		// 		unsigned long long curr = rdtscl();
-		// 		if(curr > (last_tally + 500000000)) {
-		// 			__tally_stats(this->cltr->stats, __cfaabi_tls.this_stats);
-		// 			last_tally = curr;
-		// 		}
-		// 	#endif
-
-		// 	if(this->io.pending && !this->io.dirty) {
-		// 		__cfa_io_flush( this );
-		// 	}
-
-		// 	// Check if there is pending io
-		// 	__maybe_io_drain( this );
+			#else
+
+			SEARCH: {
+				/* paranoid */ verify( ! __preemption_enabled() );
+				/* paranoid */ verify( kernelTLS().this_proc_id );
+
+				// First, lock the scheduler since we are searching for a thread
+				ready_schedule_lock();
+
+				// Try to get the next thread
+				readyThread = pop_fast( this->cltr );
+				if(readyThread) { ready_schedule_unlock(); break SEARCH; }
+
+				// If we can't find a thread, might as well flush any outstanding I/O
+				if(this->io.pending) { __cfa_io_flush( this ); }
+
+				// Spin a little on I/O, just in case
+				for(25) {
+					__maybe_io_drain( this );
+					readyThread = pop_fast( this->cltr );
+					if(readyThread) { ready_schedule_unlock(); break SEARCH; }
+				}
+
+				// no luck, try stealing a few times
+				for(25) {
+					if( __maybe_io_drain( this ) ) {
+						readyThread = pop_fast( this->cltr );
+					} else {
+						readyThread = pop_slow( this->cltr );
+					}
+					if(readyThread) { ready_schedule_unlock(); break SEARCH; }
+				}
+
+				// still no luck, search for a thread
+				readyThread = pop_search( this->cltr );
+				if(readyThread) { ready_schedule_unlock(); break SEARCH; }
+
+				// Don't block if we are done; release the scheduler lock taken at the top of SEARCH before leaving
+				if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) { ready_schedule_unlock(); break MAIN_LOOP; }
+
+				__STATS( __tls_stats()->ready.sleep.halts++; )
+
+				// Push self to idle stack
+				ready_schedule_unlock();
+				mark_idle(this->cltr->procs, * this);
+				ready_schedule_lock();
+
+				// Confirm the ready-queue is empty
+				__maybe_io_drain( this );
+				readyThread = pop_search( this->cltr );
+				ready_schedule_unlock();
+
+				if( readyThread ) {
+					// A thread was found, cancel the halt
+					mark_awake(this->cltr->procs, * this);
+
+					__STATS( __tls_stats()->ready.sleep.cancels++; )
+
+					// continue the main loop
+					break SEARCH;
+				}
+
+				__STATS( if(this->print_halts) __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->id, rdtscl()); )
+				__cfadbg_print_safe(runtime_core, "Kernel : core %p waiting on eventfd %d\n", this, this->idle);
+
+				// __disable_interrupts_hard();
+				eventfd_t val;
+				eventfd_read( this->idle, &val );
+				// __enable_interrupts_hard();
+
+				__STATS( if(this->print_halts) __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->id, rdtscl()); )
+
+				// We were woken up, remove self from idle
+				mark_awake(this->cltr->procs, * this);
+
+				// DON'T just proceed, start looking again
+				continue MAIN_LOOP;
+			}
+
+		RUN_THREAD:
+			/* paranoid */ verify( kernelTLS().this_proc_id );
+			/* paranoid */ verify( ! __preemption_enabled() );
+			/* paranoid */ verify( readyThread );
+
+			// Reset io dirty bit
+			this->io.dirty = false;
+
+			// We found a thread run it
+			__run_thread(this, readyThread);
+
+			// Are we done?
+			if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) break MAIN_LOOP;
+
+			#if !defined(__CFA_NO_STATISTICS__)
+				unsigned long long curr = rdtscl();
+				if(curr > (last_tally + 500000000)) {
+					__tally_stats(this->cltr->stats, __cfaabi_tls.this_stats);
+					last_tally = curr;
+				}
+			#endif
+
+			if(this->io.pending && !this->io.dirty) {
+				__cfa_io_flush( this );
+			}
+
+			ready_schedule_lock();
+			__maybe_io_drain( this );
+			ready_schedule_unlock();
+			#endif
 		}
 
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision 692f0c895772f0dbe0cfe4f849661f820c7bc5eb)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision c0c940adccee8b334b58652795a5f0f352fc519f)
@@ -461,6 +461,6 @@
 	self_mon_p = &self_mon;
 	link.next = 0p;
-	link.prev = 0p;
-	link.preferred = -1u;
+	link.ts   = 0;
+	preferred = -1u;
 	last_proc = 0p;
 	#if defined( __CFA_WITH_VERIFY__ )
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision 692f0c895772f0dbe0cfe4f849661f820c7bc5eb)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision c0c940adccee8b334b58652795a5f0f352fc519f)
@@ -17,5 +17,4 @@
 // #define __CFA_DEBUG_PRINT_READY_QUEUE__
 
-// #define USE_MPSC
 
 #define USE_RELAXED_FIFO
@@ -256,7 +255,4 @@
 		/* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );
 
-		// write timestamp
-		thrd->link.ts = rdtscl();
-
 		bool local;
 		int preferred = external ? -1 : kernelTLS().this_processor->rdq.id;
@@ -277,19 +273,12 @@
 			#endif
 
-		#if defined(USE_MPSC)
-			// mpsc always succeeds
-		} while( false );
-		#else
 			// If we can't lock it retry
 		} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
-		#endif
 
 		// Actually push it
 		push(lanes.data[i], thrd);
 
-		#if !defined(USE_MPSC)
 			// Unlock and return
 			__atomic_unlock( &lanes.data[i].lock );
-		#endif
 
 		// Mark the current index in the tls rng instance as having an item
@@ -350,9 +339,16 @@
 		__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
 
+		// #define USE_PREFERRED
+		#if !defined(USE_PREFERRED)
 		const bool external = (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
 		/* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );
-
-		// write timestamp
-		thrd->link.ts = rdtscl();
+		#else
+			unsigned preferred = thrd->preferred;
+			const bool external = (!kernelTLS().this_processor) || preferred == -1u || thrd->curr_cluster != cltr;
+			/* paranoid */ verifyf(external || preferred < lanes.count, "Invalid preferred queue %u for %u lanes", preferred, lanes.count );
+
+			unsigned r = preferred % READYQ_SHARD_FACTOR;
+			const unsigned start = preferred - r;
+		#endif
 
 		// Try to pick a lane and lock it
@@ -368,25 +364,20 @@
 			}
 			else {
+				#if !defined(USE_PREFERRED)
 				processor * proc = kernelTLS().this_processor;
 				unsigned r = proc->rdq.its++;
 				i =  proc->rdq.id + (r % READYQ_SHARD_FACTOR);
+		#else
+					i = start + (r++ % READYQ_SHARD_FACTOR);
+				#endif
 			}
-
-
-		#if defined(USE_MPSC)
-			// mpsc always succeeds
-		} while( false );
-		#else
 			// If we can't lock it retry
 		} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
-		#endif
 
 		// Actually push it
 		push(lanes.data[i], thrd);
 
-		#if !defined(USE_MPSC)
 			// Unlock and return
 			__atomic_unlock( &lanes.data[i].lock );
-		#endif
 
 		#if !defined(__CFA_NO_STATISTICS__)
@@ -492,4 +483,6 @@
 		lanes.tscs[w].tv = thrd->link.ts;
 	#endif
+
+	thrd->preferred = w;
 
 	// return the popped thread
@@ -519,5 +512,5 @@
 // Check that all the intrusive queues in the data structure are still consistent
 static void check( __ready_queue_t & q ) with (q) {
-	#if defined(__CFA_WITH_VERIFY__) && !defined(USE_MPSC)
+	#if defined(__CFA_WITH_VERIFY__)
 		{
 			for( idx ; lanes.count ) {
@@ -525,16 +518,13 @@
 				assert(!lanes.data[idx].lock);
 
-				assert(head(sl)->link.prev == 0p );
-				assert(head(sl)->link.next->link.prev == head(sl) );
-				assert(tail(sl)->link.next == 0p );
-				assert(tail(sl)->link.prev->link.next == tail(sl) );
-
-				if(is_empty(sl)) {
-					assert(tail(sl)->link.prev == head(sl));
-					assert(head(sl)->link.next == tail(sl));
-				} else {
-					assert(tail(sl)->link.prev != head(sl));
-					assert(head(sl)->link.next != tail(sl));
-				}
+					if(is_empty(sl)) {
+						assert( sl.anchor.next == 0p );
+						assert( sl.anchor.ts   == 0  );
+						assert( mock_head(sl)  == sl.prev );
+					} else {
+						assert( sl.anchor.next != 0p );
+						assert( sl.anchor.ts   != 0  );
+						assert( mock_head(sl)  != sl.prev );
+					}
 			}
 		}
@@ -557,18 +547,8 @@
 // fixes the list so that the pointers back to anchors aren't left dangling
 static inline void fix(__intrusive_lane_t & ll) {
-	#if !defined(USE_MPSC)
-		// if the list is not empty then follow he pointer and fix its reverse
-		if(!is_empty(ll)) {
-			head(ll)->link.next->link.prev = head(ll);
-			tail(ll)->link.prev->link.next = tail(ll);
-		}
-		// Otherwise just reset the list
-		else {
-			verify(tail(ll)->link.next == 0p);
-			tail(ll)->link.prev = head(ll);
-			head(ll)->link.next = tail(ll);
-			verify(head(ll)->link.prev == 0p);
-		}
-	#endif
+			if(is_empty(ll)) {
+				verify(ll.anchor.next == 0p);
+				ll.prev = mock_head(ll);
+			}
 }
 
Index: libcfa/src/concurrency/ready_subqueue.hfa
===================================================================
--- libcfa/src/concurrency/ready_subqueue.hfa	(revision 692f0c895772f0dbe0cfe4f849661f820c7bc5eb)
+++ libcfa/src/concurrency/ready_subqueue.hfa	(revision c0c940adccee8b334b58652795a5f0f352fc519f)
@@ -7,61 +7,18 @@
 // Intrusives lanes which are used by the relaxed ready queue
 struct __attribute__((aligned(128))) __intrusive_lane_t {
-
-	#if defined(USE_MPSC)
-		mpsc_queue($thread) queue;
-		__attribute__((aligned(128)))
-	#else
-		// anchor for the head and the tail of the queue
-		__attribute__((aligned(128))) struct __sentinel_t {
-			// Link lists fields
-			// instrusive link field for threads
-			// must be exactly as in $thread
-			__thread_desc_link link;
-		} before, after;
-	#endif
+	struct $thread * prev;
 
 	// spin lock protecting the queue
 	volatile bool lock;
 
-	// Optional statistic counters
-	#if !defined(__CFA_NO_SCHED_STATS__)
-		struct __attribute__((aligned(64))) {
-			// difference between number of push and pops
-			ssize_t diff;
-
-			// total number of pushes and pops
-			size_t  push;
-			size_t  pop ;
-		} stat;
-	#endif
+	__thread_desc_link anchor;
 };
 
-void  ?{}(__intrusive_lane_t & this);
-void ^?{}(__intrusive_lane_t & this);
-
 // Get the head pointer (one before the first element) from the anchor
-static inline $thread * head(const __intrusive_lane_t & this) {
-	#if defined(USE_MPSC)
-		return this.queue.head;
-	#else
-		$thread * rhead = ($thread *)(
-			(uintptr_t)( &this.before ) - offsetof( $thread, link )
-		);
-		/* paranoid */ verify(rhead);
-		return rhead;
-	#endif
-}
-
-// Get the tail pointer (one after the last element) from the anchor
-static inline $thread * tail(const __intrusive_lane_t & this) {
-	#if defined(USE_MPSC)
-		return this.queue.tail;
-	#else
-		$thread * rtail = ($thread *)(
-			(uintptr_t)( &this.after ) - offsetof( $thread, link )
-		);
-		/* paranoid */ verify(rtail);
-		return rtail;
-	#endif
+static inline $thread * mock_head(const __intrusive_lane_t & this) {
+	$thread * rhead = ($thread *)(
+		(uintptr_t)( &this.anchor ) - __builtin_offsetof( $thread, link )
+	);
+	return rhead;
 }
 
@@ -69,100 +26,50 @@
 void ?{}( __intrusive_lane_t & this ) {
 	this.lock = false;
+	this.prev = mock_head(this);
+	this.anchor.next = 0p;
+	this.anchor.ts   = 0;
 
-	#if !defined(USE_MPSC)
-		this.before.link.prev = 0p;
-		this.before.link.next = tail(this);
-		this.before.link.ts   = 0;
-
-		this.after .link.prev = head(this);
-		this.after .link.next = 0p;
-		this.after .link.ts   = 0;
-
-		#if !defined(__CFA_NO_SCHED_STATS__)
-			this.stat.diff = 0;
-			this.stat.push = 0;
-			this.stat.pop  = 0;
-		#endif
-
-		// We add a boat-load of assertions here because the anchor code is very fragile
-		/* paranoid */ verify(((uintptr_t)( head(this) ) + offsetof( $thread, link )) == (uintptr_t)(&this.before));
-		/* paranoid */ verify(((uintptr_t)( tail(this) ) + offsetof( $thread, link )) == (uintptr_t)(&this.after ));
-		/* paranoid */ verify(head(this)->link.prev == 0p );
-		/* paranoid */ verify(head(this)->link.next == tail(this) );
-		/* paranoid */ verify(tail(this)->link.next == 0p );
-		/* paranoid */ verify(tail(this)->link.prev == head(this) );
-		/* paranoid */ verify(&head(this)->link.prev == &this.before.link.prev );
-		/* paranoid */ verify(&head(this)->link.next == &this.before.link.next );
-		/* paranoid */ verify(&tail(this)->link.prev == &this.after .link.prev );
-		/* paranoid */ verify(&tail(this)->link.next == &this.after .link.next );
-		/* paranoid */ verify(__alignof__(__intrusive_lane_t) == 128);
-		/* paranoid */ verify(__alignof__(this) == 128);
-		/* paranoid */ verifyf(((intptr_t)(&this) % 128) == 0, "Expected address to be aligned %p %% 128 == %zd", &this, ((intptr_t)(&this) % 128));
-	#endif
+	// We add a boat-load of assertions here because the anchor code is very fragile
+	/* paranoid */ verify( offsetof( $thread, link ) == offsetof(__intrusive_lane_t, anchor) );
+	/* paranoid */ verify( ((uintptr_t)( mock_head(this) ) + offsetof( $thread, link )) == (uintptr_t)(&this.anchor) );
+	/* paranoid */ verify( &mock_head(this)->link.next == &this.anchor.next );
+	/* paranoid */ verify( &mock_head(this)->link.ts   == &this.anchor.ts   );
+	/* paranoid */ verify( mock_head(this)->link.next == 0p );
+	/* paranoid */ verify( mock_head(this)->link.ts   == 0  );
+	/* paranoid */ verify( mock_head(this) == this.prev );
+	/* paranoid */ verify( __alignof__(__intrusive_lane_t) == 128 );
+	/* paranoid */ verify( __alignof__(this) == 128 );
+	/* paranoid */ verifyf( ((intptr_t)(&this) % 128) == 0, "Expected address to be aligned %p %% 128 == %zd", &this, ((intptr_t)(&this) % 128) );
 }
 
 // Dtor is trivial
 void ^?{}( __intrusive_lane_t & this ) {
-	#if !defined(USE_MPSC)
-		// Make sure the list is empty
-		/* paranoid */ verify(head(this)->link.prev == 0p );
-		/* paranoid */ verify(head(this)->link.next == tail(this) );
-		/* paranoid */ verify(tail(this)->link.next == 0p );
-		/* paranoid */ verify(tail(this)->link.prev == head(this) );
-	#endif
+	// Make sure the list is empty
+	/* paranoid */ verify( this.anchor.next == 0p );
+	/* paranoid */ verify( this.anchor.ts   == 0  );
+	/* paranoid */ verify( mock_head(this)  == this.prev );
 }
 
 // Push a thread onto this lane
 // returns true of lane was empty before push, false otherwise
-bool push(__intrusive_lane_t & this, $thread * node) {
-	#if defined(USE_MPSC)
-		inline $thread * volatile & ?`next ( $thread * this )  __attribute__((const)) {
-			return this->link.next;
-		}
-		push(this.queue, node);
-	#else
-		#if defined(__CFA_WITH_VERIFY__)
-			/* paranoid */ verify(this.lock);
-			/* paranoid */ verify(node->link.ts != 0);
-			/* paranoid */ verify(node->link.next == 0p);
-			/* paranoid */ verify(node->link.prev == 0p);
-			/* paranoid */ verify(tail(this)->link.next == 0p);
-			/* paranoid */ verify(head(this)->link.prev == 0p);
+void push( __intrusive_lane_t & this, $thread * node ) {
+	/* paranoid */ verify( node->link.next == 0p );
+	/* paranoid */ verify( node->link.ts   == 0  );
+	/* paranoid */ verify( this.prev->link.next == 0p );
+	/* paranoid */ verify( this.prev->link.ts   == 0  );
+	if( this.anchor.next == 0p ) {
+		/* paranoid */ verify( this.anchor.next == 0p );
+		/* paranoid */ verify( this.anchor.ts   == 0  );
+		/* paranoid */ verify( this.prev == mock_head( this ) );
+	} else {
+		/* paranoid */ verify( this.anchor.next != 0p );
+		/* paranoid */ verify( this.anchor.ts   != 0  );
+		/* paranoid */ verify( this.prev != mock_head( this ) );
+	}
 
-			if(this.before.link.ts == 0l) {
-				/* paranoid */ verify(tail(this)->link.prev == head(this));
-				/* paranoid */ verify(head(this)->link.next == tail(this));
-			} else {
-				/* paranoid */ verify(tail(this)->link.prev != head(this));
-				/* paranoid */ verify(head(this)->link.next != tail(this));
-			}
-		#endif
-
-		// Get the relevant nodes locally
-		$thread * tail = tail(this);
-		$thread * prev = tail->link.prev;
-
-		// Do the push
-		node->link.next = tail;
-		node->link.prev = prev;
-		prev->link.next = node;
-		tail->link.prev = node;
-
-		// Update stats
-		#if !defined(__CFA_NO_SCHED_STATS__)
-			this.stat.diff++;
-			this.stat.push++;
-		#endif
-
-		verify(node->link.next == tail(this));
-
-		// Check if the queue used to be empty
-		if(this.before.link.ts == 0l) {
-			this.before.link.ts = node->link.ts;
-			/* paranoid */ verify(node->link.prev == head(this));
-			return true;
-		}
-		return false;
-	#endif
+	// Link node after the current last element (this.prev) and timestamp that link
+	this.prev->link.next = node;
+	this.prev->link.ts   = rdtscl();
+	this.prev = node;
 }
 
@@ -170,79 +77,33 @@
 // returns popped
 // returns true of lane was empty before push, false otherwise
-$thread * pop(__intrusive_lane_t & this) {
-	/* paranoid */ verify(this.lock);
-	#if defined(USE_MPSC)
-		inline $thread * volatile & ?`next ( $thread * this )  __attribute__((const)) {
-			return this->link.next;
-		}
-		return pop(this.queue);
-	#else
-		/* paranoid */ verify(this.before.link.ts != 0ul);
+$thread * pop( __intrusive_lane_t & this ) {
+	/* paranoid */ verify( this.anchor.next != 0p );
+	/* paranoid */ verify( this.anchor.ts   != 0  );
 
-		// Get anchors locally
-		$thread * head = head(this);
-		$thread * tail = tail(this);
+	// Get the relevant nodes locally
+	$thread * node = this.anchor.next;
+	this.anchor.next = node->link.next;
+	this.anchor.ts   = node->link.ts;
+	bool is_empty = this.anchor.ts == 0;
+	node->link.next = 0p;
+	node->link.ts   = 0;
 
-		// Get the relevant nodes locally
-		$thread * node = head->link.next;
-		$thread * next = node->link.next;
+	// If the lane is now empty, point prev back at the anchor
+	if(is_empty) this.prev = mock_head( this );
 
-		/* paranoid */ verify(node != tail);
-		/* paranoid */ verify(node);
-
-		// Do the pop
-		head->link.next = next;
-		next->link.prev = head;
-		node->link.next = 0p;
-		node->link.prev = 0p;
-
-		// Update head time stamp
-		this.before.link.ts = next->link.ts;
-
-		// Update stats
-		#ifndef __CFA_NO_SCHED_STATS__
-			this.stat.diff--;
-			this.stat.pop ++;
-		#endif
-
-		// Check if we emptied list and return accordingly
-		/* paranoid */ verify(tail(this)->link.next == 0p);
-		/* paranoid */ verify(head(this)->link.prev == 0p);
-		if(next == tail) {
-			/* paranoid */ verify(this.before.link.ts == 0);
-			/* paranoid */ verify(tail(this)->link.prev == head(this));
-			/* paranoid */ verify(head(this)->link.next == tail(this));
-			return node;
-		}
-		else {
-			/* paranoid */ verify(next->link.ts != 0);
-			/* paranoid */ verify(tail(this)->link.prev != head(this));
-			/* paranoid */ verify(head(this)->link.next != tail(this));
-			/* paranoid */ verify(this.before.link.ts != 0);
-			return node;
-		}
-	#endif
+	/* paranoid */ verify( node->link.next == 0p );
+	/* paranoid */ verify( node->link.ts   == 0  );
+	return node;
 }
 
 // Check whether or not list is empty
 static inline bool is_empty(__intrusive_lane_t & this) {
-	#if defined(USE_MPSC)
-		return this.queue.head == 0p;
-	#else
-		// Cannot verify here since it may not be locked
-		return this.before.link.ts == 0;
-	#endif
+	return this.anchor.ts == 0;
 }
 
 // Return the timestamp
 static inline unsigned long long ts(__intrusive_lane_t & this) {
-	#if defined(USE_MPSC)
-		$thread * tl = this.queue.head;
-		if(!tl) return -1ull;
-		return tl->link.ts;
-	#else
-		// Cannot verify here since it may not be locked
-		return this.before.link.ts;
-	#endif
+	// Cannot verify here since it may not be locked
+	return this.anchor.ts;
 }
 
Index: libcfa/src/concurrency/thread.cfa
===================================================================
--- libcfa/src/concurrency/thread.cfa	(revision 692f0c895772f0dbe0cfe4f849661f820c7bc5eb)
+++ libcfa/src/concurrency/thread.cfa	(revision c0c940adccee8b334b58652795a5f0f352fc519f)
@@ -38,6 +38,6 @@
 	curr_cluster = &cl;
 	link.next = 0p;
-	link.prev = 0p;
-	link.preferred = -1u;
+	link.ts   = 0;
+	preferred = -1u;
 	last_proc = 0p;
 	#if defined( __CFA_WITH_VERIFY__ )
Index: libcfa/src/containers/array.hfa
===================================================================
--- libcfa/src/containers/array.hfa	(revision 692f0c895772f0dbe0cfe4f849661f820c7bc5eb)
+++ libcfa/src/containers/array.hfa	(revision c0c940adccee8b334b58652795a5f0f352fc519f)
@@ -21,26 +21,48 @@
     };
 
-    Timmed & ?[?]( arpk(N, S, Timmed, Tbase) & a, ptrdiff_t i ) {
+    // About the choice of integral types offered as subscript overloads:
+    // Intent is to cover these use cases:
+    //    float foo( ptrdiff_t i ) { return a[i]; }           // i : ptrdiff_t
+    //    forall( [N] ) ... for( i; N ) { total += a[i]; }    // i : typeof( sizeof(42) )
+    //    for( i; 5 ) { total += a[i]; }                      // i : int
+    // It gets complicated by:
+    // -  CFA does overloading on concrete types, like int and unsigned int, not on typedefed
+    //    types like size_t.  So trying to overload on ptrdiff_t vs int works in 64-bit mode
+    //    but not in 32-bit mode.
+    // -  Given bug of Trac #247, CFA gives sizeof expressions type unsigned long int, when it
+    //    should give them type size_t.
+    //    
+    //                          gcc -m32         cfa -m32 given bug         gcc -m64
+    // ptrdiff_t                int              int                        long int
+    // size_t                   unsigned int     unsigned int               unsigned long int
+    // typeof( sizeof(42) )     unsigned int     unsigned long int          unsigned long int
+    // int                      int              int                        int
+
+    static inline Timmed & ?[?]( arpk(N, S, Timmed, Tbase) & a, int i ) {
         return (Timmed &) a.strides[i];
     }
 
-    Timmed & ?[?]( arpk(N, S, Timmed, Tbase) & a, int i ) {
+    static inline Timmed & ?[?]( arpk(N, S, Timmed, Tbase) & a, unsigned int i ) {
         return (Timmed &) a.strides[i];
     }
 
-    Timmed & ?[?]( arpk(N, S, Timmed, Tbase) & a, size_t i ) {
+    static inline Timmed & ?[?]( arpk(N, S, Timmed, Tbase) & a, long int i ) {
         return (Timmed &) a.strides[i];
     }
 
-    size_t ?`len( arpk(N, S, Timmed, Tbase) & a ) {
+    static inline Timmed & ?[?]( arpk(N, S, Timmed, Tbase) & a, unsigned long int i ) {
+        return (Timmed &) a.strides[i];
+    }
+
+    static inline size_t ?`len( arpk(N, S, Timmed, Tbase) & a ) {
         return z(N);
     }
 
     // workaround #226 (and array relevance thereof demonstrated in mike102/otype-slow-ndims.cfa)
-    void ?{}( arpk(N, S, Timmed, Tbase) & this ) {
+    static inline void ?{}( arpk(N, S, Timmed, Tbase) & this ) {
         void ?{}( S (&inner)[z(N)] ) {}
         ?{}(this.strides);
     }
-    void ^?{}( arpk(N, S, Timmed, Tbase) & this ) {
+    static inline void ^?{}( arpk(N, S, Timmed, Tbase) & this ) {
         void ^?{}( S (&inner)[z(N)] ) {}
         ^?{}(this.strides);
@@ -53,8 +75,8 @@
 
 forall( Te )
-Te mkar_( tag(Te) ) {}
+static inline Te mkar_( tag(Te) ) {}
 
 forall( [N], ZTags ... , Trslt &, Tatom & | { Trslt mkar_( tag(Tatom), ZTags ); } )
-arpk(N, Trslt, Trslt, Tatom) mkar_( tag(Tatom), tag(N), ZTags ) {}
+static inline arpk(N, Trslt, Trslt, Tatom) mkar_( tag(Tatom), tag(N), ZTags ) {}
 
 // based on https://stackoverflow.com/questions/1872220/is-it-possible-to-iterate-over-arguments-in-variadic-macros
@@ -90,5 +112,5 @@
 
 forall( TA &, TB &, TC &, IxAB, IxBC ... | { TB & ?[?]( TA &, IxAB ); TC & ?[?]( TB &, IxBC ); } )
-TC & ?[?]( TA & this, IxAB ab, IxBC bc ) {
+static inline TC & ?[?]( TA & this, IxAB ab, IxBC bc ) {
     return this[ab][bc];
 }
@@ -99,15 +121,15 @@
 
 forall( TA &, TB &, TC &, IxAB_0, IxBC | { TB & ?[?]( TA &, IxAB_0 ); TC & ?[?]( TB &, IxBC ); } )
-TC & ?[?]( TA & this, IxAB_0 ab, IxBC bc ) {
+static inline TC & ?[?]( TA & this, IxAB_0 ab, IxBC bc ) {
     return this[ab][bc];
 }
 
 forall( TA &, TB &, TC &, IxAB_0, IxAB_1, IxBC | { TB & ?[?]( TA &, IxAB_0, IxAB_1 ); TC & ?[?]( TB &, IxBC ); } )
-TC & ?[?]( TA & this, IxAB_0 ab0, IxAB_1 ab1, IxBC bc ) {
+static inline TC & ?[?]( TA & this, IxAB_0 ab0, IxAB_1 ab1, IxBC bc ) {
     return this[[ab0,ab1]][bc];
 }
 
 forall( TA &, TB &, TC &, IxAB_0, IxAB_1, IxAB_2, IxBC | { TB & ?[?]( TA &, IxAB_0, IxAB_1, IxAB_2 ); TC & ?[?]( TB &, IxBC ); } )
-TC & ?[?]( TA & this, IxAB_0 ab0, IxAB_1 ab1, IxAB_2 ab2, IxBC bc ) {
+static inline TC & ?[?]( TA & this, IxAB_0 ab0, IxAB_1 ab1, IxAB_2 ab2, IxBC bc ) {
     return this[[ab0,ab1,ab2]][bc];
 }
@@ -121,14 +143,14 @@
 // Base
 forall( [Nq], [Sq], Tbase & )
-tag(arpk(Nq, Sq, Tbase, Tbase)) enq_( tag(Tbase), tag(Nq), tag(Sq), tag(Tbase) ) {}
+static inline tag(arpk(Nq, Sq, Tbase, Tbase)) enq_( tag(Tbase), tag(Nq), tag(Sq), tag(Tbase) ) {}
 
 // Rec
 forall( [Nq], [Sq], [N], [S], recq &, recr &, Tbase & | { tag(recr) enq_( tag(Tbase), tag(Nq), tag(Sq), tag(recq) ); } )
-tag(arpk(N, S, recr, Tbase)) enq_( tag(Tbase), tag(Nq), tag(Sq), tag(arpk(N, S, recq, Tbase)) ) {}
+static inline tag(arpk(N, S, recr, Tbase)) enq_( tag(Tbase), tag(Nq), tag(Sq), tag(arpk(N, S, recq, Tbase)) ) {}
 
 // Wrapper
 struct all_t {} all;
 forall( [N], [S], Te &, result &, Tbase & | { tag(result) enq_( tag(Tbase), tag(N), tag(S), tag(Te) ); } )
-result & ?[?]( arpk(N, S, Te, Tbase) & this, all_t ) {
+static inline result & ?[?]( arpk(N, S, Te, Tbase) & this, all_t ) {
     return (result&) this;
 }
Index: tests/array-container/.expect/array-basic.txt
===================================================================
--- tests/array-container/.expect/array-basic.txt	(revision c0c940adccee8b334b58652795a5f0f352fc519f)
+++ tests/array-container/.expect/array-basic.txt	(revision c0c940adccee8b334b58652795a5f0f352fc519f)
@@ -0,0 +1,10 @@
+expect Ws             = 7.060606
+result Ws [][][][] lo = 7.060606
+result Ws [,,,]    lo = 7.060606
+result Ws [][][][] hi = 7.060606
+result Ws [,,,]    hi = 7.060606
+expect Xs             = 8.150808
+result Xs [][][][] lo = 8.150808
+result Xs [,,,]    lo = 8.150808
+result Xs [][][][] hi = 8.150808
+result Xs [,,,]    hi = 8.150808
Index: tests/array-container/.expect/array-basic.x64.txt
===================================================================
--- tests/array-container/.expect/array-basic.x64.txt	(revision 692f0c895772f0dbe0cfe4f849661f820c7bc5eb)
+++ 	(revision )
@@ -1,10 +1,0 @@
-expect Ws             = 7.060606
-result Ws [][][][] lo = 7.060606
-result Ws [,,,]    lo = 7.060606
-result Ws [][][][] hi = 7.060606
-result Ws [,,,]    hi = 7.060606
-expect Xs             = 8.150808
-result Xs [][][][] lo = 8.150808
-result Xs [,,,]    lo = 8.150808
-result Xs [][][][] hi = 8.150808
-result Xs [,,,]    hi = 8.150808
Index: tests/array-container/.expect/array-md-sbscr-cases.txt
===================================================================
--- tests/array-container/.expect/array-md-sbscr-cases.txt	(revision c0c940adccee8b334b58652795a5f0f352fc519f)
+++ tests/array-container/.expect/array-md-sbscr-cases.txt	(revision c0c940adccee8b334b58652795a5f0f352fc519f)
@@ -0,0 +1,1 @@
+done
Index: tests/array-container/.expect/array-md-sbscr-cases.x64.txt
===================================================================
--- tests/array-container/.expect/array-md-sbscr-cases.x64.txt	(revision 692f0c895772f0dbe0cfe4f849661f820c7bc5eb)
+++ 	(revision )
@@ -1,1 +1,0 @@
-done
