Index: benchmark/benchcltr.hfa
===================================================================
--- benchmark/benchcltr.hfa	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ benchmark/benchcltr.hfa	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -114,5 +114,5 @@
 	for() {
 		sleep(100`ms);
-		end = getTimeNsec();
+		end = timeHiRes();
 		Duration delta = end - start;
 		/*if(is_tty)*/ {
@@ -126,5 +126,5 @@
 }
 #else
-uint64_t getTimeNsec() {
+uint64_t timeHiRes() {
 	timespec curr;
 	clock_gettime( CLOCK_REALTIME, &curr );
@@ -140,5 +140,5 @@
 	for(;;) {
 		usleep(100000);
-		end = getTimeNsec();
+		end = timeHiRes();
 		uint64_t delta = end - start;
 		/*if(is_tty)*/ {
Index: benchmark/io/http/protocol.cfa
===================================================================
--- benchmark/io/http/protocol.cfa	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ benchmark/io/http/protocol.cfa	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -228,5 +228,5 @@
 
 		char buff[100];
-		Time now = getTimeNsec();
+		Time now = timeHiRes();
 		strftime( buff, 100, "%a, %d %b %Y %H:%M:%S %Z", now );
 		sout | "Updated date to '" | buff | "'";
Index: benchmark/io/readv-posix.c
===================================================================
--- benchmark/io/readv-posix.c	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ benchmark/io/readv-posix.c	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -111,5 +111,5 @@
 				printf("Starting\n");
 				bool is_tty = isatty(STDOUT_FILENO);
-				start = getTimeNsec();
+				start = timeHiRes();
 				run = true;
 
@@ -118,5 +118,5 @@
 
 				run = false;
-				end = getTimeNsec();
+				end = timeHiRes();
 				printf("\nDone\n");
 
Index: benchmark/io/readv.cfa
===================================================================
--- benchmark/io/readv.cfa	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ benchmark/io/readv.cfa	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -147,5 +147,5 @@
 				printf("Starting\n");
 				bool is_tty = isatty(STDOUT_FILENO);
-				start = getTimeNsec();
+				start = timeHiRes();
 				run = true;
 
@@ -156,5 +156,5 @@
 
 				run = false;
-				end = getTimeNsec();
+				end = timeHiRes();
 				printf("\nDone\n");
 			}
Index: benchmark/readyQ/cycle.cc
===================================================================
--- benchmark/readyQ/cycle.cc	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ benchmark/readyQ/cycle.cc	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -89,5 +89,5 @@
 
 			bool is_tty = isatty(STDOUT_FILENO);
-			start = getTimeNsec();
+			start = timeHiRes();
 
 			for(int i = 0; i < nthreads; i++) {
@@ -97,5 +97,5 @@
 
 			stop = true;
-			end = getTimeNsec();
+			end = timeHiRes();
 			printf("\nDone\n");
 
Index: benchmark/readyQ/cycle.cfa
===================================================================
--- benchmark/readyQ/cycle.cfa	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ benchmark/readyQ/cycle.cfa	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -65,5 +65,5 @@
 
 			bool is_tty = isatty(STDOUT_FILENO);
-			start = getTimeNsec();
+			start = timeHiRes();
 
 			for(i; nthreads) {
@@ -73,5 +73,5 @@
 
 			stop = true;
-			end = getTimeNsec();
+			end = timeHiRes();
 			printf("\nDone\n");
 
Index: benchmark/readyQ/cycle.cpp
===================================================================
--- benchmark/readyQ/cycle.cpp	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ benchmark/readyQ/cycle.cpp	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -93,5 +93,5 @@
 
 			bool is_tty = isatty(STDOUT_FILENO);
-			start = getTimeNsec();
+			start = timeHiRes();
 
 			for(int i = 0; i < nthreads; i++) {
@@ -101,5 +101,5 @@
 
 			stop = true;
-			end = getTimeNsec();
+			end = timeHiRes();
 			printf("\nDone\n");
 
Index: benchmark/readyQ/locality.cc
===================================================================
--- benchmark/readyQ/locality.cc	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ benchmark/readyQ/locality.cc	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -281,5 +281,5 @@
 
 			bool is_tty = isatty(STDOUT_FILENO);
-			start = getTimeNsec();
+			start = timeHiRes();
 
 			for(size_t i = 0; i < nthreads; i++) {
@@ -289,5 +289,5 @@
 
 			stop = true;
-			end = getTimeNsec();
+			end = timeHiRes();
 			printf("\nDone\n");
 
Index: benchmark/readyQ/locality.cfa
===================================================================
--- benchmark/readyQ/locality.cfa	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ benchmark/readyQ/locality.cfa	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -232,5 +232,5 @@
 
 			bool is_tty = isatty(STDOUT_FILENO);
-			start = getTimeNsec();
+			start = timeHiRes();
 
 			for(i; nthreads) {
@@ -240,5 +240,5 @@
 
 			stop = true;
-			end = getTimeNsec();
+			end = timeHiRes();
 			printf("\nDone\n");
 
Index: benchmark/readyQ/locality.cpp
===================================================================
--- benchmark/readyQ/locality.cpp	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ benchmark/readyQ/locality.cpp	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -287,5 +287,5 @@
 
 			bool is_tty = isatty(STDOUT_FILENO);
-			start = getTimeNsec();
+			start = timeHiRes();
 
 			for(size_t i = 0; i < nthreads; i++) {
@@ -295,5 +295,5 @@
 
 			stop = true;
-			end = getTimeNsec();
+			end = timeHiRes();
 			printf("\nDone\n");
 
Index: benchmark/readyQ/rq_bench.hfa
===================================================================
--- benchmark/readyQ/rq_bench.hfa	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ benchmark/readyQ/rq_bench.hfa	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -73,5 +73,5 @@
 	for() {
 		sleep(100`ms);
-		Time end = getTimeNsec();
+		Time end = timeHiRes();
 		Duration delta = end - start;
 		if(is_tty) {
Index: benchmark/readyQ/rq_bench.hpp
===================================================================
--- benchmark/readyQ/rq_bench.hpp	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ benchmark/readyQ/rq_bench.hpp	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -46,5 +46,5 @@
 	}
 
-uint64_t getTimeNsec() {
+uint64_t timeHiRes() {
 	timespec curr;
 	clock_gettime( CLOCK_REALTIME, &curr );
@@ -60,5 +60,5 @@
 	for(;;) {
 		Sleeper::usleep(100000);
-		uint64_t end = getTimeNsec();
+		uint64_t end = timeHiRes();
 		uint64_t delta = end - start;
 		if(is_tty) {
Index: benchmark/readyQ/yield.cfa
===================================================================
--- benchmark/readyQ/yield.cfa	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ benchmark/readyQ/yield.cfa	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -66,5 +66,5 @@
 
 				bool is_tty = isatty(STDOUT_FILENO);
-				start = getTimeNsec();
+				start = timeHiRes();
 				run = true;
 
@@ -75,5 +75,5 @@
 
 				run = false;
-				end = getTimeNsec();
+				end = timeHiRes();
 				printf("\nDone\n");
 			}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/links.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/links.hpp	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/links.hpp	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -117,5 +117,5 @@
 	}
 
-	long long ts() const {
+	unsigned long long ts() const {
 		return before._links.ts;
 	}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/links2.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/links2.hpp	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/links2.hpp	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -56,9 +56,11 @@
 template<typename node_t>
 class mpsc_queue : private mcs_queue<node_t> {
-	node_t * volatile head;
+	node_t * volatile _head;
 public:
-	mpsc_queue(): mcs_queue<node_t>(), head(nullptr) {}
+	mpsc_queue(): mcs_queue<node_t>(), _head(nullptr) {}
 
 	inline bool empty() const { return mcs_queue<node_t>::empty(); }
+
+	node_t * head() const { return _head; }
 
 	// Added a new element to the queue
@@ -66,5 +68,5 @@
 	inline node_t * push(node_t * elem) {
 		node_t * prev = mcs_queue<node_t>::push(elem);
-		if (!prev) head = elem;
+		if (!prev) _head = elem;
 		return prev;
 	}
@@ -75,5 +77,5 @@
 	// NOT Multi-Thread Safe
 	inline node_t * pop(node_t *& next) {
-		node_t * elem = head;
+		node_t * elem = _head;
 		// If head is empty just return
 		if (!elem) return nullptr;
@@ -81,5 +83,5 @@
 		// If there is already someone in the list, then it's easy
 		if (elem->_links.next) {
-			head = next = elem->_links.next;
+			_head = next = elem->_links.next;
 			// force memory sync
 			__atomic_thread_fence(__ATOMIC_SEQ_CST);
@@ -93,5 +95,5 @@
 			// at the CAS in advance and therefore can write to head
 			// after that point, it could overwrite the write in push
-			head = nullptr;
+			_head = nullptr;
 			next = mcs_queue<node_t>::advance(elem);
 
@@ -99,5 +101,5 @@
 			// it is the only way we can guarantee we are not overwriting
 			// a write made in push
-			if (next) head = next;
+			if (next) _head = next;
 		}
 
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/utils.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/utils.hpp	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/utils.hpp	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -11,5 +11,5 @@
 #include <sys/sysinfo.h>
 
-#include <x86intrin.h>
+// #include <x86intrin.h>
 
 // class Random {
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/work_stealing.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/work_stealing.hpp	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/work_stealing.hpp	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -15,5 +15,5 @@
 #include "snzi.hpp"
 
-#include <x86intrin.h>
+// #include <x86intrin.h>
 
 using namespace std;
@@ -28,7 +28,28 @@
 template<typename node_t>
 struct __attribute__((aligned(128))) localQ_t {
-	mpsc_queue<node_t> queue = {};
-	spinlock_t lock = {};
-	bool needs_help = true;
+	#ifdef NO_MPSC
+		intrusive_queue_t<node_t> list;
+
+		inline auto ts() { return list.ts(); }
+		inline auto lock() { return list.lock.lock(); }
+		inline auto try_lock() { return list.lock.try_lock(); }
+		inline auto unlock() { return list.lock.unlock(); }
+
+		inline auto push( node_t * node ) { return list.push( node ); }
+		inline auto pop() { return list.pop(); }
+	#else
+		mpsc_queue<node_t> queue = {};
+		spinlock_t _lock = {};
+
+		inline auto ts() { auto h = queue.head(); return h ? h->_links.ts : 0ull; }
+		inline auto lock() { return _lock.lock(); }
+		inline auto try_lock() { return _lock.try_lock(); }
+		inline auto unlock() { return _lock.unlock(); }
+
+		inline auto push( node_t * node ) { return queue.push( node ); }
+		inline auto pop() { return queue.pop(); }
+	#endif
+
+
 };
 
@@ -44,5 +65,6 @@
 	work_stealing(unsigned _numThreads, unsigned)
 		: numThreads(_numThreads * nqueues)
-		, lists(new intrusive_queue_t<node_t>[numThreads])
+		, lists(new localQ_t<node_t>[numThreads])
+		// , lists(new intrusive_queue_t<node_t>[numThreads])
 		, times(new timestamp_t[numThreads])
 		// , snzi( std::log2( numThreads / 2 ), 2 )
@@ -58,10 +80,12 @@
 
 	__attribute__((noinline, hot)) void push(node_t * node) {
-		// node->_links.ts = rdtscl();
-		node->_links.ts = 1;
+		node->_links.ts = rdtscl();
+		// node->_links.ts = 1;
 
 		auto & list = *({
 			unsigned i;
-			do {
+			#ifdef NO_MPSC
+				do {
+			#endif
 				tls.stats.push.attempt++;
 				// unsigned r = tls.rng1.next();
@@ -72,10 +96,14 @@
 					i = tls.my_queue + (r % nqueues);
 				}
-			} while(!lists[i].lock.try_lock());
+			#ifdef NO_MPSC
+				} while(!lists[i].try_lock());
+			#endif
 		 	&lists[i];
 		});
 
 		list.push( node );
-		list.lock.unlock();
+		#ifdef NO_MPSC
+			list.unlock();
+		#endif
 		// tls.rng2.set_raw_state( tls.rng1.get_raw_state());
 		// count++;
@@ -84,17 +112,36 @@
 
 	__attribute__((noinline, hot)) node_t * pop() {
-		if( tls.myfriend == outside ) {
-			auto r  = tls.rng1.next();
-			tls.myfriend = r % numThreads;
-			times[tls.myfriend].val = 0;
-		}
-		else if(times[tls.myfriend].val == 0) {
-			node_t * n = try_pop(tls.myfriend, tls.stats.pop.help);
-			tls.stats.help++;
-			tls.myfriend = outside;
-			if(n) return n;
-		}
-
 		if(tls.my_queue != outside) {
+			// if( tls.myfriend == outside ) {
+			// 	auto r  = tls.rng1.next();
+			// 	tls.myfriend = r % numThreads;
+			// 	// assert(lists[(tls.it % nqueues) + tls.my_queue].ts() >= lists[((tls.it + 1) % nqueues) + tls.my_queue].ts());
+			// 	tls.mytime = std::min(lists[(tls.it % nqueues) + tls.my_queue].ts(), lists[((tls.it + 1) % nqueues) + tls.my_queue].ts());
+			// 	// times[tls.myfriend].val = 0;
+			// 	// lists[tls.myfriend].val = 0;
+			// }
+			// // else if(times[tls.myfriend].val == 0) {
+			// // else if(lists[tls.myfriend].val == 0) {
+			// else if(times[tls.myfriend].val < tls.mytime) {
+			// // else if(times[tls.myfriend].val < lists[(tls.it % nqueues) + tls.my_queue].ts()) {
+			// 	node_t * n = try_pop(tls.myfriend, tls.stats.pop.help);
+			// 	tls.stats.help++;
+			// 	tls.myfriend = outside;
+			// 	if(n) return n;
+			// }
+			// if( tls.myfriend == outside ) {
+			// 	auto r  = tls.rng1.next();
+			// 	tls.myfriend = r % numThreads;
+			// 	tls.mytime = lists[((tls.it + 1) % nqueues) + tls.my_queue].ts();
+			// }
+			// else {
+			// 	if(times[tls.myfriend].val + 1000 < tls.mytime) {
+			// 		node_t * n = try_pop(tls.myfriend, tls.stats.pop.help);
+			// 		tls.stats.help++;
+			// 		if(n) return n;
+			// 	}
+			// 	tls.myfriend = outside;
+			// }
+
 			node_t * n = local();
 			if(n) return n;
@@ -112,6 +159,8 @@
 private:
 	inline node_t * local() {
-		// unsigned i = (tls.rng2.prev() % 4) + tls.my_queue;
 		unsigned i = (--tls.it % nqueues) + tls.my_queue;
+		node_t * n = try_pop(i, tls.stats.pop.local);
+		if(n) return n;
+		i = (--tls.it % nqueues) + tls.my_queue;
 		return try_pop(i, tls.stats.pop.local);
 	}
@@ -153,10 +202,9 @@
 
 		// If we can't get the lock, move on
-		if( !list.lock.try_lock() ) { stat.elock++; return nullptr; }
-
+		if( !list.try_lock() ) { stat.elock++; return nullptr; }
 
 		// If list is empty, unlock and retry
 		if( list.ts() == 0 ) {
-			list.lock.unlock();
+			list.unlock();
 			stat.eempty++;
 			return nullptr;
@@ -164,10 +212,15 @@
 
 		auto node = list.pop();
-		list.lock.unlock();
+		list.unlock();
 		stat.success++;
-		times[i].val = 1; //node.first->_links.ts;
-		// count--;
-		// _mm_stream_si64((long long int*)&times[i].val, node.first->_links.ts);
-		return node.first;
+		#ifdef NO_MPSC
+			// times[i].val = 1;
+			times[i].val = node.first->_links.ts;
+			// lists[i].val = node.first->_links.ts;
+			return node.first;
+		#else
+			times[i].val = node->_links.ts;
+			return node;
+		#endif
 	}
 
@@ -191,4 +244,5 @@
 		unsigned   my_queue = calc_preferred();
 		unsigned   myfriend = outside;
+		unsigned long long int mytime = 0;
 		#if defined(READ)
 			unsigned it = 0;
@@ -211,5 +265,6 @@
 private:
 	const unsigned numThreads;
-    	std::unique_ptr<intrusive_queue_t<node_t> []> lists;
+    	std::unique_ptr<localQ_t<node_t> []> lists;
+    	// std::unique_ptr<intrusive_queue_t<node_t> []> lists;
     	std::unique_ptr<timestamp_t []> times;
 	__attribute__((aligned(128))) std::atomic_size_t count;
Index: example/io/batch-readv.c
===================================================================
--- example/io/batch-readv.c	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ example/io/batch-readv.c	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -66,5 +66,5 @@
 }
 
-uint64_t getTimeNsec() {
+uint64_t timeHiRes() {
 	timespec curr;
 	clock_gettime( CLOCK_REALTIME, &curr );
@@ -163,10 +163,10 @@
 
 	printf("Running for %f second, reading %d bytes in batches of %d\n", duration, buflen, batch);
-	uint64_t start = getTimeNsec();
-	uint64_t end   = getTimeNsec();
-	uint64_t prev  = getTimeNsec();
+	uint64_t start = timeHiRes();
+	uint64_t end   = timeHiRes();
+	uint64_t prev  = timeHiRes();
 	for(;;) {
 		submit_and_drain(&iov, batch);
-		end = getTimeNsec();
+		end = timeHiRes();
 		uint64_t delta = end - start;
 		if( to_fseconds(end - prev) > 0.1 ) {
Index: libcfa/src/bits/weakso_locks.cfa
===================================================================
--- libcfa/src/bits/weakso_locks.cfa	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ libcfa/src/bits/weakso_locks.cfa	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -25,5 +25,5 @@
 void unlock( blocking_lock & ) {}
 void on_notify( blocking_lock &, struct $thread * ) {}
-size_t on_wait( blocking_lock & ) {}
+size_t on_wait( blocking_lock & ) { return 0; }
 void on_wakeup( blocking_lock &, size_t ) {}
 size_t wait_count( blocking_lock & ) { return 0; }
Index: libcfa/src/clock.hfa
===================================================================
--- libcfa/src/clock.hfa	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ libcfa/src/clock.hfa	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -10,6 +10,6 @@
 // Created On       : Thu Apr 12 14:36:06 2018
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Mon Jan  6 12:49:58 2020
-// Update Count     : 9
+// Last Modified On : Sun Apr 18 08:12:16 2021
+// Update Count     : 28
 //
 
@@ -27,63 +27,90 @@
 //######################### Clock #########################
 
-struct Clock {											// private
-	Duration offset;									// for virtual clock: contains offset from real-time
+struct Clock {											// virtual clock
+	// private
+	Duration offset;									// offset from computer real-time
 };
 
 static inline {
-	void resetClock( Clock & clk, Duration adj ) with( clk ) {
+	void reset( Clock & clk, Duration adj ) with( clk ) { // change offset
 		offset = adj + __timezone`s;					// timezone (global) is (UTC - local time) in seconds
-	} // resetClock
+	} // reset
 
-	void ?{}( Clock & clk, Duration adj ) { resetClock( clk, adj ); }
+	void ?{}( Clock & clk ) { reset( clk, (Duration){ 0 } ); } // create no offset
+	void ?{}( Clock & clk, Duration adj ) { reset( clk, adj ); } // create with offset
 
-	Duration getResNsec() {
+	// System-wide clock that measures real (i.e., wall-clock) time. This clock is affected by discontinuous jumps in
+	// the system time. For example, manual changes of the clock, and incremental adjustments performed by adjtime(3)
+	// and NTP (e.g., daylight-saving fall back).
+	Duration resolutionHi() {							// clock resolution in nanoseconds (fine)
 		struct timespec res;
 		clock_getres( CLOCK_REALTIME, &res );
 		return ((int64_t)res.tv_sec * TIMEGRAN + res.tv_nsec)`ns;
-	} // getRes
+	} // resolutionHi
 
-	Duration getRes() {
+	Duration resolution() {								// clock resolution without nanoseconds (coarse)
 		struct timespec res;
 		clock_getres( CLOCK_REALTIME_COARSE, &res );
 		return ((int64_t)res.tv_sec * TIMEGRAN + res.tv_nsec)`ns;
-	} // getRes
+	} // resolution
 
-	Time getTimeNsec() {								// with nanoseconds
+	Time timeHiRes() {									// real time with nanoseconds
 		timespec curr;
 		clock_gettime( CLOCK_REALTIME, &curr );
 		return (Time){ curr };
-	} // getTimeNsec
+	} // timeHiRes
 
-	Time getTime() {									// without nanoseconds
+	Time time() {										// real time without nanoseconds
 		timespec curr;
 		clock_gettime( CLOCK_REALTIME_COARSE, &curr );
 		curr.tv_nsec = 0;
 		return (Time){ curr };
-	} // getTime
+	} // time
 
-	Time getTime( Clock & clk ) with( clk ) {
-		return getTime() + offset;
-	} // getTime
+	Time time( Clock & clk ) with( clk ) {				// real time for given clock
+		return time() + offset;
+	} // time
 
 	Time ?()( Clock & clk ) with( clk ) {				// alternative syntax
-		return getTime() + offset;
-	} // getTime
+		return time() + offset;
+	} // ?()
 
-	timeval getTime( Clock & clk ) {
+	timeval time( Clock & clk ) {						// convert to C time format
 		return (timeval){ clk() };
-	} // getTime
+	} // time
 
-	tm getTime( Clock & clk ) with( clk ) {
+	tm time( Clock & clk ) with( clk ) {
 		tm ret;
-		localtime_r( getTime( clk ).tv_sec, &ret );
+		localtime_r( time( clk ).tv_sec, &ret );
 		return ret;
-	} // getTime
+	} // time
 
-	Time getCPUTime() {
+	// CFA processor CPU-time watch that ticks when the processor (kernel thread) is running. This watch is affected by
+	// discontinuous jumps when the OS is not running the kernal thread. A duration is returned because the value is
+	// relative and cannot be converted to real-time (wall-clock) time.
+	Duration processor() {								// non-monotonic duration of kernel thread
 		timespec ts;
 		clock_gettime( CLOCK_THREAD_CPUTIME_ID, &ts );
-		return (Time){ ts };
-    } // getCPUTime
+		return (Duration){ ts };
+	} // processor
+
+	// Program CPU-time watch measures CPU time consumed by all processors (kernel threads) in the UNIX process.  This
+	// watch is affected by discontinuous jumps when the OS is not running the kernel threads. A duration is returned
+	// because the value is relative and cannot be converted to real-time (wall-clock) time.
+	Duration program() {								// non-monotonic duration of program CPU
+		timespec ts;
+		clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &ts );
+		return (Duration){ ts };
+	} // program
+
+	// Monotonic duration from machine boot and including system suspension. This watch is unaffected by discontinuous
+	// jumps resulting from manual changes of the clock, and incremental adjustments performed by adjtime(3) and NTP
+	// (Fall back). A duration is returned because the value is relative and cannot be converted to real-time
+	// (wall-clock) time.
+	Duration boot() {									// monotonic duration since computer boot
+		timespec ts;
+		clock_gettime( CLOCK_BOOTTIME, &ts );
+		return (Duration){ ts };
+	} // boot
 } // distribution
 
Index: libcfa/src/concurrency/invoke.h
===================================================================
--- libcfa/src/concurrency/invoke.h	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ libcfa/src/concurrency/invoke.h	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -148,5 +148,4 @@
 		struct $thread * prev;
 		volatile unsigned long long ts;
-		int preferred;
 	};
 
Index: libcfa/src/concurrency/io/call.cfa.in
===================================================================
--- libcfa/src/concurrency/io/call.cfa.in	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ libcfa/src/concurrency/io/call.cfa.in	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -201,5 +201,5 @@
 
 		sqe->opcode = IORING_OP_{op};
-		sqe->user_data = (__u64)(uintptr_t)&future;
+		sqe->user_data = (uintptr_t)&future;
 		sqe->flags = sflags;
 		sqe->ioprio = 0;
@@ -215,5 +215,5 @@
 		asm volatile("": : :"memory");
 
-		verify( sqe->user_data == (__u64)(uintptr_t)&future );
+		verify( sqe->user_data == (uintptr_t)&future );
 		cfa_io_submit( ctx, &idx, 1, 0 != (submit_flags & CFA_IO_LAZY) );
 	#endif
@@ -238,5 +238,5 @@
 		'fd'  : 'fd',
 		'off' : 'offset',
-		'addr': '(__u64)iov',
+		'addr': '(uintptr_t)iov',
 		'len' : 'iovcnt',
 	}, define = 'CFA_HAVE_PREADV2'),
@@ -245,5 +245,5 @@
 		'fd'  : 'fd',
 		'off' : 'offset',
-		'addr': '(__u64)iov',
+		'addr': '(uintptr_t)iov',
 		'len' : 'iovcnt'
 	}, define = 'CFA_HAVE_PWRITEV2'),
@@ -257,5 +257,5 @@
 		'addr': 'fd',
 		'len': 'op',
-		'off': '(__u64)event'
+		'off': '(uintptr_t)event'
 	}),
 	# CFA_HAVE_IORING_OP_SYNC_FILE_RANGE
@@ -269,5 +269,5 @@
 	Call('SENDMSG', 'ssize_t sendmsg(int sockfd, const struct msghdr *msg, int flags)', {
 		'fd': 'sockfd',
-		'addr': '(__u64)(struct msghdr *)msg',
+		'addr': '(uintptr_t)(struct msghdr *)msg',
 		'len': '1',
 		'msg_flags': 'flags'
@@ -276,5 +276,5 @@
 	Call('RECVMSG', 'ssize_t recvmsg(int sockfd, struct msghdr *msg, int flags)', {
 		'fd': 'sockfd',
-		'addr': '(__u64)(struct msghdr *)msg',
+		'addr': '(uintptr_t)(struct msghdr *)msg',
 		'len': '1',
 		'msg_flags': 'flags'
@@ -283,5 +283,5 @@
 	Call('SEND', 'ssize_t send(int sockfd, const void *buf, size_t len, int flags)', {
 		'fd': 'sockfd',
-		'addr': '(__u64)buf',
+		'addr': '(uintptr_t)buf',
 		'len': 'len',
 		'msg_flags': 'flags'
@@ -290,5 +290,5 @@
 	Call('RECV', 'ssize_t recv(int sockfd, void *buf, size_t len, int flags)', {
 		'fd': 'sockfd',
-		'addr': '(__u64)buf',
+		'addr': '(uintptr_t)buf',
 		'len': 'len',
 		'msg_flags': 'flags'
@@ -297,6 +297,6 @@
 	Call('ACCEPT', 'int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags)', {
 		'fd': 'sockfd',
-		'addr': '(__u64)addr',
-		'addr2': '(__u64)addrlen',
+		'addr': '(uintptr_t)addr',
+		'addr2': '(uintptr_t)addrlen',
 		'accept_flags': 'flags'
 	}),
@@ -304,5 +304,5 @@
 	Call('CONNECT', 'int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen)', {
 		'fd': 'sockfd',
-		'addr': '(__u64)addr',
+		'addr': '(uintptr_t)addr',
 		'off': 'addrlen'
 	}),
@@ -310,5 +310,5 @@
 	Call('FALLOCATE', 'int fallocate(int fd, int mode, off_t offset, off_t len)', {
 		'fd': 'fd',
-		'addr': '(__u64)len',
+		'addr': '(uintptr_t)len',
 		'len': 'mode',
 		'off': 'offset'
@@ -323,5 +323,5 @@
 	# CFA_HAVE_IORING_OP_MADVISE
 	Call('MADVISE', 'int madvise(void *addr, size_t length, int advice)', {
-		'addr': '(__u64)addr',
+		'addr': '(uintptr_t)addr',
 		'len': 'length',
 		'fadvise_advice': 'advice'
@@ -330,5 +330,5 @@
 	Call('OPENAT', 'int openat(int dirfd, const char *pathname, int flags, mode_t mode)', {
 		'fd': 'dirfd',
-		'addr': '(__u64)pathname',
+		'addr': '(uintptr_t)pathname',
 		'len': 'mode',
 		'open_flags': 'flags;'
@@ -339,5 +339,5 @@
 		'addr': 'pathname',
 		'len': 'sizeof(*how)',
-		'off': '(__u64)how',
+		'off': '(uintptr_t)how',
 	}, define = 'CFA_HAVE_OPENAT2'),
 	# CFA_HAVE_IORING_OP_CLOSE
@@ -348,5 +348,5 @@
 	Call('STATX', 'int statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf)', {
 		'fd': 'dirfd',
-		'off': '(__u64)statxbuf',
+		'off': '(uintptr_t)statxbuf',
 		'addr': 'pathname',
 		'len': 'mask',
@@ -356,5 +356,5 @@
 	Call('READ', 'ssize_t read(int fd, void * buf, size_t count)', {
 		'fd': 'fd',
-		'addr': '(__u64)buf',
+		'addr': '(uintptr_t)buf',
 		'len': 'count'
 	}),
@@ -362,5 +362,5 @@
 	Call('WRITE', 'ssize_t write(int fd, void * buf, size_t count)', {
 		'fd': 'fd',
-		'addr': '(__u64)buf',
+		'addr': '(uintptr_t)buf',
 		'len': 'count'
 	}),
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ libcfa/src/concurrency/kernel.cfa	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -113,7 +113,7 @@
 static void __wake_one(cluster * cltr);
 
-static void push  (__cluster_idles & idles, processor & proc);
-static void remove(__cluster_idles & idles, processor & proc);
-static [unsigned idle, unsigned total, * processor] query( & __cluster_idles idles );
+static void mark_idle (__cluster_proc_list & idles, processor & proc);
+static void mark_awake(__cluster_proc_list & idles, processor & proc);
+static [unsigned idle, unsigned total, * processor] query_idles( & __cluster_proc_list idles );
 
 extern void __cfa_io_start( processor * );
@@ -189,5 +189,5 @@
 
 				// Push self to idle stack
-				push(this->cltr->idles, * this);
+				mark_idle(this->cltr->procs, * this);
 
 				// Confirm the ready-queue is empty
@@ -195,5 +195,5 @@
 				if( readyThread ) {
 					// A thread was found, cancel the halt
-					remove(this->cltr->idles, * this);
+					mark_awake(this->cltr->procs, * this);
 
 					#if !defined(__CFA_NO_STATISTICS__)
@@ -225,5 +225,5 @@
 
 				// We were woken up, remove self from idle
-				remove(this->cltr->idles, * this);
+				mark_awake(this->cltr->procs, * this);
 
 				// DON'T just proceed, start looking again
@@ -474,5 +474,5 @@
 
 	ready_schedule_lock();
-		$thread * thrd = pop( this );
+		$thread * thrd = pop_fast( this );
 	ready_schedule_unlock();
 
@@ -617,5 +617,5 @@
 	unsigned idle;
 	unsigned total;
-	[idle, total, p] = query(this->idles);
+	[idle, total, p] = query_idles(this->procs);
 
 	// If no one is sleeping, we are done
@@ -654,27 +654,30 @@
 }
 
-static void push  (__cluster_idles & this, processor & proc) {
+static void mark_idle(__cluster_proc_list & this, processor & proc) {
 	/* paranoid */ verify( ! __preemption_enabled() );
 	lock( this );
 		this.idle++;
 		/* paranoid */ verify( this.idle <= this.total );
-
-		insert_first(this.list, proc);
+		remove(proc);
+		insert_first(this.idles, proc);
 	unlock( this );
 	/* paranoid */ verify( ! __preemption_enabled() );
 }
 
-static void remove(__cluster_idles & this, processor & proc) {
+static void mark_awake(__cluster_proc_list & this, processor & proc) {
 	/* paranoid */ verify( ! __preemption_enabled() );
 	lock( this );
 		this.idle--;
 		/* paranoid */ verify( this.idle >= 0 );
-
 		remove(proc);
+		insert_last(this.actives, proc);
 	unlock( this );
 	/* paranoid */ verify( ! __preemption_enabled() );
 }
 
-static [unsigned idle, unsigned total, * processor] query( & __cluster_idles this ) {
+static [unsigned idle, unsigned total, * processor] query_idles( & __cluster_proc_list this ) {
+	/* paranoid */ verify( ! __preemption_enabled() );
+	/* paranoid */ verify( ready_schedule_islocked() );
+
 	for() {
 		uint64_t l = __atomic_load_n(&this.lock, __ATOMIC_SEQ_CST);
@@ -682,5 +685,5 @@
 		unsigned idle    = this.idle;
 		unsigned total   = this.total;
-		processor * proc = &this.list`first;
+		processor * proc = &this.idles`first;
 		// Compiler fence is unnecessary, but gcc-8 and older incorrectly reorder code without it
 		asm volatile("": : :"memory");
@@ -688,4 +691,7 @@
 		return [idle, total, proc];
 	}
+
+	/* paranoid */ verify( ready_schedule_islocked() );
+	/* paranoid */ verify( ! __preemption_enabled() );
 }
 
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ libcfa/src/concurrency/kernel.hfa	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -69,6 +69,12 @@
 	struct cluster * cltr;
 
-	// Id within the cluster
-	unsigned cltr_id;
+	// Ready Queue state per processor
+	struct {
+		unsigned short its;
+		unsigned short itr;
+		unsigned id;
+		unsigned target;
+		unsigned long long int cutoff;
+	} rdq;
 
 	// Set to true to notify the processor should terminate
@@ -140,27 +146,17 @@
 // Cluster Tools
 
-// Intrusives lanes which are used by the relaxed ready queue
+// Intrusives lanes which are used by the ready queue
 struct __attribute__((aligned(128))) __intrusive_lane_t;
 void  ?{}(__intrusive_lane_t & this);
 void ^?{}(__intrusive_lane_t & this);
 
-// Counter used for wether or not the lanes are all empty
-struct __attribute__((aligned(128))) __snzi_node_t;
-struct __snzi_t {
-	unsigned mask;
-	int root;
-	__snzi_node_t * nodes;
-};
-
-void  ?{}( __snzi_t & this, unsigned depth );
-void ^?{}( __snzi_t & this );
+// Aligned timestamps which are used by the relaxed ready queue
+struct __attribute__((aligned(128))) __timestamp_t;
+void  ?{}(__timestamp_t & this);
+void ^?{}(__timestamp_t & this);
 
 //TODO adjust cache size to ARCHITECTURE
 // Structure holding the relaxed ready queue
 struct __ready_queue_t {
-	// Data tracking how many/which lanes are used
-	// Aligned to 128 for cache locality
-	__snzi_t snzi;
-
 	// Data tracking the actual lanes
 	// On a seperate cacheline from the used struct since
@@ -171,4 +167,7 @@
 		__intrusive_lane_t * volatile data;
 
+		// Array of times
+		__timestamp_t * volatile tscs;
+
 		// Number of lanes (empty or not)
 		volatile size_t count;
@@ -180,5 +179,5 @@
 
 // Idle Sleep
-struct __cluster_idles {
+struct __cluster_proc_list {
 	// Spin lock protecting the queue
 	volatile uint64_t lock;
@@ -191,5 +190,8 @@
 
 	// List of idle processors
-	dlist(processor, processor) list;
+	dlist(processor, processor) idles;
+
+	// List of active processors
+	dlist(processor, processor) actives;
 };
 
@@ -207,5 +209,5 @@
 
 	// List of idle processors
-	__cluster_idles idles;
+	__cluster_proc_list procs;
 
 	// List of threads
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -469,4 +469,9 @@
 	this.name = name;
 	this.cltr = &_cltr;
+	this.rdq.its = 0;
+	this.rdq.itr = 0;
+	this.rdq.id  = -1u;
+	this.rdq.target = -1u;
+	this.rdq.cutoff = -1ull;
 	do_terminate = false;
 	preemption_alarm = 0p;
@@ -489,39 +494,30 @@
 	#endif
 
-	lock( this.cltr->idles );
-		int target = this.cltr->idles.total += 1u;
-	unlock( this.cltr->idles );
-
-	id = doregister((__processor_id_t*)&this);
-
+	// Register and Lock the RWlock so no-one pushes/pops while we are changing the queue
+	uint_fast32_t last_size = ready_mutate_register((__processor_id_t*)&this);
+		this.cltr->procs.total += 1u;
+		insert_last(this.cltr->procs.actives, this);
+
+		// Adjust the ready queue size
+		ready_queue_grow( cltr );
+
+	// Unlock the RWlock
+	ready_mutate_unlock( last_size );
+
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p created\n", &this);
+}
+
+// Not a ctor, it just preps the destruction but should not destroy members
+static void deinit(processor & this) {
 	// Lock the RWlock so no-one pushes/pops while we are changing the queue
 	uint_fast32_t last_size = ready_mutate_lock();
+		this.cltr->procs.total -= 1u;
+		remove(this);
 
 		// Adjust the ready queue size
-		this.cltr_id = ready_queue_grow( cltr, target );
-
-	// Unlock the RWlock
-	ready_mutate_unlock( last_size );
-
-	__cfadbg_print_safe(runtime_core, "Kernel : core %p created\n", &this);
-}
-
-// Not a ctor, it just preps the destruction but should not destroy members
-static void deinit(processor & this) {
-	lock( this.cltr->idles );
-		int target = this.cltr->idles.total -= 1u;
-	unlock( this.cltr->idles );
-
-	// Lock the RWlock so no-one pushes/pops while we are changing the queue
-	uint_fast32_t last_size = ready_mutate_lock();
-
-		// Adjust the ready queue size
-		ready_queue_shrink( this.cltr, target );
-
-	// Unlock the RWlock
-	ready_mutate_unlock( last_size );
-
-	// Finally we don't need the read_lock any more
-	unregister((__processor_id_t*)&this);
+		ready_queue_shrink( this.cltr );
+
+	// Unlock the RWlock and unregister: we don't need the read_lock any more
+	ready_mutate_unregister((__processor_id_t*)&this, last_size );
 
 	close(this.idle);
@@ -566,9 +562,8 @@
 //-----------------------------------------------------------------------------
 // Cluster
-static void ?{}(__cluster_idles & this) {
+static void ?{}(__cluster_proc_list & this) {
 	this.lock  = 0;
 	this.idle  = 0;
 	this.total = 0;
-	(this.list){};
 }
 
@@ -596,5 +591,5 @@
 
 		// Adjust the ready queue size
-		ready_queue_grow( &this, 0 );
+		ready_queue_grow( &this );
 
 	// Unlock the RWlock
@@ -611,5 +606,5 @@
 
 		// Adjust the ready queue size
-		ready_queue_shrink( &this, 0 );
+		ready_queue_shrink( &this );
 
 	// Unlock the RWlock
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -83,46 +83,10 @@
 // Cluster lock API
 //=======================================================================
-// Cells use by the reader writer lock
-// while not generic it only relies on a opaque pointer
-struct __attribute__((aligned(128))) __scheduler_lock_id_t {
-	// Spin lock used as the underlying lock
-	volatile bool lock;
-
-	// Handle pointing to the proc owning this cell
-	// Used for allocating cells and debugging
-	__processor_id_t * volatile handle;
-
-	#ifdef __CFA_WITH_VERIFY__
-		// Debug, check if this is owned for reading
-		bool owned;
-	#endif
-};
-
-static_assert( sizeof(struct __scheduler_lock_id_t) <= __alignof(struct __scheduler_lock_id_t));
-
 // Lock-Free registering/unregistering of threads
 // Register a processor to a given cluster and get its unique id in return
-unsigned doregister( struct __processor_id_t * proc );
+void register_proc_id( struct __processor_id_t * );
 
 // Unregister a processor from a given cluster using its id, getting back the original pointer
-void     unregister( struct __processor_id_t * proc );
-
-//-----------------------------------------------------------------------
-// Cluster idle lock/unlock
-static inline void lock(__cluster_idles & this) {
-	for() {
-		uint64_t l = this.lock;
-		if(
-			(0 == (l % 2))
-			&& __atomic_compare_exchange_n(&this.lock, &l, l + 1, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
-		) return;
-		Pause();
-	}
-}
-
-static inline void unlock(__cluster_idles & this) {
-	/* paranoid */ verify( 1 == (this.lock % 2) );
-	__atomic_fetch_add( &this.lock, 1, __ATOMIC_SEQ_CST );
-}
+void unregister_proc_id( struct __processor_id_t * proc );
 
 //=======================================================================
@@ -152,4 +116,22 @@
 	__atomic_store_n(ll, (bool)false, __ATOMIC_RELEASE);
 }
+
+// Cells use by the reader writer lock
+// while not generic it only relies on a opaque pointer
+struct __attribute__((aligned(128))) __scheduler_lock_id_t {
+	// Spin lock used as the underlying lock
+	volatile bool lock;
+
+	// Handle pointing to the proc owning this cell
+	// Used for allocating cells and debugging
+	__processor_id_t * volatile handle;
+
+	#ifdef __CFA_WITH_VERIFY__
+		// Debug, check if this is owned for reading
+		bool owned;
+	#endif
+};
+
+static_assert( sizeof(struct __scheduler_lock_id_t) <= __alignof(struct __scheduler_lock_id_t));
 
 //-----------------------------------------------------------------------
@@ -247,15 +229,62 @@
 void ready_mutate_unlock( uint_fast32_t /* value returned by lock */ );
 
+//-----------------------------------------------------------------------
+// Lock-Free registering/unregistering of threads
+// Register a processor to a given cluster and get its unique id in return
+// For convenience, also acquires the lock
+static inline uint_fast32_t ready_mutate_register( struct __processor_id_t * proc ) {
+	register_proc_id( proc );
+	return ready_mutate_lock();
+}
+
+// Unregister a processor from a given cluster using its id, getting back the original pointer
+// assumes the lock is acquired
+static inline void ready_mutate_unregister( struct __processor_id_t * proc, uint_fast32_t last_s ) {
+	ready_mutate_unlock( last_s );
+	unregister_proc_id( proc );
+}
+
+//-----------------------------------------------------------------------
+// Cluster idle lock/unlock
+static inline void lock(__cluster_proc_list & this) {
+	/* paranoid */ verify( ! __preemption_enabled() );
+
+	// Start by locking the global RWlock so that we know no-one is
+	// adding/removing processors while we mess with the idle lock
+	ready_schedule_lock();
+
+	// Simple counting lock, acquired, acquired by incrementing the counter
+	// to an odd number
+	for() {
+		uint64_t l = this.lock;
+		if(
+			(0 == (l % 2))
+			&& __atomic_compare_exchange_n(&this.lock, &l, l + 1, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+		) return;
+		Pause();
+	}
+
+	/* paranoid */ verify( ! __preemption_enabled() );
+}
+
+static inline void unlock(__cluster_proc_list & this) {
+	/* paranoid */ verify( ! __preemption_enabled() );
+
+	/* paranoid */ verify( 1 == (this.lock % 2) );
+	// Simple couting lock, release by incrementing to an even number
+	__atomic_fetch_add( &this.lock, 1, __ATOMIC_SEQ_CST );
+
+	// Release the global lock, which we acquired when locking
+	ready_schedule_unlock();
+
+	/* paranoid */ verify( ! __preemption_enabled() );
+}
+
 //=======================================================================
 // Ready-Queue API
 //-----------------------------------------------------------------------
-// pop thread from the ready queue of a cluster
-// returns 0p if empty
-__attribute__((hot)) bool query(struct cluster * cltr);
-
-//-----------------------------------------------------------------------
 // push thread onto a ready queue for a cluster
 // returns true if the list was previously empty, false otherwise
-__attribute__((hot)) bool push(struct cluster * cltr, struct $thread * thrd);
+__attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd);
 
 //-----------------------------------------------------------------------
@@ -263,5 +292,5 @@
 // returns 0p if empty
 // May return 0p spuriously
-__attribute__((hot)) struct $thread * pop(struct cluster * cltr);
+__attribute__((hot)) struct $thread * pop_fast(struct cluster * cltr);
 
 //-----------------------------------------------------------------------
@@ -272,15 +301,10 @@
 
 //-----------------------------------------------------------------------
-// remove thread from the ready queue of a cluster
-// returns bool if it wasn't found
-bool remove_head(struct cluster * cltr, struct $thread * thrd);
-
-//-----------------------------------------------------------------------
 // Increase the width of the ready queue (number of lanes) by 4
-unsigned ready_queue_grow  (struct cluster * cltr, int target);
+void ready_queue_grow  (struct cluster * cltr);
 
 //-----------------------------------------------------------------------
 // Decrease the width of the ready queue (number of lanes) by 4
-void ready_queue_shrink(struct cluster * cltr, int target);
+void ready_queue_shrink(struct cluster * cltr);
 
 
Index: libcfa/src/concurrency/preemption.cfa
===================================================================
--- libcfa/src/concurrency/preemption.cfa	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ libcfa/src/concurrency/preemption.cfa	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -712,5 +712,5 @@
 static void * alarm_loop( __attribute__((unused)) void * args ) {
 	__processor_id_t id;
-	id.id = doregister(&id);
+	register_proc_id(&id);
 	__cfaabi_tls.this_proc_id = &id;
 
@@ -773,5 +773,5 @@
 EXIT:
 	__cfaabi_dbg_print_safe( "Kernel : Preemption thread stopping\n" );
-	unregister(&id);
+	unregister_proc_id(&id);
 
 	return 0p;
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -17,6 +17,8 @@
 // #define __CFA_DEBUG_PRINT_READY_QUEUE__
 
-// #define USE_SNZI
 // #define USE_MPSC
+
+#define USE_RELAXED_FIFO
+// #define USE_WORK_STEALING
 
 #include "bits/defs.hfa"
@@ -29,8 +31,13 @@
 #include <unistd.h>
 
-#include "snzi.hfa"
 #include "ready_subqueue.hfa"
 
 static const size_t cache_line_size = 64;
+
+#if !defined(__CFA_NO_STATISTICS__)
+	#define __STATS(...) __VA_ARGS__
+#else
+	#define __STATS(...)
+#endif
 
 // No overriden function, no environment variable, no define
@@ -40,5 +47,20 @@
 #endif
 
-#define BIAS 4
+#if   defined(USE_RELAXED_FIFO)
+	#define BIAS 4
+	#define READYQ_SHARD_FACTOR 4
+	#define SEQUENTIAL_SHARD 1
+#elif defined(USE_WORK_STEALING)
+	#define READYQ_SHARD_FACTOR 2
+	#define SEQUENTIAL_SHARD 2
+#else
+	#error no scheduling strategy selected
+#endif
+
+static inline struct $thread * try_pop(struct cluster * cltr, unsigned w __STATS(, __stats_readyQ_pop_t & stats));
+static inline struct $thread * try_pop(struct cluster * cltr, unsigned i, unsigned j __STATS(, __stats_readyQ_pop_t & stats));
+static inline struct $thread * search(struct cluster * cltr);
+static inline [unsigned, bool] idx_from_r(unsigned r, unsigned preferred);
+
 
 // returns the maximum number of processors the RWLock support
@@ -94,5 +116,5 @@
 //=======================================================================
 // Lock-Free registering/unregistering of threads
-unsigned doregister( struct __processor_id_t * proc ) with(*__scheduler_lock) {
+void register_proc_id( struct __processor_id_t * proc ) with(*__scheduler_lock) {
 	__cfadbg_print_safe(ready_queue, "Kernel : Registering proc %p for RW-Lock\n", proc);
 
@@ -108,5 +130,5 @@
 			/*paranoid*/ verify(0 == (__alignof__(data[i]) % cache_line_size));
 			/*paranoid*/ verify((((uintptr_t)&data[i]) % cache_line_size) == 0);
-			return i;
+			proc->id = i;
 		}
 	}
@@ -135,8 +157,8 @@
 	/*paranoid*/ verify(__alignof__(data[n]) == (2 * cache_line_size));
 	/*paranoid*/ verify((((uintptr_t)&data[n]) % cache_line_size) == 0);
-	return n;
-}
-
-void unregister( struct __processor_id_t * proc ) with(*__scheduler_lock) {
+	proc->id = n;
+}
+
+void unregister_proc_id( struct __processor_id_t * proc ) with(*__scheduler_lock) {
 	unsigned id = proc->id;
 	/*paranoid*/ verify(id < ready);
@@ -193,31 +215,25 @@
 
 //=======================================================================
-// Cforall Reqdy Queue used for scheduling
+// Cforall Ready Queue used for scheduling
 //=======================================================================
 void ?{}(__ready_queue_t & this) with (this) {
 	lanes.data  = 0p;
+	lanes.tscs  = 0p;
 	lanes.count = 0;
 }
 
 void ^?{}(__ready_queue_t & this) with (this) {
-	verify( 1 == lanes.count );
-	#ifdef USE_SNZI
-		verify( !query( snzi ) );
-	#endif
+	verify( SEQUENTIAL_SHARD == lanes.count );
 	free(lanes.data);
+	free(lanes.tscs);
 }
 
 //-----------------------------------------------------------------------
-__attribute__((hot)) bool query(struct cluster * cltr) {
-	#ifdef USE_SNZI
-		return query(cltr->ready_queue.snzi);
-	#endif
-	return true;
-}
-
-static inline [unsigned, bool] idx_from_r(unsigned r, unsigned preferred) {
-	unsigned i;
-	bool local;
-	#if defined(BIAS)
+#if defined(USE_RELAXED_FIFO)
+	//-----------------------------------------------------------------------
+	// get index from random number with or without bias towards queues
+	static inline [unsigned, bool] idx_from_r(unsigned r, unsigned preferred) {
+		unsigned i;
+		bool local;
 		unsigned rlow  = r % BIAS;
 		unsigned rhigh = r / BIAS;
@@ -225,5 +241,5 @@
 			// (BIAS - 1) out of BIAS chances
 			// Use perferred queues
-			i = preferred + (rhigh % 4);
+			i = preferred + (rhigh % READYQ_SHARD_FACTOR);
 			local = true;
 		}
@@ -234,160 +250,254 @@
 			local = false;
 		}
-	#else
-		i = r;
-		local = false;
-	#endif
-	return [i, local];
-}
-
-//-----------------------------------------------------------------------
-__attribute__((hot)) bool push(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) {
-	__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
-
-	const bool external = (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
-
-	// write timestamp
-	thrd->link.ts = rdtscl();
-
-	bool first = false;
-	__attribute__((unused)) bool local;
-	__attribute__((unused)) int preferred;
-	#if defined(BIAS)
-		preferred =
-			//*
-			external ? -1 : kernelTLS().this_processor->cltr_id;
-			/*/
-			thrd->link.preferred * 4;
-			//*/
-	#endif
-
-	// Try to pick a lane and lock it
-	unsigned i;
-	do {
-		// Pick the index of a lane
-		// unsigned r = __tls_rand();
-		unsigned r = __tls_rand_fwd();
-		[i, local] = idx_from_r(r, preferred);
-
-		i %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
-
+		return [i, local];
+	}
+
+	__attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) {
+		__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
+
+		const bool external = (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
+		/* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );
+
+		// write timestamp
+		thrd->link.ts = rdtscl();
+
+		bool local;
+		int preferred = external ? -1 : kernelTLS().this_processor->rdq.id;
+
+		// Try to pick a lane and lock it
+		unsigned i;
+		do {
+			// Pick the index of a lane
+			unsigned r = __tls_rand_fwd();
+			[i, local] = idx_from_r(r, preferred);
+
+			i %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+
+			#if !defined(__CFA_NO_STATISTICS__)
+				if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.attempt, 1, __ATOMIC_RELAXED);
+				else if(local) __tls_stats()->ready.push.local.attempt++;
+				else __tls_stats()->ready.push.share.attempt++;
+			#endif
+
+		#if defined(USE_MPSC)
+			// mpsc always succeeds
+		} while( false );
+		#else
+			// If we can't lock it retry
+		} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+		#endif
+
+		// Actually push it
+		push(lanes.data[i], thrd);
+
+		#if !defined(USE_MPSC)
+			// Unlock and return
+			__atomic_unlock( &lanes.data[i].lock );
+		#endif
+
+		// Mark the current index in the tls rng instance as having an item
+		__tls_rand_advance_bck();
+
+		__cfadbg_print_safe(ready_queue, "Kernel : Pushed %p on cluster %p (idx: %u, mask %llu, first %d)\n", thrd, cltr, i, used.mask[0], lane_first);
+
+		// Update statistics
 		#if !defined(__CFA_NO_STATISTICS__)
-			if(external) {
-				if(local) __atomic_fetch_add(&cltr->stats->ready.pick.ext.local, 1, __ATOMIC_RELAXED);
-				__atomic_fetch_add(&cltr->stats->ready.pick.ext.attempt, 1, __ATOMIC_RELAXED);
+			if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED);
+			else if(local) __tls_stats()->ready.push.local.success++;
+			else __tls_stats()->ready.push.share.success++;
+		#endif
+	}
+
+	// Pop from the ready queue from a given cluster
+	__attribute__((hot)) $thread * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
+		/* paranoid */ verify( lanes.count > 0 );
+		/* paranoid */ verify( kernelTLS().this_processor );
+		/* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes.count );
+
+		unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+		int preferred = kernelTLS().this_processor->rdq.id;
+
+
+		// As long as the list is not empty, try finding a lane that isn't empty and pop from it
+		for(25) {
+			// Pick two lists at random
+			unsigned ri = __tls_rand_bck();
+			unsigned rj = __tls_rand_bck();
+
+			unsigned i, j;
+			__attribute__((unused)) bool locali, localj;
+			[i, locali] = idx_from_r(ri, preferred);
+			[j, localj] = idx_from_r(rj, preferred);
+
+			i %= count;
+			j %= count;
+
+			// try popping from the 2 picked lists
+			struct $thread * thrd = try_pop(cltr, i, j __STATS(, *(locali || localj ? &__tls_stats()->ready.pop.local : &__tls_stats()->ready.pop.help)));
+			if(thrd) {
+				return thrd;
+			}
+		}
+
+		// All lanes where empty return 0p
+		return 0p;
+	}
+
+	__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr) {
+		return search(cltr);
+	}
+#endif
+#if defined(USE_WORK_STEALING)
+	__attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) {
+		__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
+
+		const bool external = (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
+		/* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );
+
+		// write timestamp
+		thrd->link.ts = rdtscl();
+
+		// Try to pick a lane and lock it
+		unsigned i;
+		do {
+			#if !defined(__CFA_NO_STATISTICS__)
+				if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.attempt, 1, __ATOMIC_RELAXED);
+				else __tls_stats()->ready.push.local.attempt++;
+			#endif
+
+			if(unlikely(external)) {
+				i = __tls_rand() % lanes.count;
 			}
 			else {
-				if(local) __tls_stats()->ready.pick.push.local++;
-				__tls_stats()->ready.pick.push.attempt++;
+				processor * proc = kernelTLS().this_processor;
+				unsigned r = proc->rdq.its++;
+				i =  proc->rdq.id + (r % READYQ_SHARD_FACTOR);
 			}
+
+
+		#if defined(USE_MPSC)
+			// mpsc always succeeds
+		} while( false );
+		#else
+			// If we can't lock it retry
+		} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
 		#endif
 
-	#if defined(USE_MPSC)
-		// mpsc always succeeds
-	} while( false );
-	#else
-		// If we can't lock it retry
-	} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+		// Actually push it
+		push(lanes.data[i], thrd);
+
+		#if !defined(USE_MPSC)
+			// Unlock and return
+			__atomic_unlock( &lanes.data[i].lock );
+		#endif
+
+		#if !defined(__CFA_NO_STATISTICS__)
+			if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED);
+			else __tls_stats()->ready.push.local.success++;
+		#endif
+
+		__cfadbg_print_safe(ready_queue, "Kernel : Pushed %p on cluster %p (idx: %u, mask %llu, first %d)\n", thrd, cltr, i, used.mask[0], lane_first);
+	}
+
+	// Pop from the ready queue from a given cluster
+	__attribute__((hot)) $thread * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
+		/* paranoid */ verify( lanes.count > 0 );
+		/* paranoid */ verify( kernelTLS().this_processor );
+		/* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes.count );
+
+		processor * proc = kernelTLS().this_processor;
+
+		if(proc->rdq.target == -1u) {
+			proc->rdq.target = __tls_rand() % lanes.count;
+			unsigned it1  = proc->rdq.itr;
+			unsigned it2  = proc->rdq.itr + 1;
+			unsigned idx1 = proc->rdq.id + (it1 % READYQ_SHARD_FACTOR);
+			unsigned idx2 = proc->rdq.id + (it2 % READYQ_SHARD_FACTOR);
+			unsigned long long tsc1 = ts(lanes.data[idx1]);
+			unsigned long long tsc2 = ts(lanes.data[idx2]);
+			proc->rdq.cutoff = min(tsc1, tsc2);
+		}
+		else if(lanes.tscs[proc->rdq.target].tv < proc->rdq.cutoff) {
+			$thread * t = try_pop(cltr, proc->rdq.target __STATS(, __tls_stats()->ready.pop.help));
+			proc->rdq.target = -1u;
+			if(t) return t;
+		}
+
+		for(READYQ_SHARD_FACTOR) {
+			unsigned i = proc->rdq.id + (--proc->rdq.itr % READYQ_SHARD_FACTOR);
+			if($thread * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t;
+		}
+		return 0p;
+	}
+
+	__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
+		for(25) {
+			unsigned i = __tls_rand() % lanes.count;
+			$thread * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.steal));
+			if(t) return t;
+		}
+
+		return search(cltr);
+	}
+#endif
+
+//=======================================================================
+// Various Ready Queue utilities
+//=======================================================================
+// these function work the same or almost the same
+// whether they are using work-stealing or relaxed fifo scheduling
+
+//-----------------------------------------------------------------------
+// try to pop from a lane given by index w
+static inline struct $thread * try_pop(struct cluster * cltr, unsigned w __STATS(, __stats_readyQ_pop_t & stats)) with (cltr->ready_queue) {
+	__STATS( stats.attempt++; )
+
+	// Get relevant elements locally
+	__intrusive_lane_t & lane = lanes.data[w];
+
+	// If list looks empty retry
+	if( is_empty(lane) ) {
+		__STATS( stats.espec++; )
+		return 0p;
+	}
+
+	// If we can't get the lock retry
+	if( !__atomic_try_acquire(&lane.lock) ) {
+		__STATS( stats.elock++; )
+		return 0p;
+	}
+
+	// If list is empty, unlock and retry
+	if( is_empty(lane) ) {
+		__atomic_unlock(&lane.lock);
+		__STATS( stats.eempty++; )
+		return 0p;
+	}
+
+	// Actually pop the list
+	struct $thread * thrd;
+	thrd = pop(lane);
+
+	/* paranoid */ verify(thrd);
+	/* paranoid */ verify(lane.lock);
+
+	// Unlock and return
+	__atomic_unlock(&lane.lock);
+
+	// Update statistics
+	__STATS( stats.success++; )
+
+	#if defined(USE_WORK_STEALING)
+		lanes.tscs[w].tv = thrd->link.ts;
 	#endif
 
-	// Actually push it
-	#ifdef USE_SNZI
-		bool lane_first =
-	#endif
-
-	push(lanes.data[i], thrd);
-
-	#ifdef USE_SNZI
-		// If this lane used to be empty we need to do more
-		if(lane_first) {
-			// Check if the entire queue used to be empty
-			first = !query(snzi);
-
-			// Update the snzi
-			arrive( snzi, i );
-		}
-	#endif
-
-	#if !defined(USE_MPSC)
-		// Unlock and return
-		__atomic_unlock( &lanes.data[i].lock );
-	#endif
-
-	// Mark the current index in the tls rng instance as having an item
-	__tls_rand_advance_bck();
-
-	__cfadbg_print_safe(ready_queue, "Kernel : Pushed %p on cluster %p (idx: %u, mask %llu, first %d)\n", thrd, cltr, i, used.mask[0], lane_first);
-
-	// Update statistics
-	#if !defined(__CFA_NO_STATISTICS__)
-		if(external) {
-			if(local) __atomic_fetch_add(&cltr->stats->ready.pick.ext.lsuccess, 1, __ATOMIC_RELAXED);
-			__atomic_fetch_add(&cltr->stats->ready.pick.ext.success, 1, __ATOMIC_RELAXED);
-		}
-		else {
-			if(local) __tls_stats()->ready.pick.push.lsuccess++;
-			__tls_stats()->ready.pick.push.success++;
-		}
-	#endif
-
-	// return whether or not the list was empty before this push
-	return first;
-}
-
-static struct $thread * try_pop(struct cluster * cltr, unsigned i, unsigned j);
-static struct $thread * try_pop(struct cluster * cltr, unsigned i);
-
-// Pop from the ready queue from a given cluster
-__attribute__((hot)) $thread * pop(struct cluster * cltr) with (cltr->ready_queue) {
-	/* paranoid */ verify( lanes.count > 0 );
-	unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
-	int preferred;
-	#if defined(BIAS)
-		// Don't bother trying locally too much
-		preferred = kernelTLS().this_processor->cltr_id;
-	#endif
-
-
-	// As long as the list is not empty, try finding a lane that isn't empty and pop from it
-	#ifdef USE_SNZI
-		while( query(snzi) ) {
-	#else
-		for(25) {
-	#endif
-		// Pick two lists at random
-		// unsigned ri = __tls_rand();
-		// unsigned rj = __tls_rand();
-		unsigned ri = __tls_rand_bck();
-		unsigned rj = __tls_rand_bck();
-
-		unsigned i, j;
-		__attribute__((unused)) bool locali, localj;
-		[i, locali] = idx_from_r(ri, preferred);
-		[j, localj] = idx_from_r(rj, preferred);
-
-		#if !defined(__CFA_NO_STATISTICS__)
-			if(locali && localj) {
-				__tls_stats()->ready.pick.pop.local++;
-			}
-		#endif
-
-		i %= count;
-		j %= count;
-
-		// try popping from the 2 picked lists
-		struct $thread * thrd = try_pop(cltr, i, j);
-		if(thrd) {
-			#if defined(BIAS) && !defined(__CFA_NO_STATISTICS__)
-				if( locali || localj ) __tls_stats()->ready.pick.pop.lsuccess++;
-			#endif
-			return thrd;
-		}
-	}
-
-	// All lanes where empty return 0p
-	return 0p;
-}
-
-__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
+	// return the popped thread
+	return thrd;
+}
+
+//-----------------------------------------------------------------------
+// try to pop from any lanes making sure you don't miss any threads push
+// before the start of the function
+static inline struct $thread * search(struct cluster * cltr) with (cltr->ready_queue) {
 	/* paranoid */ verify( lanes.count > 0 );
 	unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
@@ -395,5 +505,5 @@
 	for(i; count) {
 		unsigned idx = (offset + i) % count;
-		struct $thread * thrd = try_pop(cltr, idx);
+		struct $thread * thrd = try_pop(cltr, idx __STATS(, __tls_stats()->ready.pop.search));
 		if(thrd) {
 			return thrd;
@@ -405,97 +515,6 @@
 }
 
-
 //-----------------------------------------------------------------------
-// Given 2 indexes, pick the list with the oldest push an try to pop from it
-static inline struct $thread * try_pop(struct cluster * cltr, unsigned i, unsigned j) with (cltr->ready_queue) {
-	#if !defined(__CFA_NO_STATISTICS__)
-		__tls_stats()->ready.pick.pop.attempt++;
-	#endif
-
-	// Pick the bet list
-	int w = i;
-	if( __builtin_expect(!is_empty(lanes.data[j]), true) ) {
-		w = (ts(lanes.data[i]) < ts(lanes.data[j])) ? i : j;
-	}
-
-	return try_pop(cltr, w);
-}
-
-static inline struct $thread * try_pop(struct cluster * cltr, unsigned w) with (cltr->ready_queue) {
-	// Get relevant elements locally
-	__intrusive_lane_t & lane = lanes.data[w];
-
-	// If list looks empty retry
-	if( is_empty(lane) ) return 0p;
-
-	// If we can't get the lock retry
-	if( !__atomic_try_acquire(&lane.lock) ) return 0p;
-
-
-	// If list is empty, unlock and retry
-	if( is_empty(lane) ) {
-		__atomic_unlock(&lane.lock);
-		return 0p;
-	}
-
-	// Actually pop the list
-	struct $thread * thrd;
-	thrd = pop(lane);
-
-	/* paranoid */ verify(thrd);
-	/* paranoid */ verify(lane.lock);
-
-	#ifdef USE_SNZI
-		// If this was the last element in the lane
-		if(emptied) {
-			depart( snzi, w );
-		}
-	#endif
-
-	// Unlock and return
-	__atomic_unlock(&lane.lock);
-
-	// Update statistics
-	#if !defined(__CFA_NO_STATISTICS__)
-		__tls_stats()->ready.pick.pop.success++;
-	#endif
-
-	// Update the thread bias
-	thrd->link.preferred = w / 4;
-
-	// return the popped thread
-	return thrd;
-}
-//-----------------------------------------------------------------------
-
-bool remove_head(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) {
-	for(i; lanes.count) {
-		__intrusive_lane_t & lane = lanes.data[i];
-
-		bool removed = false;
-
-		__atomic_acquire(&lane.lock);
-			if(head(lane)->link.next == thrd) {
-				$thread * pthrd;
-				pthrd = pop(lane);
-
-				/* paranoid */ verify( pthrd == thrd );
-
-				removed = true;
-				#ifdef USE_SNZI
-					if(emptied) {
-						depart( snzi, i );
-					}
-				#endif
-			}
-		__atomic_unlock(&lane.lock);
-
-		if( removed ) return true;
-	}
-	return false;
-}
-
-//-----------------------------------------------------------------------
-
+// Check that all the intrusive queues in the data structure are still consistent
 static void check( __ready_queue_t & q ) with (q) {
 	#if defined(__CFA_WITH_VERIFY__) && !defined(USE_MPSC)
@@ -522,4 +541,16 @@
 }
 
+//-----------------------------------------------------------------------
+// Given 2 indexes, pick the list with the oldest push and try to pop from it
+static inline struct $thread * try_pop(struct cluster * cltr, unsigned i, unsigned j __STATS(, __stats_readyQ_pop_t & stats)) with (cltr->ready_queue) {
+	// Pick the best list
+	int w = i;
+	if( __builtin_expect(!is_empty(lanes.data[j]), true) ) {
+		w = (ts(lanes.data[i]) < ts(lanes.data[j])) ? i : j;
+	}
+
+	return try_pop(cltr, w __STATS(, stats));
+}
+
 // Call this function of the intrusive list was moved using memcpy
 // fixes the list so that the pointers back to anchors aren't left dangling
@@ -541,8 +572,34 @@
 }
 
+static void assign_list(unsigned & value, dlist(processor, processor) & list, unsigned count) {
+	processor * it = &list`first;
+	for(unsigned i = 0; i < count; i++) {
+		/* paranoid */ verifyf( it, "Unexpected null iterator, at index %u of %u\n", i, count);
+		it->rdq.id = value;
+		it->rdq.target = -1u;
+		value += READYQ_SHARD_FACTOR;
+		it = &(*it)`next;
+	}
+}
+
+static void reassign_cltr_id(struct cluster * cltr) {
+	unsigned preferred = 0;
+	assign_list(preferred, cltr->procs.actives, cltr->procs.total - cltr->procs.idle);
+	assign_list(preferred, cltr->procs.idles  , cltr->procs.idle );
+}
+
+static void fix_times( struct cluster * cltr ) with( cltr->ready_queue ) {
+	#if defined(USE_WORK_STEALING)
+		lanes.tscs = alloc(lanes.count, lanes.tscs`realloc);
+		for(i; lanes.count) {
+			lanes.tscs[i].tv = ts(lanes.data[i]);
+		}
+	#endif
+}
+
 // Grow the ready queue
-unsigned ready_queue_grow(struct cluster * cltr, int target) {
-	unsigned preferred;
+void ready_queue_grow(struct cluster * cltr) {
 	size_t ncount;
+	int target = cltr->procs.total;
 
 	/* paranoid */ verify( ready_mutate_islocked() );
@@ -554,16 +611,10 @@
 	// grow the ready queue
 	with( cltr->ready_queue ) {
-		#ifdef USE_SNZI
-			^(snzi){};
-		#endif
-
 		// Find new count
 		// Make sure we always have atleast 1 list
 		if(target >= 2) {
-			ncount = target * 4;
-			preferred = ncount - 4;
+			ncount = target * READYQ_SHARD_FACTOR;
 		} else {
-			ncount = 1;
-			preferred = 0;
+			ncount = SEQUENTIAL_SHARD;
 		}
 
@@ -583,15 +634,9 @@
 		// Update original
 		lanes.count = ncount;
-
-		#ifdef USE_SNZI
-			// Re-create the snzi
-			snzi{ log2( lanes.count / 8 ) };
-			for( idx; (size_t)lanes.count ) {
-				if( !is_empty(lanes.data[idx]) ) {
-					arrive(snzi, idx);
-				}
-			}
-		#endif
-	}
+	}
+
+	fix_times(cltr);
+
+	reassign_cltr_id(cltr);
 
 	// Make sure that everything is consistent
@@ -601,9 +646,8 @@
 
 	/* paranoid */ verify( ready_mutate_islocked() );
-	return preferred;
 }
 
 // Shrink the ready queue
-void ready_queue_shrink(struct cluster * cltr, int target) {
+void ready_queue_shrink(struct cluster * cltr) {
 	/* paranoid */ verify( ready_mutate_islocked() );
 	__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue\n");
@@ -612,9 +656,7 @@
 	/* paranoid */ check( cltr->ready_queue );
 
+	int target = cltr->procs.total;
+
 	with( cltr->ready_queue ) {
-		#ifdef USE_SNZI
-			^(snzi){};
-		#endif
-
 		// Remember old count
 		size_t ocount = lanes.count;
@@ -622,7 +664,7 @@
 		// Find new count
 		// Make sure we always have atleast 1 list
-		lanes.count = target >= 2 ? target * 4: 1;
+		lanes.count = target >= 2 ? target * READYQ_SHARD_FACTOR: SEQUENTIAL_SHARD;
 		/* paranoid */ verify( ocount >= lanes.count );
-		/* paranoid */ verify( lanes.count == target * 4 || target < 2 );
+		/* paranoid */ verify( lanes.count == target * READYQ_SHARD_FACTOR || target < 2 );
 
 		// for printing count the number of displaced threads
@@ -667,15 +709,9 @@
 			fix(lanes.data[idx]);
 		}
-
-		#ifdef USE_SNZI
-			// Re-create the snzi
-			snzi{ log2( lanes.count / 8 ) };
-			for( idx; (size_t)lanes.count ) {
-				if( !is_empty(lanes.data[idx]) ) {
-					arrive(snzi, idx);
-				}
-			}
-		#endif
-	}
+	}
+
+	fix_times(cltr);
+
+	reassign_cltr_id(cltr);
 
 	// Make sure that everything is consistent
Index: libcfa/src/concurrency/ready_subqueue.hfa
===================================================================
--- libcfa/src/concurrency/ready_subqueue.hfa	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ libcfa/src/concurrency/ready_subqueue.hfa	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -246,2 +246,10 @@
 	#endif
 }
+
+// Aligned timestamps which are used by the relaxed ready queue
+struct __attribute__((aligned(128))) __timestamp_t {
+	volatile unsigned long long tv;
+};
+
+void  ?{}(__timestamp_t & this) { this.tv = 0; }
+void ^?{}(__timestamp_t & this) {}
Index: libcfa/src/concurrency/stats.cfa
===================================================================
--- libcfa/src/concurrency/stats.cfa	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ libcfa/src/concurrency/stats.cfa	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -10,17 +10,30 @@
 #if !defined(__CFA_NO_STATISTICS__)
 	void __init_stats( struct __stats_t * stats ) {
-		stats->ready.pick.push.attempt  = 0;
-		stats->ready.pick.push.success  = 0;
-		stats->ready.pick.push.local    = 0;
-		stats->ready.pick.push.lsuccess = 0;
-		stats->ready.pick.ext.attempt  = 0;
-		stats->ready.pick.ext.success  = 0;
-		stats->ready.pick.ext.local    = 0;
-		stats->ready.pick.ext.lsuccess = 0;
-		stats->ready.pick.pop .probe    = 0;
-		stats->ready.pick.pop .attempt  = 0;
-		stats->ready.pick.pop .success  = 0;
-		stats->ready.pick.pop .local    = 0;
-		stats->ready.pick.pop .lsuccess = 0;
+		stats->ready.push.local.attempt = 0;
+		stats->ready.push.local.success = 0;
+		stats->ready.push.share.attempt = 0;
+		stats->ready.push.share.success = 0;
+		stats->ready.push.extrn.attempt = 0;
+		stats->ready.push.extrn.success = 0;
+		stats->ready.pop.local .attempt = 0;
+		stats->ready.pop.local .success = 0;
+		stats->ready.pop.local .elock   = 0;
+		stats->ready.pop.local .eempty  = 0;
+		stats->ready.pop.local .espec   = 0;
+		stats->ready.pop.help  .attempt = 0;
+		stats->ready.pop.help  .success = 0;
+		stats->ready.pop.help  .elock   = 0;
+		stats->ready.pop.help  .eempty  = 0;
+		stats->ready.pop.help  .espec   = 0;
+		stats->ready.pop.steal .attempt = 0;
+		stats->ready.pop.steal .success = 0;
+		stats->ready.pop.steal .elock   = 0;
+		stats->ready.pop.steal .eempty  = 0;
+		stats->ready.pop.steal .espec   = 0;
+		stats->ready.pop.search.attempt = 0;
+		stats->ready.pop.search.success = 0;
+		stats->ready.pop.search.elock   = 0;
+		stats->ready.pop.search.eempty  = 0;
+		stats->ready.pop.search.espec   = 0;
 		stats->ready.threads.migration = 0;
 		stats->ready.threads.threads   = 0;
@@ -54,17 +67,30 @@
 
 	void __tally_stats( struct __stats_t * cltr, struct __stats_t * proc ) {
-		__atomic_fetch_add( &cltr->ready.pick.push.attempt , proc->ready.pick.push.attempt , __ATOMIC_SEQ_CST ); proc->ready.pick.push.attempt  = 0;
-		__atomic_fetch_add( &cltr->ready.pick.push.success , proc->ready.pick.push.success , __ATOMIC_SEQ_CST ); proc->ready.pick.push.success  = 0;
-		__atomic_fetch_add( &cltr->ready.pick.push.local   , proc->ready.pick.push.local   , __ATOMIC_SEQ_CST ); proc->ready.pick.push.local    = 0;
-		__atomic_fetch_add( &cltr->ready.pick.push.lsuccess, proc->ready.pick.push.lsuccess, __ATOMIC_SEQ_CST ); proc->ready.pick.push.lsuccess = 0;
-		__atomic_fetch_add( &cltr->ready.pick.ext.attempt  , proc->ready.pick.ext.attempt  , __ATOMIC_SEQ_CST ); proc->ready.pick.ext.attempt   = 0;
-		__atomic_fetch_add( &cltr->ready.pick.ext.success  , proc->ready.pick.ext.success  , __ATOMIC_SEQ_CST ); proc->ready.pick.ext.success   = 0;
-		__atomic_fetch_add( &cltr->ready.pick.ext.local    , proc->ready.pick.ext.local    , __ATOMIC_SEQ_CST ); proc->ready.pick.ext.local     = 0;
-		__atomic_fetch_add( &cltr->ready.pick.ext.lsuccess , proc->ready.pick.ext.lsuccess , __ATOMIC_SEQ_CST ); proc->ready.pick.ext.lsuccess  = 0;
-		__atomic_fetch_add( &cltr->ready.pick.pop .probe   , proc->ready.pick.pop .probe   , __ATOMIC_SEQ_CST ); proc->ready.pick.pop .probe    = 0;
-		__atomic_fetch_add( &cltr->ready.pick.pop .attempt , proc->ready.pick.pop .attempt , __ATOMIC_SEQ_CST ); proc->ready.pick.pop .attempt  = 0;
-		__atomic_fetch_add( &cltr->ready.pick.pop .success , proc->ready.pick.pop .success , __ATOMIC_SEQ_CST ); proc->ready.pick.pop .success  = 0;
-		__atomic_fetch_add( &cltr->ready.pick.pop .local   , proc->ready.pick.pop .local   , __ATOMIC_SEQ_CST ); proc->ready.pick.pop .local    = 0;
-		__atomic_fetch_add( &cltr->ready.pick.pop .lsuccess, proc->ready.pick.pop .lsuccess, __ATOMIC_SEQ_CST ); proc->ready.pick.pop .lsuccess = 0;
+		__atomic_fetch_add( &cltr->ready.push.local.attempt, proc->ready.push.local.attempt, __ATOMIC_SEQ_CST ); proc->ready.push.local.attempt = 0;
+		__atomic_fetch_add( &cltr->ready.push.local.success, proc->ready.push.local.success, __ATOMIC_SEQ_CST ); proc->ready.push.local.success = 0;
+		__atomic_fetch_add( &cltr->ready.push.share.attempt, proc->ready.push.share.attempt, __ATOMIC_SEQ_CST ); proc->ready.push.share.attempt = 0;
+		__atomic_fetch_add( &cltr->ready.push.share.success, proc->ready.push.share.success, __ATOMIC_SEQ_CST ); proc->ready.push.share.success = 0;
+		__atomic_fetch_add( &cltr->ready.push.extrn.attempt, proc->ready.push.extrn.attempt, __ATOMIC_SEQ_CST ); proc->ready.push.extrn.attempt = 0;
+		__atomic_fetch_add( &cltr->ready.push.extrn.success, proc->ready.push.extrn.success, __ATOMIC_SEQ_CST ); proc->ready.push.extrn.success = 0;
+		__atomic_fetch_add( &cltr->ready.pop.local .attempt, proc->ready.pop.local .attempt, __ATOMIC_SEQ_CST ); proc->ready.pop.local .attempt = 0;
+		__atomic_fetch_add( &cltr->ready.pop.local .success, proc->ready.pop.local .success, __ATOMIC_SEQ_CST ); proc->ready.pop.local .success = 0;
+		__atomic_fetch_add( &cltr->ready.pop.local .elock  , proc->ready.pop.local .elock  , __ATOMIC_SEQ_CST ); proc->ready.pop.local .elock   = 0;
+		__atomic_fetch_add( &cltr->ready.pop.local .eempty , proc->ready.pop.local .eempty , __ATOMIC_SEQ_CST ); proc->ready.pop.local .eempty  = 0;
+		__atomic_fetch_add( &cltr->ready.pop.local .espec  , proc->ready.pop.local .espec  , __ATOMIC_SEQ_CST ); proc->ready.pop.local .espec   = 0;
+		__atomic_fetch_add( &cltr->ready.pop.help  .attempt, proc->ready.pop.help  .attempt, __ATOMIC_SEQ_CST ); proc->ready.pop.help  .attempt = 0;
+		__atomic_fetch_add( &cltr->ready.pop.help  .success, proc->ready.pop.help  .success, __ATOMIC_SEQ_CST ); proc->ready.pop.help  .success = 0;
+		__atomic_fetch_add( &cltr->ready.pop.help  .elock  , proc->ready.pop.help  .elock  , __ATOMIC_SEQ_CST ); proc->ready.pop.help  .elock   = 0;
+		__atomic_fetch_add( &cltr->ready.pop.help  .eempty , proc->ready.pop.help  .eempty , __ATOMIC_SEQ_CST ); proc->ready.pop.help  .eempty  = 0;
+		__atomic_fetch_add( &cltr->ready.pop.help  .espec  , proc->ready.pop.help  .espec  , __ATOMIC_SEQ_CST ); proc->ready.pop.help  .espec   = 0;
+		__atomic_fetch_add( &cltr->ready.pop.steal .attempt, proc->ready.pop.steal .attempt, __ATOMIC_SEQ_CST ); proc->ready.pop.steal .attempt = 0;
+		__atomic_fetch_add( &cltr->ready.pop.steal .success, proc->ready.pop.steal .success, __ATOMIC_SEQ_CST ); proc->ready.pop.steal .success = 0;
+		__atomic_fetch_add( &cltr->ready.pop.steal .elock  , proc->ready.pop.steal .elock  , __ATOMIC_SEQ_CST ); proc->ready.pop.steal .elock   = 0;
+		__atomic_fetch_add( &cltr->ready.pop.steal .eempty , proc->ready.pop.steal .eempty , __ATOMIC_SEQ_CST ); proc->ready.pop.steal .eempty  = 0;
+		__atomic_fetch_add( &cltr->ready.pop.steal .espec  , proc->ready.pop.steal .espec  , __ATOMIC_SEQ_CST ); proc->ready.pop.steal .espec   = 0;
+		__atomic_fetch_add( &cltr->ready.pop.search.attempt, proc->ready.pop.search.attempt, __ATOMIC_SEQ_CST ); proc->ready.pop.search.attempt = 0;
+		__atomic_fetch_add( &cltr->ready.pop.search.success, proc->ready.pop.search.success, __ATOMIC_SEQ_CST ); proc->ready.pop.search.success = 0;
+		__atomic_fetch_add( &cltr->ready.pop.search.elock  , proc->ready.pop.search.elock  , __ATOMIC_SEQ_CST ); proc->ready.pop.search.elock   = 0;
+		__atomic_fetch_add( &cltr->ready.pop.search.eempty , proc->ready.pop.search.eempty , __ATOMIC_SEQ_CST ); proc->ready.pop.search.eempty  = 0;
+		__atomic_fetch_add( &cltr->ready.pop.search.espec  , proc->ready.pop.search.espec  , __ATOMIC_SEQ_CST ); proc->ready.pop.search.espec   = 0;
 		__atomic_fetch_add( &cltr->ready.threads.migration , proc->ready.threads.migration , __ATOMIC_SEQ_CST ); proc->ready.threads.migration  = 0;
 		__atomic_fetch_add( &cltr->ready.threads.threads   , proc->ready.threads.threads   , __ATOMIC_SEQ_CST ); proc->ready.threads.threads    = 0;
@@ -95,27 +121,33 @@
 
 		if( flags & CFA_STATS_READY_Q ) {
-			double push_len = ((double)ready.pick.push.attempt) / ready.pick.push.success;
-			double ext_len  = ((double)ready.pick.ext .attempt) / ready.pick.ext .success;
-			double pop_len  = ((double)ready.pick.pop .attempt) / ready.pick.pop .success;
-
-			double lpush_len = ((double)ready.pick.push.local) / ready.pick.push.lsuccess;
-			double lext_len  = ((double)ready.pick.ext .local) / ready.pick.ext .lsuccess;
-			double lpop_len  = ((double)ready.pick.pop .local) / ready.pick.pop .lsuccess;
+			double push_len = ((double)ready.push.local.attempt + ready.push.share.attempt + ready.push.extrn.attempt) / (ready.push.local.success + ready.push.share.success + ready.push.extrn.success);
+			double sLcl_len = ready.push.local.success ? ((double)ready.push.local.attempt) / ready.push.local.success : 0;
+			double sOth_len = ready.push.share.success ? ((double)ready.push.share.attempt) / ready.push.share.success : 0;
+			double sExt_len = ready.push.extrn.success ? ((double)ready.push.extrn.attempt) / ready.push.extrn.success : 0;
+
+			double rLcl_len  = ready.pop.local .success ? ((double)ready.pop.local .attempt) / ready.pop.local .success : 0;
+			double rHlp_len  = ready.pop.help  .success ? ((double)ready.pop.help  .attempt) / ready.pop.help  .success : 0;
+			double rStl_len  = ready.pop.steal .success ? ((double)ready.pop.steal .attempt) / ready.pop.steal .success : 0;
+			double rSch_len  = ready.pop.search.success ? ((double)ready.pop.search.attempt) / ready.pop.search.success : 0;
 
 			__cfaabi_bits_print_safe( STDOUT_FILENO,
 				"----- %s \"%s\" (%p) - Ready Q Stats -----\n"
-				"- total threads  : %'15" PRIu64 "run, %'15" PRIu64 "schd (%'" PRIu64 "ext, %'" PRIu64 "mig, %'" PRId64 " )\n"
-				"- push avg probe : %'3.2lf, %'3.2lfl (%'15" PRIu64 " attempts, %'15" PRIu64 " locals)\n"
-				"- ext  avg probe : %'3.2lf, %'3.2lfl (%'15" PRIu64 " attempts, %'15" PRIu64 " locals)\n"
-				"- pop  avg probe : %'3.2lf, %'3.2lfl (%'15" PRIu64 " attempts, %'15" PRIu64 " locals)\n"
-				"- Idle Sleep     : %'15" PRIu64 "h, %'15" PRIu64 "c, %'15" PRIu64 "w, %'15" PRIu64 "e\n"
+				"- totals   : %'3" PRIu64 " run, %'3" PRIu64 " schd (%'" PRIu64 "ext, %'" PRIu64 "mig, %'" PRId64 " )\n"
+				"- push avg : %'3.2lf (l: %'3.2lf/%'" PRIu64 ", s: %'3.2lf/%'" PRIu64 ", e: %'3.2lf : %'" PRIu64 "e)\n"
+				"- local    : %'3.2lf (%'3" PRIu64 " try, %'3" PRIu64 " spc, %'3" PRIu64 " lck, %'3" PRIu64 " ept)\n"
+				"- help     : %'3.2lf (%'3" PRIu64 " try, %'3" PRIu64 " spc, %'3" PRIu64 " lck, %'3" PRIu64 " ept)\n"
+				"- steal    : %'3.2lf (%'3" PRIu64 " try, %'3" PRIu64 " spc, %'3" PRIu64 " lck, %'3" PRIu64 " ept)\n"
+				"- search   : %'3.2lf (%'3" PRIu64 " try, %'3" PRIu64 " spc, %'3" PRIu64 " lck, %'3" PRIu64 " ept)\n"
+				"- Idle Slp : %'3" PRIu64 "h, %'3" PRIu64 "c, %'3" PRIu64 "w, %'3" PRIu64 "e\n"
 				"\n"
 				, type, name, id
-				, ready.pick.pop.success
-				, ready.pick.push.success + ready.pick.ext.success
-				, ready.pick.ext.success, ready.threads.migration, ready.threads.threads
-				, push_len, lpush_len, ready.pick.push.attempt, ready.pick.push.local
-				, ext_len , lext_len , ready.pick.ext .attempt, ready.pick.ext .local
-				, pop_len , lpop_len , ready.pick.pop .attempt, ready.pick.pop .local
+				, ready.pop.local.success + ready.pop.help.success + ready.pop.steal.success + ready.pop.search.success
+				, ready.push.local.success + ready.push.share.success + ready.push.extrn.success
+				, ready.push.extrn.success, ready.threads.migration, ready.threads.threads
+				, push_len, sLcl_len, ready.push.local.attempt, sOth_len, ready.push.share.attempt, sExt_len, ready.push.extrn.attempt
+				, rLcl_len, ready.pop.local .attempt, ready.pop.local .espec, ready.pop.local .elock, ready.pop.local .eempty
+				, rHlp_len, ready.pop.help  .attempt, ready.pop.help  .espec, ready.pop.help  .elock, ready.pop.help  .eempty
+				, rStl_len, ready.pop.steal .attempt, ready.pop.steal .espec, ready.pop.steal .elock, ready.pop.steal .eempty
+				, rSch_len, ready.pop.search.attempt, ready.pop.search.espec, ready.pop.search.elock, ready.pop.search.eempty
 				, ready.sleep.halts, ready.sleep.cancels, ready.sleep.wakes, ready.sleep.exits
 			);
Index: libcfa/src/concurrency/stats.hfa
===================================================================
--- libcfa/src/concurrency/stats.hfa	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ libcfa/src/concurrency/stats.hfa	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -16,56 +16,56 @@
 	static inline void __print_stats( struct __stats_t *, int, const char *, const char *, void * ) {}
 #else
+	struct __stats_readyQ_pop_t {
+		// number of attempts at popping something
+		volatile uint64_t attempt;
 
-	struct __attribute__((aligned(64))) __stats_readQ_t {
+		// number of successes at poping
+		volatile uint64_t success;
+
+		// number of attempts failed due to the lock being held
+		volatile uint64_t elock;
+
+		// number of attempts failed due to the queue being empty (lock held)
+		volatile uint64_t eempty;
+
+		// number of attempts failed due to the queue looking empty (lock not held)
+		volatile uint64_t espec;
+	};
+
+	struct __attribute__((aligned(64))) __stats_readyQ_t {
+		// Push statistic
 		struct {
-			// Push statistic
 			struct {
-				// number of attemps at pushing something
+				// number of attemps at pushing something to preferred queues
 				volatile uint64_t attempt;
 
-				// number of successes at pushing
+				// number of successes at pushing to preferred queues
 				volatile uint64_t success;
+			}
+			// Stats for local queue within cluster
+			local,
 
-				// number of attemps at pushing something to preferred queues
-				volatile uint64_t local;
+			// Stats for non-local queues within cluster
+			share,
 
-				// number of successes at pushing to preferred queues
-				volatile uint64_t lsuccess;
-			} push;
+			// Stats from outside cluster
+			extrn;
+		} push;
 
-			struct {
-				// number of attemps at pushing something
-				volatile uint64_t attempt;
+		// Pop statistic
+		struct {
+			// pop from local queue
+			__stats_readyQ_pop_t local;
 
-				// number of successes at pushing
-				volatile uint64_t success;
+			// pop before looking at local queue
+			__stats_readyQ_pop_t help;
 
-				// number of attemps at pushing something to preferred queues
-				volatile uint64_t local;
+			// pop from some other queue
+			__stats_readyQ_pop_t steal;
 
-				// number of successes at pushing to preferred queues
-				volatile uint64_t lsuccess;
-			} ext;
+			// pop when searching queues sequentially
+			__stats_readyQ_pop_t search;
+		} pop;
 
-			// Pop statistic
-			struct {
-				// number of reads of the mask
-				// picking an empty __cfa_readyQ_mask_t counts here
-				// but not as an attempt
-				volatile uint64_t probe;
-
-				// number of attemps at poping something
-				volatile uint64_t attempt;
-
-				// number of successes at poping
-				volatile uint64_t success;
-
-				// number of attemps at poping something to preferred queues
-				volatile uint64_t local;
-
-				// number of successes at poping to preferred queues
-				volatile uint64_t lsuccess;
-			} pop;
-		} pick;
 		struct {
 			volatile uint64_t migration;
@@ -119,5 +119,5 @@
 
 	struct __attribute__((aligned(128))) __stats_t {
-		__stats_readQ_t ready;
+		__stats_readyQ_t ready;
 		#if defined(CFA_HAVE_LINUX_IO_URING_H)
 			__stats_io_t    io;
Index: libcfa/src/concurrency/thread.cfa
===================================================================
--- libcfa/src/concurrency/thread.cfa	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ libcfa/src/concurrency/thread.cfa	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -39,5 +39,4 @@
 	link.next = 0p;
 	link.prev = 0p;
-	link.preferred = -1;
 	#if defined( __CFA_WITH_VERIFY__ )
 		canary = 0x0D15EA5E0D15EA5Ep;
Index: libcfa/src/math.hfa
===================================================================
--- libcfa/src/math.hfa	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ libcfa/src/math.hfa	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -5,11 +5,11 @@
 // file "LICENCE" distributed with Cforall.
 //
-// math --
+// math.hfa --
 //
 // Author           : Peter A. Buhr
 // Created On       : Mon Apr 18 23:37:04 2016
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Mon Apr 12 18:35:39 2021
-// Update Count     : 131
+// Last Modified On : Thu Apr 15 11:47:56 2021
+// Update Count     : 132
 //
 
@@ -104,5 +104,5 @@
 	int log2( unsigned int n ) { return n == 0 ? -1 : sizeof(n) * __CHAR_BIT__ - 1 - __builtin_clz( n ); }
 	long int log2( unsigned long int n ) { return n == 0 ? -1 : sizeof(n) * __CHAR_BIT__ - 1 - __builtin_clzl( n ); }
-	long long int log2( unsigned long long int n ) { return n == 0 ? -1 : sizeof(n) * __CHAR_BIT__ - 1 - __builtin_clzl( n ); }
+	long long int log2( unsigned long long int n ) { return n == 0 ? -1 : sizeof(n) * __CHAR_BIT__ - 1 - __builtin_clzll( n ); }
 	float log2( float x ) { return log2f( x ); }
 	// extern "C" { double log2( double ); }
Index: libcfa/src/time.hfa
===================================================================
--- libcfa/src/time.hfa	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ libcfa/src/time.hfa	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -10,6 +10,6 @@
 // Created On       : Wed Mar 14 23:18:57 2018
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Wed Jun 17 16:13:00 2020
-// Update Count     : 663
+// Last Modified On : Wed Apr 14 09:30:30 2021
+// Update Count     : 664
 //
 
@@ -29,4 +29,16 @@
 static inline {
 	Duration ?=?( Duration & dur, __attribute__((unused)) zero_t ) { return dur{ 0 }; }
+
+	void ?{}( Duration & dur, timeval t ) with( dur ) { tn = (int64_t)t.tv_sec * TIMEGRAN + t.tv_usec * 1000; }
+	Duration ?=?( Duration & dur, timeval t ) with( dur ) {
+		tn = (int64_t)t.tv_sec * TIMEGRAN + t.tv_usec * (TIMEGRAN / 1_000_000LL);
+		return dur;
+	} // ?=?
+
+	void ?{}( Duration & dur, timespec t ) with( dur ) { tn = (int64_t)t.tv_sec * TIMEGRAN + t.tv_nsec; }
+	Duration ?=?( Duration & dur, timespec t ) with( dur ) {
+		tn = (int64_t)t.tv_sec * TIMEGRAN + t.tv_nsec;
+		return dur;
+	} // ?=?
 
 	Duration +?( Duration rhs ) with( rhs ) { return (Duration)@{ +tn }; }
Index: src/Parser/parser.yy
===================================================================
--- src/Parser/parser.yy	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ src/Parser/parser.yy	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -10,6 +10,6 @@
 // Created On       : Sat Sep  1 20:22:55 2001
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Thu Apr  1 14:43:24 2021
-// Update Count     : 4978
+// Last Modified On : Wed Apr 14 18:13:44 2021
+// Update Count     : 4983
 //
 
@@ -281,5 +281,6 @@
 %token ATTRIBUTE EXTENSION								// GCC
 %token IF ELSE SWITCH CASE DEFAULT DO WHILE FOR BREAK CONTINUE GOTO RETURN
-%token CHOOSE DISABLE ENABLE FALLTHRU FALLTHROUGH TRY THROW THROWRESUME AT WITH WHEN WAITFOR // CFA
+%token CHOOSE FALLTHRU FALLTHROUGH WITH WHEN WAITFOR	// CFA
+%token DISABLE ENABLE TRY THROW THROWRESUME AT			// CFA
 %token ASM												// C99, extension ISO/IEC 9899:1999 Section J.5.10(1)
 %token ALIGNAS ALIGNOF GENERIC STATICASSERT				// C11
Index: tests/.expect/math.nast.x86.txt
===================================================================
--- tests/.expect/math.nast.x86.txt	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ tests/.expect/math.nast.x86.txt	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -19,5 +19,5 @@
 log2:10 17 23
 log2:10 17 23
-log2:42 49 55
+log2:10 17 23
 log2:3. 3. 3.
 log10:2. 2. 2.
Index: tests/concurrent/futures/multi.cfa
===================================================================
--- tests/concurrent/futures/multi.cfa	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ tests/concurrent/futures/multi.cfa	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -5,10 +5,12 @@
 
 thread Server {
-	int cnt, iteration;
+	int pending, done, iteration;
 	multi_future(int) * request;
 };
 
 void ?{}( Server & this ) {
-	this.cnt = 0;
+	((thread&)this){"Server Thread"};
+	this.pending = 0;
+	this.done = 0;
 	this.iteration = 0;
 	this.request = 0p;
@@ -16,6 +18,6 @@
 
 void ^?{}( Server & mutex this ) {
-	assert(this.cnt == 0);
-    this.request = 0p;
+	assert(this.pending == 0);
+	this.request = 0p;
 }
 
@@ -24,30 +26,31 @@
 }
 
-void process( Server & mutex this ) {
-	fulfil( *this.request, this.iteration );
-	this.iteration++;
+void call( Server & mutex this ) {
+	this.pending++;
 }
 
-void call( Server & mutex this ) {
-	this.cnt++;
+void finish( Server & mutex this ) {
+	this.done++;
 }
 
-void finish( Server & mutex this ) { }
-
 void main( Server & this ) {
+	MAIN_LOOP:
 	for() {
 		waitfor( ^?{} : this ) {
 			break;
 		}
-		or when( this.cnt < NFUTURES ) waitfor( call: this ) {
-			if (this.cnt == NFUTURES) {
-				process(this);
+		or waitfor( call: this ) {
+			if (this.pending != NFUTURES) { continue MAIN_LOOP; }
+
+			this.pending = 0;
+			fulfil( *this.request, this.iteration );
+			this.iteration++;
+
+			for(NFUTURES) {
+				waitfor( finish: this );
 			}
-		}
-		or waitfor( finish: this ) {
-			if (this.cnt == NFUTURES) {
-				reset( *this.request );
-				this.cnt = 0;
-			}
+
+			reset( *this.request );
+			this.done = 0;
 		}
 	}
@@ -57,4 +60,8 @@
 Server * the_server;
 thread Worker {};
+void ?{}(Worker & this) {
+	((thread&)this){"Worker Thread"};
+}
+
 multi_future(int) * shared_future;
 
Index: tests/concurrent/spinaphore.cfa
===================================================================
--- tests/concurrent/spinaphore.cfa	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ tests/concurrent/spinaphore.cfa	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -49,10 +49,10 @@
 void main(Unblocker & this) {
 	this.sum = 0;
-	unsigned me = (unsigned)&this;
+	unsigned me = (unsigned)(uintptr_t)&this;
 	for(num_unblocks) {
 		$thread * t = V(sem, false);
 		Blocker * b = from_thread(t);
 		b->sum += me;
-		this.sum += (unsigned)b;
+		this.sum += (unsigned)(uintptr_t)b;
 		unpark(t);
 		yield(random(10));
@@ -73,10 +73,10 @@
 		for(i;num_blockers) {
 			for(num_blocks)
-				usum += (unsigned)&blockers[i];
+				usum += (unsigned)(uintptr_t)&blockers[i];
 		}
 
 		for(i;num_unblockers) {
 			for(num_unblocks)
-				bsum += (unsigned)&unblockers[i];
+				bsum += (unsigned)(uintptr_t)&unblockers[i];
 		}
 
Index: tests/time.cfa
===================================================================
--- tests/time.cfa	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ tests/time.cfa	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -10,6 +10,6 @@
 // Created On       : Tue Mar 27 17:24:56 2018
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Thu Jun 18 18:14:49 2020
-// Update Count     : 37
+// Last Modified On : Fri Apr 16 14:59:53 2021
+// Update Count     : 38
 //
 
@@ -53,5 +53,5 @@
 	// 	 | "Newfoundland" | getTime( Newfoundland )
 	// 	 | "local" | getTime()
-	// 	 | "local nsec" | getTimeNsec()
+	// 	 | "local nsec" | timeHiRes()
 	// 	 | "PST" | PST();								// getTime short form
 	// sout | nl;
Index: tools/gdb/utils-gdb.py
===================================================================
--- tools/gdb/utils-gdb.py	(revision 6a8208cbc93288ea3ca61f2a04619465c12cf317)
+++ tools/gdb/utils-gdb.py	(revision 59f3f616c737817ce7af766fa1bcf77a79d695d9)
@@ -23,5 +23,5 @@
 gdb.execute('handle SIGUSR1 nostop noprint pass')
 
-CfaTypes = collections.namedtuple('CfaTypes', 'cluster_ptr processor_ptr thread_ptr int_ptr thread_state')
+CfaTypes = collections.namedtuple('CfaTypes', 'cluster_ptr processor_ptr thread_ptr int_ptr thread_state yield_state')
 
 class ThreadInfo:
@@ -52,8 +52,9 @@
 	# GDB types for various structures/types in CFA
 	return CfaTypes(cluster_ptr = gdb.lookup_type('struct cluster').pointer(),
-				  processor_ptr = gdb.lookup_type('struct processor').pointer(),
-					 thread_ptr = gdb.lookup_type('struct $thread').pointer(),
-						int_ptr = gdb.lookup_type('int').pointer(),
-				   thread_state = gdb.lookup_type('enum __Coroutine_State'))
+		processor_ptr = gdb.lookup_type('struct processor').pointer(),
+		thread_ptr = gdb.lookup_type('struct $thread').pointer(),
+		int_ptr = gdb.lookup_type('int').pointer(),
+		thread_state = gdb.lookup_type('enum __Coroutine_State'),
+		yield_state = gdb.lookup_type('enum __Preemption_Reason'))
 
 def get_addr(addr):
@@ -371,5 +372,17 @@
 	def print_thread(self, thread, tid, marked):
 		cfa_t = get_cfa_types()
-		self.print_formatted(marked, tid, thread['self_cor']['name'].string(), str(thread['state'].cast(cfa_t.thread_state)), str(thread))
+		ys = str(thread['preempted'].cast(cfa_t.yield_state))
+		if ys == '_X15__NO_PREEMPTIONKM19__Preemption_Reason_1':
+			state = str(thread['state'].cast(cfa_t.thread_state))
+		elif ys == '_X18__ALARM_PREEMPTIONKM19__Preemption_Reason_1':
+			state = 'preempted'
+		elif ys == '_X19__MANUAL_PREEMPTIONKM19__Preemption_Reason_1':
+			state = 'yield'
+		elif ys == '_X17__POLL_PREEMPTIONKM19__Preemption_Reason_1':
+			state = 'poll'
+		else:
+			print("error: thread {} in undefined preemption state {}".format(thread, ys))
+			state = 'error'
+		self.print_formatted(marked, tid, thread['self_cor']['name'].string(), state, str(thread))
 
 	def print_threads_by_cluster(self, cluster, print_system = False):
@@ -480,17 +493,24 @@
 		context = thread['context']
 
+
+
+		# must be at frame 0 to set pc register
+		gdb.execute('select-frame 0')
+		if gdb.selected_frame().architecture().name() != 'i386:x86-64':
+			print('gdb debugging only supported for i386:x86-64 for now')
+			return
+
+		# gdb seems to handle things much better if we pretend we just entered the context switch
+		# pretend the pc is __cfactx_switch and adjust the sp, base pointer doesn't need to change
 		# lookup for sp,fp and uSwitch
-		xsp = context['SP'] + 48
+		xsp = context['SP'] + 40 # 40 = 5 64bit registers : %r15, %r14, %r13, %r12, %rbx WARNING: x64 specific
 		xfp = context['FP']
 
 		# convert string so we can strip out the address
 		try:
-			xpc = get_addr(gdb.parse_and_eval('__cfactx_switch').address + 28)
+			xpc = get_addr(gdb.parse_and_eval('__cfactx_switch').address)
 		except:
 			print("here")
 			return
-
-		# must be at frame 0 to set pc register
-		gdb.execute('select-frame 0')
 
 		# push sp, fp, pc into a global stack
@@ -503,5 +523,6 @@
 
 		# update registers for new task
-		print('switching to ')
+		# print('switching to {} ({}) : [{}, {}, {}]'.format(thread['self_cor']['name'].string(), str(thread), str(xsp), str(xfp), str(xpc)))
+		print('switching to thread {} ({})'.format(str(thread), thread['self_cor']['name'].string()))
 		gdb.execute('set $rsp={}'.format(xsp))
 		gdb.execute('set $rbp={}'.format(xfp))
@@ -552,5 +573,4 @@
 
 		argv = parse(arg)
-		print(argv)
 		if argv[0].isdigit():
 			cname = " ".join(argv[1:]) if len(argv) > 1 else None
