Index: libcfa/src/bits/random.hfa
===================================================================
--- libcfa/src/bits/random.hfa	(revision 13c5e19865b1e243b856f4a84487f29cc918f162)
+++ libcfa/src/bits/random.hfa	(revision 13c5e19865b1e243b856f4a84487f29cc918f162)
@@ -0,0 +1,57 @@
+#pragma once
+
+#include <stdint.h>
+
+//--------------------------------------------------
+typedef __uint128_t __lehmer64_state_t;
+static inline uint64_t __lehmer64( __lehmer64_state_t & state ) {
+	state *= 0xda942042e4dd58b5;
+	return state >> 64;
+}
+
+//--------------------------------------------------
+typedef uint64_t __wyhash64_state_t;
+static inline uint64_t __wyhash64( __wyhash64_state_t & state ) {
+	state += 0x60bee2bee120fc15;
+	__uint128_t tmp;
+	tmp = (__uint128_t) state * 0xa3b195354a39b70d;
+	uint64_t m1 = (tmp >> 64) ^ tmp;
+	tmp = (__uint128_t)m1 * 0x1b03738712fad5c9;
+	uint64_t m2 = (tmp >> 64) ^ tmp;
+	return m2;
+}
+
+//--------------------------------------------------
+typedef uint64_t __xorshift64_state_t;
+static inline uint64_t __xorshift64( __xorshift64_state_t & state ) {
+	uint64_t x = state;
+	x ^= x << 13;
+	x ^= x >> 7;
+	x ^= x << 17;
+	return state = x;
+}
+
+//--------------------------------------------------
+typedef struct {
+  uint32_t a, b, c, d;
+  uint32_t counter;
+} __xorwow__state_t;
+
+/* The state must be seeded so that the first four words are not all zero */
+static inline uint32_t __xorwow( __xorwow__state_t & state ) {
+	/* Algorithm "xorwow" from p. 5 of Marsaglia, "Xorshift RNGs" */
+	uint32_t t = state.d;
+
+	uint32_t const s = state.a;
+	state.d = state.c;
+	state.c = state.b;
+	state.b = s;
+
+	t ^= t >> 2;
+	t ^= t << 1;
+	t ^= s ^ (s << 4);
+	state.a = t;
+
+	state.counter += 362437;
+	return t + state.counter;
+}
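A minimal plain-C sketch of the Lehmer step above (not part of the patch; it uses a pointer instead of the Cforall reference parameter, and the seed value is an arbitrary nonzero example):

	#include <stdint.h>
	#include <stdio.h>

	typedef __uint128_t lehmer64_state_t;

	/* same update as __lehmer64 above: 128-bit multiply, keep the high 64 bits */
	static inline uint64_t lehmer64_next( lehmer64_state_t * state ) {
		*state *= 0xda942042e4dd58b5ULL;
		return (uint64_t)(*state >> 64);
	}

	int main(void) {
		lehmer64_state_t s = 0x1234567890abcdefULL;   /* any nonzero seed works */
		for ( int i = 0; i < 4; i++ )
			printf( "%016llx\n", (unsigned long long)lehmer64_next( &s ) );
		return 0;
	}

The runtime seeds kernelTLS.rand_seed itself; the literal seed here is only for illustration.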
Index: libcfa/src/concurrency/invoke.h
===================================================================
--- libcfa/src/concurrency/invoke.h	(revision b232745aa346d2a551cb84ea589c6e3eb116ad6a)
+++ libcfa/src/concurrency/invoke.h	(revision 13c5e19865b1e243b856f4a84487f29cc918f162)
@@ -57,5 +57,5 @@
 			} preemption_state;
 
-			uint32_t rand_seed;
+			__uint128_t rand_seed;
 		} kernelTLS __attribute__ ((tls_model ( "initial-exec" )));
 	}
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision b232745aa346d2a551cb84ea589c6e3eb116ad6a)
+++ libcfa/src/concurrency/io.cfa	(revision 13c5e19865b1e243b856f4a84487f29cc918f162)
@@ -167,4 +167,5 @@
 		struct {
 			struct {
+				__processor_id_t id;
 				void * stack;
 				pthread_t kthrd;
@@ -334,24 +335,30 @@
 		if( this.io->cltr_flags & CFA_CLUSTER_IO_POLLER_USER_THREAD ) {
 			with( this.io->poller.fast ) {
-				/* paranoid */ verify( this.procs.head == 0p || &this == mainCluster );
-				/* paranoid */ verify( this.idles.head == 0p || &this == mainCluster );
+				/* paranoid */ verify( this.nprocessors == 0 || &this == mainCluster );
+				/* paranoid */ verify( !ready_mutate_islocked() );
 
 				// We need to adjust the clean-up based on where the thread is
 				if( thrd.state == Ready || thrd.preempted != __NO_PREEMPTION ) {
 
-					// This is the tricky case
-					// The thread was preempted and now it is on the ready queue
-
-					/* paranoid */ verify( thrd.next != 0p );                // The thread should be the last on the list
-					/* paranoid */ verify( this.ready_queue.head == &thrd ); // The thread should be the only thing on the list
-
-					// Remove the thread from the ready queue of this cluster
-					this.ready_queue.head = 1p;
-					thrd.next = 0p;
-					__cfaabi_dbg_debug_do( thrd.unpark_stale = true );
-
-					// Fixup the thread state
-					thrd.state = Blocked;
-					thrd.preempted = __NO_PREEMPTION;
+					ready_schedule_lock( (struct __processor_id_t *)active_processor() );
+
+						// This is the tricky case
+						// The thread was preempted and now it is on the ready queue
+						// The thread should be the last on the list
+						/* paranoid */ verify( thrd.link.next != 0p );
+
+						// Remove the thread from the ready queue of this cluster
+						__attribute__((unused)) bool removed = remove_head( &this, &thrd );
+						/* paranoid */ verify( removed );
+						thrd.link.next = 0p;
+						thrd.link.prev = 0p;
+						__cfaabi_dbg_debug_do( thrd.unpark_stale = true );
+
+						// Fixup the thread state
+						thrd.state = Blocked;
+						thrd.ticket = 0;
+						thrd.preempted = __NO_PREEMPTION;
+
+					ready_schedule_unlock( (struct __processor_id_t *)active_processor() );
 
 					// Pretend like the thread was blocked all along
@@ -365,5 +372,5 @@
 					thrd.curr_cluster = active_cluster();
 
-			// unpark the fast io_poller
+					// unpark the fast io_poller
 					unpark( &thrd __cfaabi_dbg_ctx2 );
 				}
@@ -458,5 +465,6 @@
 		}
 
-		verify( (shead + ret) == *ring.submit_q.head );
+		uint32_t nhead = *ring.submit_q.head;
+		verifyf( (shead + ret) == nhead, "Expected %u got %u\n", (shead + ret), nhead );
 
 		// Release the consumed SQEs
@@ -474,8 +482,8 @@
 		// update statistics
 		#if !defined(__CFA_NO_STATISTICS__)
-			__tls_stats()->io.submit_q.stats.submit_avg.rdy += to_submit;
-			__tls_stats()->io.submit_q.stats.submit_avg.csm += ret;
-			__tls_stats()->io.submit_q.stats.submit_avg.avl += avail;
-			__tls_stats()->io.submit_q.stats.submit_avg.cnt += 1;
+			__tls_stats()->io.submit_q.submit_avg.rdy += to_submit;
+			__tls_stats()->io.submit_q.submit_avg.csm += ret;
+			__tls_stats()->io.submit_q.submit_avg.avl += avail;
+			__tls_stats()->io.submit_q.submit_avg.cnt += 1;
 		#endif
 
@@ -505,5 +513,5 @@
 			data->result = cqe.res;
 			if(!in_kernel) { unpark( data->thrd __cfaabi_dbg_ctx2 ); }
-			else         { __unpark( data->thrd __cfaabi_dbg_ctx2 ); }
+			else         { __unpark( &ring.poller.slow.id, data->thrd __cfaabi_dbg_ctx2 ); }
 		}
 
@@ -520,6 +528,14 @@
 
 	static void * __io_poller_slow( void * arg ) {
+		#if !defined( __CFA_NO_STATISTICS__ )
+			__stats_t local_stats;
+			__init_stats( &local_stats );
+			kernelTLS.this_stats = &local_stats;
+		#endif
+
 		cluster * cltr = (cluster *)arg;
 		struct __io_data & ring = *cltr->io;
+
+		ring.poller.slow.id.id = doregister( &ring.poller.slow.id );
 
 		sigset_t mask;
@@ -551,11 +567,11 @@
 				// Update statistics
 				#if !defined(__CFA_NO_STATISTICS__)
-					__tls_stats()->io.complete_q.stats.completed_avg.val += count;
-					__tls_stats()->io.complete_q.stats.completed_avg.slow_cnt += 1;
+					__tls_stats()->io.complete_q.completed_avg.val += count;
+					__tls_stats()->io.complete_q.completed_avg.slow_cnt += 1;
 				#endif
 
 				if(again) {
 					__cfadbg_print_safe(io_core, "Kernel I/O : Moving to ring %p to fast poller\n", &ring);
-					__unpark( &ring.poller.fast.thrd __cfaabi_dbg_ctx2 );
+					__unpark( &ring.poller.slow.id, &ring.poller.fast.thrd __cfaabi_dbg_ctx2 );
 					wait( ring.poller.sem );
 				}
@@ -571,6 +587,6 @@
 				// Update statistics
 				#if !defined(__CFA_NO_STATISTICS__)
-					__tls_stats()->io.complete_q.stats.completed_avg.val += count;
-					__tls_stats()->io.complete_q.stats.completed_avg.slow_cnt += 1;
+					__tls_stats()->io.complete_q.completed_avg.val += count;
+					__tls_stats()->io.complete_q.completed_avg.slow_cnt += 1;
 				#endif
 			}
@@ -578,4 +594,6 @@
 
 		__cfadbg_print_safe(io_core, "Kernel I/O : Slow poller for ring %p stopping\n", &ring);
+
+		unregister( &ring.poller.slow.id );
 
 		return 0p;
@@ -598,13 +616,15 @@
 			int count;
 			bool again;
-			[count, again] = __drain_io( *this.ring, 0p, 0, false );
-
-			if(!again) reset++;
-
-			// Update statistics
-			#if !defined(__CFA_NO_STATISTICS__)
-				__tls_stats()->io.complete_q.stats.completed_avg.val += count;
-				__tls_stats()->io.complete_q.stats.completed_avg.fast_cnt += 1;
-			#endif
+			disable_interrupts();
+				[count, again] = __drain_io( *this.ring, 0p, 0, false );
+
+				if(!again) reset++;
+
+				// Update statistics
+				#if !defined(__CFA_NO_STATISTICS__)
+					__tls_stats()->io.complete_q.completed_avg.val += count;
+					__tls_stats()->io.complete_q.completed_avg.fast_cnt += 1;
+				#endif
+			enable_interrupts( __cfaabi_dbg_ctx );
 
 			// If we got something, just yield and check again
@@ -667,4 +687,6 @@
 		verify( data != 0 );
 
+		disable_interrupts();
+
 		// Prepare the data we need
 		__attribute((unused)) int len   = 0;
@@ -675,5 +697,5 @@
 
 		// Loop around looking for an available spot
-		LOOKING: for() {
+		for() {
 			// Look through the list starting at some offset
 			for(i; cnt) {
@@ -688,8 +710,10 @@
 					// update statistics
 					#if !defined(__CFA_NO_STATISTICS__)
-						__tls_stats()->io.submit_q.stats.alloc_avg.val   += len;
-						__tls_stats()->io.submit_q.stats.alloc_avg.block += block;
-						__tls_stats()->io.submit_q.stats.alloc_avg.cnt   += 1;
+						__tls_stats()->io.submit_q.alloc_avg.val   += len;
+						__tls_stats()->io.submit_q.alloc_avg.block += block;
+						__tls_stats()->io.submit_q.alloc_avg.cnt   += 1;
 					#endif
+
+					enable_interrupts( __cfaabi_dbg_ctx );
 
 					// Success return the data
@@ -710,4 +734,6 @@
 		uint32_t * const tail = ring.submit_q.tail;
 		const uint32_t mask = *ring.submit_q.mask;
+
+		disable_interrupts();
 
 		// There are 2 submission schemes, check which one we are using
@@ -743,7 +769,7 @@
 			// update statistics
 			#if !defined(__CFA_NO_STATISTICS__)
-				__tls_stats()->io.submit_q.stats.look_avg.val   += len;
-				__tls_stats()->io.submit_q.stats.look_avg.block += block;
-				__tls_stats()->io.submit_q.stats.look_avg.cnt   += 1;
+				__tls_stats()->io.submit_q.look_avg.val   += len;
+				__tls_stats()->io.submit_q.look_avg.block += block;
+				__tls_stats()->io.submit_q.look_avg.cnt   += 1;
 			#endif
 
@@ -772,6 +798,6 @@
 			// update statistics
 			#if !defined(__CFA_NO_STATISTICS__)
-				__tls_stats()->io.submit_q.stats.submit_avg.csm += 1;
-				__tls_stats()->io.submit_q.stats.submit_avg.cnt += 1;
+				__tls_stats()->io.submit_q.submit_avg.csm += 1;
+				__tls_stats()->io.submit_q.submit_avg.cnt += 1;
 			#endif
 
@@ -780,4 +806,6 @@
 			__cfadbg_print_safe( io, "Kernel I/O : Performed io_submit for %p, returned %d\n", active_thread(), ret );
 		}
+
+		enable_interrupts( __cfaabi_dbg_ctx );
 	}
 
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision b232745aa346d2a551cb84ea589c6e3eb116ad6a)
+++ libcfa/src/concurrency/kernel.cfa	(revision 13c5e19865b1e243b856f4a84487f29cc918f162)
@@ -229,7 +229,7 @@
 static void * __invoke_processor(void * arg);
 
-void ?{}(processor & this, const char name[], cluster & cltr) with( this ) {
+void ?{}(processor & this, const char name[], cluster & _cltr) with( this ) {
 	this.name = name;
-	this.cltr = &cltr;
+	this.cltr = &_cltr;
 	id = -1u;
 	terminated{ 0 };
@@ -245,4 +245,5 @@
 
 	this.stack = __create_pthread( &this.kernel_thread, __invoke_processor, (void *)&this );
+	__atomic_fetch_add( &cltr->nprocessors, 1u, __ATOMIC_SEQ_CST );
 
 	__cfadbg_print_safe(runtime_core, "Kernel : core %p created\n", &this);
@@ -264,4 +265,6 @@
 
 	free( this.stack );
+
+	__atomic_fetch_sub( &cltr->nprocessors, 1u, __ATOMIC_SEQ_CST );
 }
 
@@ -269,4 +272,5 @@
 	this.name = name;
 	this.preemption_rate = preemption_rate;
+	this.nprocessors = 0;
 	ready_queue{};
 
@@ -324,6 +328,4 @@
 	}
 
-	doregister(this->cltr, this);
-
 	{
 		// Setup preemption data
@@ -357,6 +359,4 @@
 		__cfadbg_print_safe(runtime_core, "Kernel : core %p stopping\n", this);
 	}
-
-	unregister(this->cltr, this);
 
 	V( this->terminated );
@@ -845,4 +845,6 @@
 		runner{ &this };
 		__cfadbg_print_safe(runtime_core, "Kernel : constructed main processor context %p\n", &runner);
+
+		__atomic_fetch_add( &cltr->nprocessors, 1u, __ATOMIC_SEQ_CST );
 	}
 
@@ -918,4 +920,5 @@
 	void ^?{}(processor & this) with( this ){
 		/* paranoid */ verify( this.do_terminate == true );
+		__atomic_fetch_sub( &cltr->nprocessors, 1u, __ATOMIC_SEQ_CST );
 		__cfaabi_dbg_print_safe("Kernel : destroyed main processor context %p\n", &runner);
 	}
@@ -1150,18 +1153,4 @@
 }
 
-void doregister( cluster * cltr, processor * proc ) {
-	// lock      (cltr->idle_lock __cfaabi_dbg_ctx2);
-	// cltr->nprocessors += 1;
-	// push_front(cltr->procs, *proc);
-	// unlock    (cltr->idle_lock);
-}
-
-void unregister( cluster * cltr, processor * proc ) {
-	// lock  (cltr->idle_lock __cfaabi_dbg_ctx2);
-	// remove(cltr->procs, *proc );
-	// cltr->nprocessors -= 1;
-	// unlock(cltr->idle_lock);
-}
-
 //-----------------------------------------------------------------------------
 // Debug
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision b232745aa346d2a551cb84ea589c6e3eb116ad6a)
+++ libcfa/src/concurrency/kernel.hfa	(revision 13c5e19865b1e243b856f4a84487f29cc918f162)
@@ -186,5 +186,5 @@
 	// List of idle processors
 	StackLF(processor) idles;
-	unsigned int nprocessors;
+	volatile unsigned int nprocessors;
 
 	// List of threads
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision b232745aa346d2a551cb84ea589c6e3eb116ad6a)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision 13c5e19865b1e243b856f4a84487f29cc918f162)
@@ -22,4 +22,6 @@
 #include "stats.hfa"
 
+#include "bits/random.hfa"
+
 
 //-----------------------------------------------------------------------------
@@ -89,9 +91,10 @@
 #define KERNEL_STORAGE(T,X) __attribute((aligned(__alignof__(T)))) static char storage_##X[sizeof(T)]
 
-static inline uint32_t __tls_rand() {
-	kernelTLS.rand_seed ^= kernelTLS.rand_seed << 6;
-	kernelTLS.rand_seed ^= kernelTLS.rand_seed >> 21;
-	kernelTLS.rand_seed ^= kernelTLS.rand_seed << 7;
-	return kernelTLS.rand_seed;
+static inline uint64_t __tls_rand() {
+	// kernelTLS.rand_seed ^= kernelTLS.rand_seed << 6;
+	// kernelTLS.rand_seed ^= kernelTLS.rand_seed >> 21;
+	// kernelTLS.rand_seed ^= kernelTLS.rand_seed << 7;
+	// return kernelTLS.rand_seed;
+	return __lehmer64( kernelTLS.rand_seed );
 }
 
@@ -102,7 +105,4 @@
 void doregister( struct cluster * cltr, struct $thread & thrd );
 void unregister( struct cluster * cltr, struct $thread & thrd );
-
-void doregister( struct cluster * cltr, struct processor * proc );
-void unregister( struct cluster * cltr, struct processor * proc );
 
 //=======================================================================
@@ -264,4 +264,9 @@
 
 //-----------------------------------------------------------------------
+// remove thread from the ready queue of a cluster
+// returns true if the thread was removed, false if it was not found
+bool remove_head(struct cluster * cltr, struct $thread * thrd);
+
+//-----------------------------------------------------------------------
 // Increase the width of the ready queue (number of lanes) by 4
 void ready_queue_grow  (struct cluster * cltr);
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision b232745aa346d2a551cb84ea589c6e3eb116ad6a)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision 13c5e19865b1e243b856f4a84487f29cc918f162)
@@ -26,4 +26,7 @@
 #include <unistd.h>
 
+#include "snzi.hfa"
+#include "ready_subqueue.hfa"
+
 static const size_t cache_line_size = 64;
 
@@ -34,5 +37,5 @@
 #endif
 
-#define BIAS 64
+#define BIAS 8
 
 // returns the maximum number of processors the RWLock support
@@ -180,366 +183,6 @@
 
 //=======================================================================
-// Intrusive Queue used by ready queue
+// Cforall Ready Queue used for scheduling
 //=======================================================================
-// Intrusives lanes which are used by the relaxed ready queue
-struct __attribute__((aligned(128))) __intrusive_lane_t {
-	// spin lock protecting the queue
-	volatile bool lock;
-
-	// anchor for the head and the tail of the queue
-	struct __sentinel_t {
-		// Link lists fields
-		// instrusive link field for threads
-		// must be exactly as in $thread
-		__thread_desc_link link;
-	} before, after;
-
-	// Optional statistic counters
-	#if !defined(__CFA_NO_SCHED_STATS__)
-		struct __attribute__((aligned(64))) {
-			// difference between number of push and pops
-			ssize_t diff;
-
-			// total number of pushes and pops
-			size_t  push;
-			size_t  pop ;
-		} stat;
-	#endif
-};
-
-void  ?{}(__intrusive_lane_t & this);
-void ^?{}(__intrusive_lane_t & this);
-
-// Get the head pointer (one before the first element) from the anchor
-static inline $thread * head(const __intrusive_lane_t & this) {
-	$thread * rhead = ($thread *)(
-		(uintptr_t)( &this.before ) - offsetof( $thread, link )
-	);
-	/* paranoid */ verify(rhead);
-	return rhead;
-}
-
-// Get the tail pointer (one after the last element) from the anchor
-static inline $thread * tail(const __intrusive_lane_t & this) {
-	$thread * rtail = ($thread *)(
-		(uintptr_t)( &this.after ) - offsetof( $thread, link )
-	);
-	/* paranoid */ verify(rtail);
-	return rtail;
-}
-
-// Ctor
-void ?{}( __intrusive_lane_t & this ) {
-	this.lock = false;
-
-	this.before.link.prev = 0p;
-	this.before.link.next = tail(this);
-	this.before.link.ts   = 0;
-
-	this.after .link.prev = head(this);
-	this.after .link.next = 0p;
-	this.after .link.ts   = 0;
-
-	#if !defined(__CFA_NO_SCHED_STATS__)
-		this.stat.diff = 0;
-		this.stat.push = 0;
-		this.stat.pop  = 0;
-	#endif
-
-	// We add a boat-load of assertions here because the anchor code is very fragile
-	/* paranoid */ verify(((uintptr_t)( head(this) ) + offsetof( $thread, link )) == (uintptr_t)(&this.before));
-	/* paranoid */ verify(((uintptr_t)( tail(this) ) + offsetof( $thread, link )) == (uintptr_t)(&this.after ));
-	/* paranoid */ verify(head(this)->link.prev == 0p );
-	/* paranoid */ verify(head(this)->link.next == tail(this) );
-	/* paranoid */ verify(tail(this)->link.next == 0p );
-	/* paranoid */ verify(tail(this)->link.prev == head(this) );
-	/* paranoid */ verify(&head(this)->link.prev == &this.before.link.prev );
-	/* paranoid */ verify(&head(this)->link.next == &this.before.link.next );
-	/* paranoid */ verify(&tail(this)->link.prev == &this.after .link.prev );
-	/* paranoid */ verify(&tail(this)->link.next == &this.after .link.next );
-	/* paranoid */ verify(sizeof(__intrusive_lane_t) == 128);
-	/* paranoid */ verify(sizeof(this) == 128);
-	/* paranoid */ verify(__alignof__(__intrusive_lane_t) == 128);
-	/* paranoid */ verify(__alignof__(this) == 128);
-	/* paranoid */ verifyf(((intptr_t)(&this) % 128) == 0, "Expected address to be aligned %p %% 128 == %zd", &this, ((intptr_t)(&this) % 128));
-}
-
-// Dtor is trivial
-void ^?{}( __intrusive_lane_t & this ) {
-	// Make sure the list is empty
-	/* paranoid */ verify(head(this)->link.prev == 0p );
-	/* paranoid */ verify(head(this)->link.next == tail(this) );
-	/* paranoid */ verify(tail(this)->link.next == 0p );
-	/* paranoid */ verify(tail(this)->link.prev == head(this) );
-}
-
-// Push a thread onto this lane
-// returns true of lane was empty before push, false otherwise
-bool push(__intrusive_lane_t & this, $thread * node) {
-	#if defined(__CFA_WITH_VERIFY__)
-		/* paranoid */ verify(this.lock);
-		/* paranoid */ verify(node->link.ts != 0);
-		/* paranoid */ verify(node->link.next == 0p);
-		/* paranoid */ verify(node->link.prev == 0p);
-		/* paranoid */ verify(tail(this)->link.next == 0p);
-		/* paranoid */ verify(head(this)->link.prev == 0p);
-
-		if(this.before.link.ts == 0l) {
-			/* paranoid */ verify(tail(this)->link.prev == head(this));
-			/* paranoid */ verify(head(this)->link.next == tail(this));
-		} else {
-			/* paranoid */ verify(tail(this)->link.prev != head(this));
-			/* paranoid */ verify(head(this)->link.next != tail(this));
-		}
-	#endif
-
-	// Get the relevant nodes locally
-	$thread * tail = tail(this);
-	$thread * prev = tail->link.prev;
-
-	// Do the push
-	node->link.next = tail;
-	node->link.prev = prev;
-	prev->link.next = node;
-	tail->link.prev = node;
-
-	// Update stats
-	#if !defined(__CFA_NO_SCHED_STATS__)
-		this.stat.diff++;
-		this.stat.push++;
-	#endif
-
-	verify(node->link.next == tail(this));
-
-	// Check if the queue used to be empty
-	if(this.before.link.ts == 0l) {
-		this.before.link.ts = node->link.ts;
-		/* paranoid */ verify(node->link.prev == head(this));
-		return true;
-	}
-	return false;
-}
-
-// Pop a thread from this lane (must be non-empty)
-// returns popped
-// returns true of lane was empty before push, false otherwise
-[$thread *, bool] pop(__intrusive_lane_t & this) {
-	/* paranoid */ verify(this.lock);
-	/* paranoid */ verify(this.before.link.ts != 0ul);
-
-	// Get anchors locally
-	$thread * head = head(this);
-	$thread * tail = tail(this);
-
-	// Get the relevant nodes locally
-	$thread * node = head->link.next;
-	$thread * next = node->link.next;
-
-	/* paranoid */ verify(node != tail);
-	/* paranoid */ verify(node);
-
-	// Do the pop
-	head->link.next = next;
-	next->link.prev = head;
-	node->link.[next, prev] = 0p;
-
-	// Update head time stamp
-	this.before.link.ts = next->link.ts;
-
-	// Update stats
-	#ifndef __CFA_NO_SCHED_STATS__
-		this.stat.diff--;
-		this.stat.pop ++;
-	#endif
-
-	// Check if we emptied list and return accordingly
-	/* paranoid */ verify(tail(this)->link.next == 0p);
-	/* paranoid */ verify(head(this)->link.prev == 0p);
-	if(next == tail) {
-		/* paranoid */ verify(this.before.link.ts == 0);
-		/* paranoid */ verify(tail(this)->link.prev == head(this));
-		/* paranoid */ verify(head(this)->link.next == tail(this));
-		return [node, true];
-	}
-	else {
-		/* paranoid */ verify(next->link.ts != 0);
-		/* paranoid */ verify(tail(this)->link.prev != head(this));
-		/* paranoid */ verify(head(this)->link.next != tail(this));
-		/* paranoid */ verify(this.before.link.ts != 0);
-		return [node, false];
-	}
-}
-
-// Check whether or not list is empty
-static inline bool is_empty(__intrusive_lane_t & this) {
-	// Cannot verify here since it may not be locked
-	return this.before.link.ts == 0;
-}
-
-// Return the timestamp
-static inline unsigned long long ts(__intrusive_lane_t & this) {
-	// Cannot verify here since it may not be locked
-	return this.before.link.ts;
-}
-
-//=======================================================================
-// Scalable Non-Zero counter
-//=======================================================================
-
-union __snzi_val_t {
-	uint64_t _all;
-	struct __attribute__((packed)) {
-		char cnt;
-		uint64_t ver:56;
-	};
-};
-
-bool cas(volatile __snzi_val_t & self, __snzi_val_t & exp, char _cnt, uint64_t _ver) {
-	__snzi_val_t t;
-	t.ver = _ver;
-	t.cnt = _cnt;
-	/* paranoid */ verify(t._all == ((_ver << 8) | ((unsigned char)_cnt)));
-	return __atomic_compare_exchange_n(&self._all, &exp._all, t._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
-}
-
-bool cas(volatile __snzi_val_t & self, __snzi_val_t & exp, const __snzi_val_t & tar) {
-	return __atomic_compare_exchange_n(&self._all, &exp._all, tar._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
-}
-
-void ?{}( __snzi_val_t & this ) { this._all = 0; }
-void ?{}( __snzi_val_t & this, const volatile __snzi_val_t & o) { this._all = o._all; }
-
-struct __attribute__((aligned(128))) __snzi_node_t {
-	volatile __snzi_val_t value;
-	struct __snzi_node_t * parent;
-	bool is_root;
-};
-
-static inline void arrive( __snzi_node_t & );
-static inline void depart( __snzi_node_t & );
-
-#define __snzi_half -1
-
-//--------------------------------------------------
-// Root node
-static void arrive_r( __snzi_node_t & this ) {
-	/* paranoid */ verify( this.is_root );
-	__atomic_fetch_add(&this.value._all, 1, __ATOMIC_SEQ_CST);
-}
-
-static void depart_r( __snzi_node_t & this ) {
-	/* paranoid */ verify( this.is_root );
-	__atomic_fetch_sub(&this.value._all, 1, __ATOMIC_SEQ_CST);
-}
-
-//--------------------------------------------------
-// Hierarchical node
-static void arrive_h( __snzi_node_t & this ) {
-	int undoArr = 0;
-	bool success = false;
-	while(!success) {
-		__snzi_val_t x = { this.value };
-		/* paranoid */ verify(x.cnt <= 120);
-		if( x.cnt >= 1 ) {
-			if( cas( this.value, x, x.cnt + 1, x.ver ) ) {
-				success = true;
-			}
-		}
-		/* paranoid */ verify(x.cnt <= 120);
-		if( x.cnt == 0 ) {
-			if( cas( this.value, x, __snzi_half, x.ver + 1) ) {
-				success = true;
-				x.cnt = __snzi_half;
-				x.ver = x.ver + 1;
-			}
-		}
-		/* paranoid */ verify(x.cnt <= 120);
-		if( x.cnt == __snzi_half ) {
-			/* paranoid */ verify( this.parent);
-			arrive( *this.parent );
-			if( !cas( this.value, x, 1, x.ver) ) {
-				undoArr = undoArr + 1;
-			}
-		}
-	}
-
-	for(int i = 0; i < undoArr; i++) {
-		/* paranoid */ verify( this.parent );
-		depart( *this.parent );
-	}
-}
-
-static void depart_h( __snzi_node_t & this ) {
-	while(true) {
-		const __snzi_val_t x = { this.value };
-		/* paranoid */ verifyf(x.cnt >= 1, "%d", x.cnt);
-		if( cas( this.value, x, x.cnt - 1, x.ver ) ) {
-			if( x.cnt == 1 ) {
-				/* paranoid */ verify( this.parent );
-				depart( *this.parent );
-			}
-			return;
-		}
-	}
-}
-
-//--------------------------------------------------
-// All nodes
-static inline void arrive( __snzi_node_t & this ) {
-	if(this.is_root) arrive_r( this );
-	else arrive_h( this );
-}
-
-static inline void depart( __snzi_node_t & this ) {
-	if(this.is_root) depart_r( this );
-	else depart_h( this );
-}
-
-static inline bool query( __snzi_node_t & this ) {
-	/* paranoid */ verify( this.is_root );
-	return this.value._all > 0;
-}
-
-//--------------------------------------------------
-// SNZI object
-void  ?{}( __snzi_t & this, unsigned depth ) with( this ) {
-	mask = (1 << depth) - 1;
-	root = (1 << (depth + 1)) - 2;
-	nodes = alloc( root + 1 );
-
-	int width = 1 << depth;
-	for(int i = 0; i < root; i++) {
-		nodes[i].value._all = 0;
-		nodes[i].parent = &nodes[(i / 2) + width ];
-		nodes[i].is_root = false;
-	}
-
-	nodes[ root ].value._all = 0;
-	nodes[ root ].parent = 0p;
-	nodes[ root ].is_root = true;
-}
-
-void ^?{}( __snzi_t & this ) {
-	free( this.nodes );
-}
-
-static inline void arrive( __snzi_t & this, int idx) {
-	idx &= this.mask;
-	arrive( this.nodes[idx] );
-}
-
-static inline void depart( __snzi_t & this, int idx) {
-	idx &= this.mask;
-	depart( this.nodes[idx] );
-}
-
-static inline bool query( const __snzi_t & this ) {
-	return query( this.nodes[ this.root ] );
-}
-
-//=======================================================================
-// Cforall Reqdy Queue used by ready queue
-//=======================================================================
-
 void ?{}(__ready_queue_t & this) with (this) {
 
@@ -584,8 +227,13 @@
 			unsigned rlow  = r % BIAS;
 			unsigned rhigh = r / BIAS;
-			if(0 != (rlow % BIAS) && kernelTLS.this_processor) {
+			if((0 != rlow) && kernelTLS.this_processor) {
 				// (BIAS - 1) out of BIAS chances
 				// Use perferred queues
-				i = (kernelTLS.this_processor->id * 4) + (rhigh % 4);
+				unsigned pid = kernelTLS.this_processor->id * 4;
+				i = pid + (rhigh % 4);
+
+				#if !defined(__CFA_NO_STATISTICS__)
+					__tls_stats()->ready.pick.push.local++;
+				#endif
 			}
 			else {
@@ -635,7 +283,63 @@
 }
 
+static struct $thread * try_pop(struct cluster * cltr, unsigned i, unsigned j);
+static struct $thread * try_pop(struct cluster * cltr, unsigned i);
+
+// Pop from the ready queue of a given cluster
+__attribute__((hot)) $thread * pop(struct cluster * cltr) with (cltr->ready_queue) {
+	/* paranoid */ verify( lanes.count > 0 );
+	#if defined(BIAS)
+		// Don't bother trying locally too much
+		int local_tries = 8;
+	#endif
+
+	// As long as the list is not empty, try finding a lane that isn't empty and pop from it
+	while( query(snzi) ) {
+		// Pick two lists at random
+		unsigned i,j;
+		#if defined(BIAS)
+			uint64_t r = __tls_rand();
+			unsigned rlow  = r % BIAS;
+			uint64_t rhigh = r / BIAS;
+			if(local_tries && 0 != rlow) {
+				// (BIAS - 1) out of BIAS chances
+				// Use preferred queues
+				unsigned pid = kernelTLS.this_processor->id * 4;
+				i = pid + (rhigh % 4);
+				j = pid + ((rhigh >> 32ull) % 4);
+
+				// count the tries
+				local_tries--;
+
+				#if !defined(__CFA_NO_STATISTICS__)
+					__tls_stats()->ready.pick.pop.local++;
+				#endif
+			}
+			else {
+				// 1 out of BIAS chances
+				// Use all queues
+				i = rhigh;
+				j = rhigh >> 32ull;
+			}
+		#else
+			i = __tls_rand();
+			j = __tls_rand();
+		#endif
+
+		i %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+		j %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+
+		// try popping from the 2 picked lists
+		struct $thread * thrd = try_pop(cltr, i, j);
+		if(thrd) return thrd;
+	}
+
+	// All lanes were empty, return 0p
+	return 0p;
+}
+
 //-----------------------------------------------------------------------
 // Given 2 indexes, pick the list with the oldest push an try to pop from it
-static struct $thread * try_pop(struct cluster * cltr, unsigned i, unsigned j) with (cltr->ready_queue) {
+static inline struct $thread * try_pop(struct cluster * cltr, unsigned i, unsigned j) with (cltr->ready_queue) {
 	#if !defined(__CFA_NO_STATISTICS__)
 		__tls_stats()->ready.pick.pop.attempt++;
@@ -648,4 +352,8 @@
 	}
 
+	return try_pop(cltr, w);
+}
+
+static inline struct $thread * try_pop(struct cluster * cltr, unsigned w) with (cltr->ready_queue) {
 	// Get relevant elements locally
 	__intrusive_lane_t & lane = lanes.data[w];
@@ -688,38 +396,30 @@
 	return thrd;
 }
-
-// Pop from the ready queue from a given cluster
-__attribute__((hot)) $thread * pop(struct cluster * cltr) with (cltr->ready_queue) {
-	/* paranoid */ verify( lanes.count > 0 );
-
-	// As long as the list is not empty, try finding a lane that isn't empty and pop from it
-	while( query(snzi) ) {
-		// Pick two lists at random
-		#if defined(BIAS)
-			unsigned i = __tls_rand();
-			unsigned j = __tls_rand();
-
-			if(0 == (i % BIAS)) {
-				i = i / BIAS;
-			}
-			else {
-				i = ((kernelTLS.this_processor->id * 4) + ((i / BIAS) % 4));
-				j = ((kernelTLS.this_processor->id * 4) + ((j / BIAS) % 4));
-			}
-		#else
-			unsigned i = __tls_rand();
-			unsigned j = __tls_rand();
-		#endif
-
-		i %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
- 		j %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
-
-		// try popping from the 2 picked lists
-		struct $thread * thrd = try_pop(cltr, i, j);
-		if(thrd) return thrd;
-	}
-
-	// All lanes where empty return 0p
-	return 0p;
+//-----------------------------------------------------------------------
+
+bool remove_head(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) {
+	for(i; lanes.count) {
+		__intrusive_lane_t & lane = lanes.data[i];
+
+		bool removed = false;
+
+		__atomic_acquire(&lane.lock);
+			if(head(lane)->link.next == thrd) {
+				$thread * pthrd;
+				bool emptied;
+				[pthrd, emptied] = pop(lane);
+
+				/* paranoid */ verify( pthrd == thrd );
+
+				removed = true;
+				if(emptied) {
+					depart( snzi, i );
+				}
+			}
+		__atomic_unlock(&lane.lock);
+
+		if( removed ) return true;
+	}
+	return false;
 }
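The push/pop hunks above bias lane selection: with BIAS now 8, roughly 7 out of 8 picks go to one of the 4 lanes owned by the calling processor, and 1 out of 8 falls back to the whole lane array. A self-contained C sketch of that selection (not part of the patch; proc_id, lane_count and the hit counters are illustrative names):

	#include <stdint.h>
	#include <stdio.h>

	#define BIAS 8   /* matches the new value above */

	/* With probability (BIAS-1)/BIAS pick one of the 4 lanes owned by this
	   processor, otherwise pick uniformly over all lanes. */
	static unsigned pick_lane( uint64_t r, unsigned proc_id, unsigned lane_count ) {
		unsigned rlow  = r % BIAS;
		uint64_t rhigh = r / BIAS;
		if ( rlow != 0 )
			return (proc_id * 4 + (unsigned)(rhigh % 4)) % lane_count;  /* preferred lanes */
		return (unsigned)(rhigh % lane_count);                          /* global fallback */
	}

	int main(void) {
		unsigned hits[16] = { 0 };
		uint64_t r = 0x9e3779b97f4a7c15ULL;
		for ( int i = 0; i < 100000; i++ ) {
			r ^= r << 13; r ^= r >> 7; r ^= r << 17;   /* xorshift64 step, as in random.hfa */
			hits[ pick_lane( r, 1, 16 ) ]++;
		}
		for ( int i = 0; i < 16; i++ )
			printf( "lane %2d: %u\n", i, hits[i] );
		return 0;
	}

With proc_id = 1 and 16 lanes, lanes 4 through 7 should each collect close to 22% of the picks and the remaining lanes under 1% each.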
 
Index: libcfa/src/concurrency/ready_subqueue.hfa
===================================================================
--- libcfa/src/concurrency/ready_subqueue.hfa	(revision 13c5e19865b1e243b856f4a84487f29cc918f162)
+++ libcfa/src/concurrency/ready_subqueue.hfa	(revision 13c5e19865b1e243b856f4a84487f29cc918f162)
@@ -0,0 +1,203 @@
+#pragma once
+
+#define __CFA_NO_SCHED_STATS__
+
+// Intrusive lanes used by the relaxed ready queue
+struct __attribute__((aligned(128))) __intrusive_lane_t {
+
+	// anchor for the head and the tail of the queue
+	__attribute__((aligned(128))) struct __sentinel_t {
+		// Linked-list fields
+		// intrusive link field for threads
+		// must be exactly as in $thread
+		__thread_desc_link link;
+	} before, after;
+
+	// spin lock protecting the queue
+	volatile bool lock;
+
+	// Optional statistic counters
+	#if !defined(__CFA_NO_SCHED_STATS__)
+		struct __attribute__((aligned(64))) {
+			// difference between number of push and pops
+			ssize_t diff;
+
+			// total number of pushes and pops
+			size_t  push;
+			size_t  pop ;
+		} stat;
+	#endif
+};
+
+void  ?{}(__intrusive_lane_t & this);
+void ^?{}(__intrusive_lane_t & this);
+
+// Get the head pointer (one before the first element) from the anchor
+static inline $thread * head(const __intrusive_lane_t & this) {
+	$thread * rhead = ($thread *)(
+		(uintptr_t)( &this.before ) - offsetof( $thread, link )
+	);
+	/* paranoid */ verify(rhead);
+	return rhead;
+}
+
+// Get the tail pointer (one after the last element) from the anchor
+static inline $thread * tail(const __intrusive_lane_t & this) {
+	$thread * rtail = ($thread *)(
+		(uintptr_t)( &this.after ) - offsetof( $thread, link )
+	);
+	/* paranoid */ verify(rtail);
+	return rtail;
+}
+
+// Ctor
+void ?{}( __intrusive_lane_t & this ) {
+	this.lock = false;
+
+	this.before.link.prev = 0p;
+	this.before.link.next = tail(this);
+	this.before.link.ts   = 0;
+
+	this.after .link.prev = head(this);
+	this.after .link.next = 0p;
+	this.after .link.ts   = 0;
+
+	#if !defined(__CFA_NO_SCHED_STATS__)
+		this.stat.diff = 0;
+		this.stat.push = 0;
+		this.stat.pop  = 0;
+	#endif
+
+	// We add a boat-load of assertions here because the anchor code is very fragile
+	/* paranoid */ verify(((uintptr_t)( head(this) ) + offsetof( $thread, link )) == (uintptr_t)(&this.before));
+	/* paranoid */ verify(((uintptr_t)( tail(this) ) + offsetof( $thread, link )) == (uintptr_t)(&this.after ));
+	/* paranoid */ verify(head(this)->link.prev == 0p );
+	/* paranoid */ verify(head(this)->link.next == tail(this) );
+	/* paranoid */ verify(tail(this)->link.next == 0p );
+	/* paranoid */ verify(tail(this)->link.prev == head(this) );
+	/* paranoid */ verify(&head(this)->link.prev == &this.before.link.prev );
+	/* paranoid */ verify(&head(this)->link.next == &this.before.link.next );
+	/* paranoid */ verify(&tail(this)->link.prev == &this.after .link.prev );
+	/* paranoid */ verify(&tail(this)->link.next == &this.after .link.next );
+	/* paranoid */ verify(__alignof__(__intrusive_lane_t) == 128);
+	/* paranoid */ verify(__alignof__(this) == 128);
+	/* paranoid */ verifyf(((intptr_t)(&this) % 128) == 0, "Expected address to be aligned %p %% 128 == %zd", &this, ((intptr_t)(&this) % 128));
+}
+
+// Dtor is trivial
+void ^?{}( __intrusive_lane_t & this ) {
+	// Make sure the list is empty
+	/* paranoid */ verify(head(this)->link.prev == 0p );
+	/* paranoid */ verify(head(this)->link.next == tail(this) );
+	/* paranoid */ verify(tail(this)->link.next == 0p );
+	/* paranoid */ verify(tail(this)->link.prev == head(this) );
+}
+
+// Push a thread onto this lane
+// returns true if the lane was empty before the push, false otherwise
+bool push(__intrusive_lane_t & this, $thread * node) {
+	#if defined(__CFA_WITH_VERIFY__)
+		/* paranoid */ verify(this.lock);
+		/* paranoid */ verify(node->link.ts != 0);
+		/* paranoid */ verify(node->link.next == 0p);
+		/* paranoid */ verify(node->link.prev == 0p);
+		/* paranoid */ verify(tail(this)->link.next == 0p);
+		/* paranoid */ verify(head(this)->link.prev == 0p);
+
+		if(this.before.link.ts == 0l) {
+			/* paranoid */ verify(tail(this)->link.prev == head(this));
+			/* paranoid */ verify(head(this)->link.next == tail(this));
+		} else {
+			/* paranoid */ verify(tail(this)->link.prev != head(this));
+			/* paranoid */ verify(head(this)->link.next != tail(this));
+		}
+	#endif
+
+	// Get the relevant nodes locally
+	$thread * tail = tail(this);
+	$thread * prev = tail->link.prev;
+
+	// Do the push
+	node->link.next = tail;
+	node->link.prev = prev;
+	prev->link.next = node;
+	tail->link.prev = node;
+
+	// Update stats
+	#if !defined(__CFA_NO_SCHED_STATS__)
+		this.stat.diff++;
+		this.stat.push++;
+	#endif
+
+	verify(node->link.next == tail(this));
+
+	// Check if the queue used to be empty
+	if(this.before.link.ts == 0l) {
+		this.before.link.ts = node->link.ts;
+		/* paranoid */ verify(node->link.prev == head(this));
+		return true;
+	}
+	return false;
+}
+
+// Pop a thread from this lane (must be non-empty)
+// returns the popped thread
+// returns true if the lane is empty after the pop, false otherwise
+[$thread *, bool] pop(__intrusive_lane_t & this) {
+	/* paranoid */ verify(this.lock);
+	/* paranoid */ verify(this.before.link.ts != 0ul);
+
+	// Get anchors locally
+	$thread * head = head(this);
+	$thread * tail = tail(this);
+
+	// Get the relevant nodes locally
+	$thread * node = head->link.next;
+	$thread * next = node->link.next;
+
+	/* paranoid */ verify(node != tail);
+	/* paranoid */ verify(node);
+
+	// Do the pop
+	head->link.next = next;
+	next->link.prev = head;
+	node->link.[next, prev] = 0p;
+
+	// Update head time stamp
+	this.before.link.ts = next->link.ts;
+
+	// Update stats
+	#ifndef __CFA_NO_SCHED_STATS__
+		this.stat.diff--;
+		this.stat.pop ++;
+	#endif
+
+	// Check if we emptied list and return accordingly
+	/* paranoid */ verify(tail(this)->link.next == 0p);
+	/* paranoid */ verify(head(this)->link.prev == 0p);
+	if(next == tail) {
+		/* paranoid */ verify(this.before.link.ts == 0);
+		/* paranoid */ verify(tail(this)->link.prev == head(this));
+		/* paranoid */ verify(head(this)->link.next == tail(this));
+		return [node, true];
+	}
+	else {
+		/* paranoid */ verify(next->link.ts != 0);
+		/* paranoid */ verify(tail(this)->link.prev != head(this));
+		/* paranoid */ verify(head(this)->link.next != tail(this));
+		/* paranoid */ verify(this.before.link.ts != 0);
+		return [node, false];
+	}
+}
+
+// Check whether or not list is empty
+static inline bool is_empty(__intrusive_lane_t & this) {
+	// Cannot verify here since it may not be locked
+	return this.before.link.ts == 0;
+}
+
+// Return the timestamp
+static inline unsigned long long ts(__intrusive_lane_t & this) {
+	// Cannot verify here since it may not be locked
+	return this.before.link.ts;
+}
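The head()/tail() helpers above recover fake node pointers from the before/after sentinels by subtracting the offset of the link field, so push and pop never need to special-case an empty list when relinking. A small self-contained C sketch of that sentinel trick (node_t, link_t and lane_t are illustrative names, not the CFA types):

	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	typedef struct node node_t;
	typedef struct { node_t * next, * prev; } link_t;
	struct node { int payload; link_t link; };

	typedef struct {
		link_t before;   /* plays the role of the before sentinel */
		link_t after;    /* plays the role of the after sentinel  */
	} lane_t;

	/* pretend the sentinel link fields belong to full nodes, as head()/tail() do */
	static node_t * lane_head( lane_t * l ) {
		return (node_t *)((uintptr_t)&l->before - offsetof( node_t, link ));
	}
	static node_t * lane_tail( lane_t * l ) {
		return (node_t *)((uintptr_t)&l->after - offsetof( node_t, link ));
	}

	int main(void) {
		lane_t l;
		l.before.prev = NULL;            l.before.next = lane_tail( &l );
		l.after .prev = lane_head( &l ); l.after .next = NULL;

		/* push one node in front of the tail sentinel, like push() above */
		node_t n = { 42, { NULL, NULL } };
		node_t * tail = lane_tail( &l ), * prev = tail->link.prev;
		n.link.next = tail;     n.link.prev = prev;
		prev->link.next = &n;   tail->link.prev = &n;

		printf( "first element: %d\n", lane_head( &l )->link.next->payload );
		return 0;
	}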
Index: libcfa/src/concurrency/snzi.hfa
===================================================================
--- libcfa/src/concurrency/snzi.hfa	(revision 13c5e19865b1e243b856f4a84487f29cc918f162)
+++ libcfa/src/concurrency/snzi.hfa	(revision 13c5e19865b1e243b856f4a84487f29cc918f162)
@@ -0,0 +1,155 @@
+#pragma once
+
+#include <assert.h>
+#include <stdint.h>
+
+union __snzi_val_t {
+	uint64_t _all;
+	struct __attribute__((packed)) {
+		char cnt;
+		uint64_t ver:56;
+	};
+};
+
+bool cas(volatile __snzi_val_t & self, __snzi_val_t & exp, char _cnt, uint64_t _ver) {
+	__snzi_val_t t;
+	t.ver = _ver;
+	t.cnt = _cnt;
+	/* paranoid */ verify(t._all == ((_ver << 8) | ((unsigned char)_cnt)));
+	return __atomic_compare_exchange_n(&self._all, &exp._all, t._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+}
+
+bool cas(volatile __snzi_val_t & self, __snzi_val_t & exp, const __snzi_val_t & tar) {
+	return __atomic_compare_exchange_n(&self._all, &exp._all, tar._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+}
+
+void ?{}( __snzi_val_t & this ) { this._all = 0; }
+void ?{}( __snzi_val_t & this, const volatile __snzi_val_t & o) { this._all = o._all; }
+
+struct __attribute__((aligned(128))) __snzi_node_t {
+	volatile __snzi_val_t value;
+	struct __snzi_node_t * parent;
+	bool is_root;
+};
+
+static inline void arrive( __snzi_node_t & );
+static inline void depart( __snzi_node_t & );
+
+#define __snzi_half -1
+
+//--------------------------------------------------
+// Root node
+static void arrive_r( __snzi_node_t & this ) {
+	/* paranoid */ verify( this.is_root );
+	__atomic_fetch_add(&this.value._all, 1, __ATOMIC_SEQ_CST);
+}
+
+static void depart_r( __snzi_node_t & this ) {
+	/* paranoid */ verify( this.is_root );
+	__atomic_fetch_sub(&this.value._all, 1, __ATOMIC_SEQ_CST);
+}
+
+//--------------------------------------------------
+// Hierarchical node
+static void arrive_h( __snzi_node_t & this ) {
+	int undoArr = 0;
+	bool success = false;
+	while(!success) {
+		__snzi_val_t x = { this.value };
+		/* paranoid */ verify(x.cnt <= 120);
+		if( x.cnt >= 1 ) {
+			if( cas( this.value, x, x.cnt + 1, x.ver ) ) {
+				success = true;
+			}
+		}
+		/* paranoid */ verify(x.cnt <= 120);
+		if( x.cnt == 0 ) {
+			if( cas( this.value, x, __snzi_half, x.ver + 1) ) {
+				success = true;
+				x.cnt = __snzi_half;
+				x.ver = x.ver + 1;
+			}
+		}
+		/* paranoid */ verify(x.cnt <= 120);
+		if( x.cnt == __snzi_half ) {
+			/* paranoid */ verify( this.parent);
+			arrive( *this.parent );
+			if( !cas( this.value, x, 1, x.ver) ) {
+				undoArr = undoArr + 1;
+			}
+		}
+	}
+
+	for(int i = 0; i < undoArr; i++) {
+		/* paranoid */ verify( this.parent );
+		depart( *this.parent );
+	}
+}
+
+static void depart_h( __snzi_node_t & this ) {
+	while(true) {
+		const __snzi_val_t x = { this.value };
+		/* paranoid */ verifyf(x.cnt >= 1, "%d", x.cnt);
+		if( cas( this.value, x, x.cnt - 1, x.ver ) ) {
+			if( x.cnt == 1 ) {
+				/* paranoid */ verify( this.parent );
+				depart( *this.parent );
+			}
+			return;
+		}
+	}
+}
+
+//--------------------------------------------------
+// All nodes
+static inline void arrive( __snzi_node_t & this ) {
+	if(this.is_root) arrive_r( this );
+	else arrive_h( this );
+}
+
+static inline void depart( __snzi_node_t & this ) {
+	if(this.is_root) depart_r( this );
+	else depart_h( this );
+}
+
+static inline bool query( __snzi_node_t & this ) {
+	/* paranoid */ verify( this.is_root );
+	return this.value._all > 0;
+}
+
+//--------------------------------------------------
+// SNZI object
+void  ?{}( __snzi_t & this, unsigned depth ) with( this ) {
+	mask = (1 << depth) - 1;
+	root = (1 << (depth + 1)) - 2;
+	nodes = alloc( root + 1 );
+
+	int width = 1 << depth;
+	for(int i = 0; i < root; i++) {
+		nodes[i].value._all = 0;
+		nodes[i].parent = &nodes[(i / 2) + width ];
+		nodes[i].is_root = false;
+	}
+
+	nodes[ root ].value._all = 0;
+	nodes[ root ].parent = 0p;
+	nodes[ root ].is_root = true;
+}
+
+void ^?{}( __snzi_t & this ) {
+	free( this.nodes );
+}
+
+static inline void arrive( __snzi_t & this, int idx) {
+	idx &= this.mask;
+	arrive( this.nodes[idx] );
+}
+
+static inline void depart( __snzi_t & this, int idx) {
+	idx &= this.mask;
+	depart( this.nodes[idx] );
+}
+
+static inline bool query( const __snzi_t & this ) {
+	return query( this.nodes[ this.root ] );
+}
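The constructor above lays the SNZI out as a complete binary tree in a single array: lane indexes hash onto the 2^depth leaves through mask, interior nodes follow, and the last slot is the root that query() reads. A tiny C sketch (not part of the patch) that only reproduces that layout, for depth = 2:

	#include <stdio.h>

	int main(void) {
		unsigned depth = 2;
		unsigned mask  = (1u << depth) - 1;         /* 3: leaf index = lane & mask */
		unsigned root  = (1u << (depth + 1)) - 2;   /* 6: last node in the array   */
		unsigned width = 1u << depth;               /* 4 leaves                    */

		for ( unsigned i = 0; i < root; i++ )
			printf( "node %u -> parent %u\n", i, (i / 2) + width );
		printf( "node %u is the root (mask = %u)\n", root, mask );
		return 0;
	}

The ready-queue pop loop above uses query(snzi) as its "anything to pop?" test and calls depart(snzi, i) when a lane drains, so emptiness is tracked without scanning every lane.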
Index: libcfa/src/concurrency/stats.cfa
===================================================================
--- libcfa/src/concurrency/stats.cfa	(revision b232745aa346d2a551cb84ea589c6e3eb116ad6a)
+++ libcfa/src/concurrency/stats.cfa	(revision 13c5e19865b1e243b856f4a84487f29cc918f162)
@@ -12,4 +12,5 @@
 		stats->ready.pick.pop .probe   = 0;
 		stats->ready.pick.pop .attempt = 0;
+		stats->ready.pick.pop .local   = 0;
 		stats->ready.pick.pop .success = 0;
 		stats->ready.sleep.halts   = 0;
@@ -37,7 +38,9 @@
 	void __tally_stats( struct __stats_t * cltr, struct __stats_t * proc ) {
 		__atomic_fetch_add( &cltr->ready.pick.push.attempt, proc->ready.pick.push.attempt, __ATOMIC_SEQ_CST );
+		__atomic_fetch_add( &cltr->ready.pick.push.local  , proc->ready.pick.push.local  , __ATOMIC_SEQ_CST );
 		__atomic_fetch_add( &cltr->ready.pick.push.success, proc->ready.pick.push.success, __ATOMIC_SEQ_CST );
 		__atomic_fetch_add( &cltr->ready.pick.pop .probe  , proc->ready.pick.pop .probe  , __ATOMIC_SEQ_CST );
 		__atomic_fetch_add( &cltr->ready.pick.pop .attempt, proc->ready.pick.pop .attempt, __ATOMIC_SEQ_CST );
+		__atomic_fetch_add( &cltr->ready.pick.pop .local  , proc->ready.pick.pop .local  , __ATOMIC_SEQ_CST );
 		__atomic_fetch_add( &cltr->ready.pick.pop .success, proc->ready.pick.pop .success, __ATOMIC_SEQ_CST );
 		__atomic_fetch_add( &cltr->ready.sleep.halts  , proc->ready.sleep.halts  , __ATOMIC_SEQ_CST );
@@ -63,29 +66,29 @@
 	}
 
-	void __print_stats( struct __stats_t * stats ) {
+	void __print_stats( struct __stats_t * stats ) with( *stats ) {
 
-		double push_sur = (100.0 * ((double)stats->ready.pick.push.success) / stats->ready.pick.push.attempt);
-		double pop_sur  = (100.0 * ((double)stats->ready.pick.pop .success) / stats->ready.pick.pop .attempt);
+		double push_sur = (100.0 * ((double)ready.pick.push.success) / ready.pick.push.attempt);
+		double pop_sur  = (100.0 * ((double)ready.pick.pop .success) / ready.pick.pop .attempt);
 
-		double push_len = ((double)stats->ready.pick.push.attempt) / stats->ready.pick.push.success;
-		double pop_len  = ((double)stats->ready.pick.pop .attempt) / stats->ready.pick.pop .success;
+		double push_len = ((double)ready.pick.push.attempt) / ready.pick.push.success;
+		double pop_len  = ((double)ready.pick.pop .attempt) / ready.pick.pop .success;
 
 		#if defined(HAVE_LINUX_IO_URING_H)
-			double avgrdy = ((double)submit_avg.rdy) / submit_avg.cnt;
-			double avgcsm = ((double)submit_avg.csm) / submit_avg.cnt;
-			double avgavl = ((double)submit_avg.avl) / submit_avg.cnt;
+			double avgrdy = ((double)io.submit_q.submit_avg.rdy) / io.submit_q.submit_avg.cnt;
+			double avgcsm = ((double)io.submit_q.submit_avg.csm) / io.submit_q.submit_avg.cnt;
+			double avgavl = ((double)io.submit_q.submit_avg.avl) / io.submit_q.submit_avg.cnt;
 
 			double lavgv = 0;
 			double lavgb = 0;
-			if(look_avg.cnt != 0) {
-				lavgv = ((double)look_avg.val  ) / look_avg.cnt;
-				lavgb = ((double)look_avg.block) / look_avg.cnt;
+			if(io.submit_q.look_avg.cnt != 0) {
+				lavgv = ((double)io.submit_q.look_avg.val  ) / io.submit_q.look_avg.cnt;
+				lavgb = ((double)io.submit_q.look_avg.block) / io.submit_q.look_avg.cnt;
 			}
 
 			double aavgv = 0;
 			double aavgb = 0;
-			if(alloc_avg.cnt != 0) {
-				aavgv = ((double)alloc_avg.val  ) / alloc_avg.cnt;
-				aavgb = ((double)alloc_avg.block) / alloc_avg.cnt;
+			if(io.submit_q.alloc_avg.cnt != 0) {
+				aavgv = ((double)io.submit_q.alloc_avg.val  ) / io.submit_q.alloc_avg.cnt;
+				aavgb = ((double)io.submit_q.alloc_avg.block) / io.submit_q.alloc_avg.cnt;
 			}
 		#endif
@@ -95,6 +98,6 @@
 			"- total threads run      : %'15lu\n"
 			"- total threads scheduled: %'15lu\n"
-			"- push average probe len : %'18.2lf, %'18.2lf%% (%'15lu attempts)\n"
-			"- pop  average probe len : %'18.2lf, %'18.2lf%% (%'15lu attempts)\n"
+			"- push average probe len : %'18.2lf, %'18.2lf%% (%'15lu attempts, %'15lu local)\n"
+			"- pop  average probe len : %'18.2lf, %'18.2lf%% (%'15lu attempts, %'15lu local)\n"
 			"- Idle Sleep -\n"
 			"-- halts                 : %'15lu\n"
@@ -105,36 +108,36 @@
 				"\n"
 				"----- I/O Stats -----\n"
-				"- total submit calls     : %'15llu\n"
+				"- total submit calls     : %'15lu\n"
 				"- avg ready entries      : %'18.2lf\n"
 				"- avg submitted entries  : %'18.2lf\n"
 				"- avg available entries  : %'18.2lf\n"
-				"- total ready search     : %'15llu\n"
+				"- total ready search     : %'15lu\n"
 				"- avg ready search len   : %'18.2lf\n"
 				"- avg ready search block : %'18.2lf\n"
-				"- total alloc search     : %'15llu\n"
+				"- total alloc search     : %'15lu\n"
 				"- avg alloc search len   : %'18.2lf\n"
 				"- avg alloc search block : %'18.2lf\n"
-				"- total wait calls       : %'15llu   (%'llu slow, %'llu fast)\n"
+				"- total wait calls       : %'15lu   (%'lu slow, %'lu fast)\n"
 				"- avg completion/wait    : %'18.2lf\n"
 			#endif
-			, stats->ready.pick.pop.success
-			, stats->ready.pick.push.success
-			, push_len, push_sur, stats->ready.pick.push.attempt
-			, pop_len , pop_sur , stats->ready.pick.pop .attempt
-			, stats->ready.sleep.halts, stats->ready.sleep.cancels, stats->ready.sleep.wakes, stats->ready.sleep.exits
+			, ready.pick.pop.success
+			, ready.pick.push.success
+			, push_len, push_sur, ready.pick.push.attempt, ready.pick.push.local
+			, pop_len , pop_sur , ready.pick.pop .attempt, ready.pick.pop .local
+			, ready.sleep.halts, ready.sleep.cancels, ready.sleep.wakes, ready.sleep.exits
 			#if defined(HAVE_LINUX_IO_URING_H)
-				, submit_avg.cnt
+				, io.submit_q.submit_avg.cnt
 				, avgrdy
 				, avgcsm
 				, avgavl
-				, look_avg.cnt
+				, io.submit_q.look_avg.cnt
 				, lavgv
 				, lavgb
-				, alloc_avg.cnt
+				, io.submit_q.alloc_avg.cnt
 				, aavgv
 				, aavgb
-				, completed_avg.slow_cnt + completed_avg.fast_cnt
-				, completed_avg.slow_cnt,  completed_avg.fast_cnt
-				, ((double)completed_avg.val) / (completed_avg.slow_cnt + completed_avg.fast_cnt)
+				, io.complete_q.completed_avg.slow_cnt + io.complete_q.completed_avg.fast_cnt
+				, io.complete_q.completed_avg.slow_cnt,  io.complete_q.completed_avg.fast_cnt
+				, ((double)io.complete_q.completed_avg.val) / (io.complete_q.completed_avg.slow_cnt + io.complete_q.completed_avg.fast_cnt)
 			#endif
 		);
Index: libcfa/src/concurrency/stats.hfa
===================================================================
--- libcfa/src/concurrency/stats.hfa	(revision b232745aa346d2a551cb84ea589c6e3eb116ad6a)
+++ libcfa/src/concurrency/stats.hfa	(revision 13c5e19865b1e243b856f4a84487f29cc918f162)
@@ -16,4 +16,7 @@
 				volatile uint64_t attempt;
 
+				// number of attempts at pushing to preferred queues
+				volatile uint64_t local;
+
 				// number of successes at pushing
 				volatile uint64_t success;
@@ -29,4 +32,7 @@
 				// number of attemps at poping something
 				volatile uint64_t attempt;
+
+				// number of attempts at popping from preferred queues
+				volatile uint64_t local;
 
 				// number of successes at poping
@@ -64,7 +70,7 @@
 			struct {
 				struct {
-					unsigned long long int val;
-					unsigned long long int slow_cnt;
-					unsigned long long int fast_cnt;
+					volatile uint64_t val;
+					volatile uint64_t slow_cnt;
+					volatile uint64_t fast_cnt;
 				} completed_avg;
 			} complete_q;
