Index: libcfa/src/concurrency/alarm.hfa
===================================================================
--- libcfa/src/concurrency/alarm.hfa	(revision 6c144d861e06011d0279338d7cb366cae8d8b84e)
+++ libcfa/src/concurrency/alarm.hfa	(revision 1eb239e4849405e30caa8ad3388f10d0aa7b4b1c)
@@ -23,5 +23,5 @@
 #include "time.hfa"
 
-#include <containers/list.hfa>
+#include "containers/list.hfa"
 
 struct $thread;
Index: libcfa/src/concurrency/io/setup.cfa
===================================================================
--- libcfa/src/concurrency/io/setup.cfa	(revision 6c144d861e06011d0279338d7cb366cae8d8b84e)
+++ libcfa/src/concurrency/io/setup.cfa	(revision 1eb239e4849405e30caa8ad3388f10d0aa7b4b1c)
@@ -228,5 +228,5 @@
 		if( cluster_context ) {
 			cluster & cltr = *thrd.curr_cluster;
-			/* paranoid */ verify( cltr.nprocessors == 0 || &cltr == mainCluster );
+			/* paranoid */ verify( cltr.idles.total == 0 || &cltr == mainCluster );
 			/* paranoid */ verify( !ready_mutate_islocked() );
 
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 6c144d861e06011d0279338d7cb366cae8d8b84e)
+++ libcfa/src/concurrency/kernel.cfa	(revision 1eb239e4849405e30caa8ad3388f10d0aa7b4b1c)
@@ -86,9 +86,12 @@
 // Kernel Scheduling logic
 static $thread * __next_thread(cluster * this);
-static bool __has_next_thread(cluster * this);
+static $thread * __next_thread_slow(cluster * this);
 static void __run_thread(processor * this, $thread * dst);
-static bool __wake_one(struct __processor_id_t * id, cluster * cltr);
-static void __halt(processor * this);
-bool __wake_proc(processor *);
+static void __wake_one(struct __processor_id_t * id, cluster * cltr);
+
+static void push  (__cluster_idles & idles, processor & proc);
+static void remove(__cluster_idles & idles, processor & proc);
+static [unsigned idle, unsigned total, * processor] query( & __cluster_idles idles );
+
 
 //=============================================================================================
@@ -118,27 +121,67 @@
 
 		$thread * readyThread = 0p;
-		for( unsigned int spin_count = 0;; spin_count++ ) {
+		MAIN_LOOP:
+		for() {
 			// Try to get the next thread
 			readyThread = __next_thread( this->cltr );
 
-			// Check if we actually found a thread
-			if( readyThread ) {
-				/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-				/* paranoid */ verifyf( readyThread->state == Ready || readyThread->preempted != __NO_PREEMPTION, "state : %d, preempted %d\n", readyThread->state, readyThread->preempted);
-				/* paranoid */ verifyf( readyThread->link.next == 0p, "Expected null got %p", readyThread->link.next );
-				__builtin_prefetch( readyThread->context.SP );
-
-				// We found a thread run it
-				__run_thread(this, readyThread);
-
-				/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+			if( !readyThread ) {
+				readyThread = __next_thread_slow( this->cltr );
 			}
 
-			if(__atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST)) break;
-
+			HALT:
 			if( !readyThread ) {
-				// Block until a thread is ready
-				__halt(this);
+				// Don't block if we are done
+				if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) break MAIN_LOOP;
+
+				#if !defined(__CFA_NO_STATISTICS__)
+					__tls_stats()->ready.sleep.halts++;
+				#endif
+
+				// Push self to idle stack
+				push(this->cltr->idles, * this);
+
+				// Confirm the ready-queue is empty
+				readyThread = __next_thread_slow( this->cltr );
+				if( readyThread ) {
+					// A thread was found, cancel the halt
+					remove(this->cltr->idles, * this);
+
+					#if !defined(__CFA_NO_STATISTICS__)
+						__tls_stats()->ready.sleep.cancels++;
+					#endif
+
+				// continue the main loop
+					break HALT;
+				}
+
+				#if !defined(__CFA_NO_STATISTICS__)
+					if(this->print_halts) {
+						__cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->id, rdtscl());
+					}
+				#endif
+
+				wait( this->idle );
+
+				#if !defined(__CFA_NO_STATISTICS__)
+					if(this->print_halts) {
+						__cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->id, rdtscl());
+					}
+				#endif
+
+				// We were woken up, remove self from idle
+				remove(this->cltr->idles, * this);
+
+				// DON'T just proceed, start looking again
+				continue MAIN_LOOP;
 			}
+
+			/* paranoid */ verify( readyThread );
+
+			// We found a thread run it
+			__run_thread(this, readyThread);
+
+			// Are we done?
+			if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) break MAIN_LOOP;
 		}
 
@@ -165,4 +208,9 @@
 // from the processor coroutine to the target thread
 static void __run_thread(processor * this, $thread * thrd_dst) {
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verifyf( thrd_dst->state == Ready || thrd_dst->preempted != __NO_PREEMPTION, "state : %d, preempted %d\n", thrd_dst->state, thrd_dst->preempted);
+	/* paranoid */ verifyf( thrd_dst->link.next == 0p, "Expected null got %p", thrd_dst->link.next );
+	__builtin_prefetch( thrd_dst->context.SP );
+
 	$coroutine * proc_cor = get_coroutine(this->runner);
 
@@ -244,4 +292,6 @@
 	proc_cor->state = Active;
 	kernelTLS.this_thread = 0p;
+
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
 }
 
@@ -300,13 +350,5 @@
 	ready_schedule_lock  ( id );
 		push( thrd->curr_cluster, thrd );
-
-		#if !defined(__CFA_NO_STATISTICS__)
-			bool woke =
-		#endif
-			__wake_one(id, thrd->curr_cluster);
-
-		#if !defined(__CFA_NO_STATISTICS__)
-			if(woke) __tls_stats()->ready.sleep.wakes++;
-		#endif
+		__wake_one(id, thrd->curr_cluster);
 	ready_schedule_unlock( id );
 
@@ -315,25 +357,25 @@
 
 // KERNEL ONLY
-static $thread * __next_thread(cluster * this) with( *this ) {
+static inline $thread * __next_thread(cluster * this) with( *this ) {
 	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
 
 	ready_schedule_lock  ( (__processor_id_t*)kernelTLS.this_processor );
-		$thread * head = pop( this );
+		$thread * thrd = pop( this );
 	ready_schedule_unlock( (__processor_id_t*)kernelTLS.this_processor );
 
 	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-	return head;
+	return thrd;
 }
 
 // KERNEL ONLY
-static bool __has_next_thread(cluster * this) with( *this ) {
+static inline $thread * __next_thread_slow(cluster * this) with( *this ) {
 	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
 
 	ready_schedule_lock  ( (__processor_id_t*)kernelTLS.this_processor );
-		bool not_empty = query( this );
+		$thread * thrd = pop_slow( this );
 	ready_schedule_unlock( (__processor_id_t*)kernelTLS.this_processor );
 
 	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-	return not_empty;
+	return thrd;
 }
 
@@ -425,21 +467,32 @@
 //=============================================================================================
 // Wake a thread from the front if there are any
-static bool __wake_one(struct __processor_id_t * id, cluster * this) {
+static void __wake_one(struct __processor_id_t * id, cluster * this) {
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
 	/* paranoid */ verify( ready_schedule_islocked( id ) );
 
 	// Check if there is a sleeping processor
-	processor * p = pop(this->idles);
+	processor * p;
+	unsigned idle;
+	unsigned total;
+	[idle, total, p] = query(this->idles);
 
 	// If no one is sleeping, we are done
-	if( 0p == p ) return false;
+	if( idle == 0 ) return;
 
 	// We found a processor, wake it up
 	post( p->idle );
 
-	return true;
+	#if !defined(__CFA_NO_STATISTICS__)
+		__tls_stats()->ready.sleep.wakes++;
+	#endif
+
+	/* paranoid */ verify( ready_schedule_islocked( id ) );
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+
+	return;
 }
 
 // Unconditionnaly wake a thread
-bool __wake_proc(processor * this) {
+void __wake_proc(processor * this) {
 	__cfadbg_print_safe(runtime_core, "Kernel : waking Processor %p\n", this);
 
@@ -448,42 +501,38 @@
 		bool ret = post( this->idle );
 	enable_interrupts( __cfaabi_dbg_ctx );
-
-	return ret;
-}
-
-static void __halt(processor * this) with( *this ) {
-	if( do_terminate ) return;
-
-	#if !defined(__CFA_NO_STATISTICS__)
-		__tls_stats()->ready.sleep.halts++;
-	#endif
-	// Push self to queue
-	push(cltr->idles, *this);
-
-	// Makre sure we don't miss a thread
-	if( __has_next_thread(cltr) ) {
-		// A thread was posted, make sure a processor is woken up
-		struct __processor_id_t *id = (struct __processor_id_t *) this;
-		ready_schedule_lock  ( id );
-			__wake_one( id, cltr );
-		ready_schedule_unlock( id );
-		#if !defined(__CFA_NO_STATISTICS__)
-			__tls_stats()->ready.sleep.cancels++;
-		#endif
-	}
-
-	#if !defined(__CFA_NO_STATISTICS__)
-		if(this->print_halts) {
-			__cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->id, rdtscl());
-		}
-	#endif
-
-	wait( idle );
-
-	#if !defined(__CFA_NO_STATISTICS__)
-		if(this->print_halts) {
-			__cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->id, rdtscl());
-		}
-	#endif
+}
+
+static void push  (__cluster_idles & this, processor & proc) {
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	lock( this );
+		this.idle++;
+		/* paranoid */ verify( this.idle <= this.total );
+
+		insert_first(this.list, proc);
+	unlock( this );
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+}
+
+static void remove(__cluster_idles & this, processor & proc) {
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	lock( this );
+		this.idle--;
+		/* paranoid */ verify( this.idle >= 0 );
+
+		remove(proc);
+	unlock( this );
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+}
+
+static [unsigned idle, unsigned total, * processor] query( & __cluster_idles this ) {
+	for() {
+		uint64_t l = __atomic_load_n(&this.lock, __ATOMIC_SEQ_CST);
+		if( 1 == (l % 2) ) { Pause(); continue; }
+		unsigned idle    = this.idle;
+		unsigned total   = this.total;
+		processor * proc = &this.list`first;
+		if(l != __atomic_load_n(&this.lock, __ATOMIC_SEQ_CST)) { Pause(); continue; }
+		return [idle, total, proc];
+	}
 }
 
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 6c144d861e06011d0279338d7cb366cae8d8b84e)
+++ libcfa/src/concurrency/kernel.hfa	(revision 1eb239e4849405e30caa8ad3388f10d0aa7b4b1c)
@@ -20,5 +20,5 @@
 #include "coroutine.hfa"
 
-#include "containers/stackLockFree.hfa"
+#include "containers/list.hfa"
 
 extern "C" {
@@ -99,5 +99,5 @@
 
 	// Link lists fields
-	Link(processor) link;
+	DLISTED_MGD_IMPL_IN(processor)
 
 	#if !defined(__CFA_NO_STATISTICS__)
@@ -119,5 +119,5 @@
 static inline void  ?{}(processor & this, const char name[]) { this{name, *mainCluster }; }
 
-static inline Link(processor) * ?`next( processor * this ) { return &this->link; }
+DLISTED_MGD_IMPL_OUT(processor)
 
 //-----------------------------------------------------------------------------
@@ -206,4 +206,19 @@
 void ^?{}(__ready_queue_t & this);
 
+// Idle Sleep
+struct __cluster_idles {
+	// Spin lock protecting the queue
+	volatile uint64_t lock;
+
+	// Total number of processors
+	unsigned total;
+
+	// Total number of idle processors
+	unsigned idle;
+
+	// List of idle processors
+	dlist(processor, processor) list;
+};
+
 //-----------------------------------------------------------------------------
 // Cluster
@@ -219,6 +234,5 @@
 
 	// List of idle processors
-	StackLF(processor) idles;
-	volatile unsigned int nprocessors;
+	__cluster_idles idles;
 
 	// List of threads
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision 6c144d861e06011d0279338d7cb366cae8d8b84e)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision 1eb239e4849405e30caa8ad3388f10d0aa7b4b1c)
@@ -87,5 +87,5 @@
 //-----------------------------------------------------------------------------
 // Other Forward Declarations
-extern bool __wake_proc(processor *);
+extern void __wake_proc(processor *);
 
 //-----------------------------------------------------------------------------
@@ -475,5 +475,7 @@
 	#endif
 
-	int target = __atomic_add_fetch( &cltr->nprocessors, 1u, __ATOMIC_SEQ_CST );
+	lock( this.cltr->idles );
+		int target = this.cltr->idles.total += 1u;
+	unlock( this.cltr->idles );
 
 	id = doregister((__processor_id_t*)&this);
@@ -493,6 +495,7 @@
 // Not a ctor, it just preps the destruction but should not destroy members
 static void deinit(processor & this) {
-
-	int target = __atomic_sub_fetch( &this.cltr->nprocessors, 1u, __ATOMIC_SEQ_CST );
+	lock( this.cltr->idles );
+		int target = this.cltr->idles.total -= 1u;
+	unlock( this.cltr->idles );
 
 	// Lock the RWlock so no-one pushes/pops while we are changing the queue
@@ -501,7 +504,4 @@
 		// Adjust the ready queue size
 		ready_queue_shrink( this.cltr, target );
-
-		// Make sure we aren't on the idle queue
-		unsafe_remove( this.cltr->idles, &this );
 
 	// Unlock the RWlock
@@ -545,8 +545,14 @@
 //-----------------------------------------------------------------------------
 // Cluster
+static void ?{}(__cluster_idles & this) {
+	this.lock  = 0;
+	this.idle  = 0;
+	this.total = 0;
+	(this.list){};
+}
+
 void ?{}(cluster & this, const char name[], Duration preemption_rate, unsigned num_io, const io_context_params & io_params) with( this ) {
 	this.name = name;
 	this.preemption_rate = preemption_rate;
-	this.nprocessors = 0;
 	ready_queue{};
 
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 6c144d861e06011d0279338d7cb366cae8d8b84e)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision 1eb239e4849405e30caa8ad3388f10d0aa7b4b1c)
@@ -121,4 +121,22 @@
 void     unregister( struct __processor_id_t * proc );
 
+//-----------------------------------------------------------------------
+// Cluster idle lock/unlock
+static inline void lock(__cluster_idles & this) {
+	for() {
+		uint64_t l = this.lock;
+		if(
+			(0 == (l % 2))
+			&& __atomic_compare_exchange_n(&this.lock, &l, l + 1, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+		) return;
+		Pause();
+	}
+}
+
+static inline void unlock(__cluster_idles & this) {
+	/* paranoid */ verify( 1 == (this.lock % 2) );
+	__atomic_fetch_add( &this.lock, 1, __ATOMIC_SEQ_CST );
+}
+
 //=======================================================================
 // Reader-writer lock implementation
@@ -248,5 +266,12 @@
 // pop thread from the ready queue of a cluster
 // returns 0p if empty
+// May return 0p spuriously
 __attribute__((hot)) struct $thread * pop(struct cluster * cltr);
+
+//-----------------------------------------------------------------------
+// pop thread from the ready queue of a cluster
+// returns 0p if empty
+// guaranteed to find any threads added before this call
+__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr);
 
 //-----------------------------------------------------------------------
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision 6c144d861e06011d0279338d7cb366cae8d8b84e)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision 1eb239e4849405e30caa8ad3388f10d0aa7b4b1c)
@@ -17,4 +17,6 @@
 // #define __CFA_DEBUG_PRINT_READY_QUEUE__
 
+// #define USE_SNZI
+
 #include "bits/defs.hfa"
 #include "kernel_private.hfa"
@@ -192,5 +194,7 @@
 void ^?{}(__ready_queue_t & this) with (this) {
 	verify( 1 == lanes.count );
-	verify( !query( snzi ) );
+	#ifdef USE_SNZI
+		verify( !query( snzi ) );
+	#endif
 	free(lanes.data);
 }
@@ -198,5 +202,8 @@
 //-----------------------------------------------------------------------
 __attribute__((hot)) bool query(struct cluster * cltr) {
-	return query(cltr->ready_queue.snzi);
+	#ifdef USE_SNZI
+		return query(cltr->ready_queue.snzi);
+	#endif
+	return true;
 }
 
@@ -262,12 +269,14 @@
 	bool lane_first = push(lanes.data[i], thrd);
 
-	// If this lane used to be empty we need to do more
-	if(lane_first) {
-		// Check if the entire queue used to be empty
-		first = !query(snzi);
-
-		// Update the snzi
-		arrive( snzi, i );
-	}
+	#ifdef USE_SNZI
+		// If this lane used to be empty we need to do more
+		if(lane_first) {
+			// Check if the entire queue used to be empty
+			first = !query(snzi);
+
+			// Update the snzi
+			arrive( snzi, i );
+		}
+	#endif
 
 	// Unlock and return
@@ -294,4 +303,5 @@
 __attribute__((hot)) $thread * pop(struct cluster * cltr) with (cltr->ready_queue) {
 	/* paranoid */ verify( lanes.count > 0 );
+	unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
 	#if defined(BIAS)
 		// Don't bother trying locally too much
@@ -300,5 +310,9 @@
 
 	// As long as the list is not empty, try finding a lane that isn't empty and pop from it
-	while( query(snzi) ) {
+	#ifdef USE_SNZI
+		while( query(snzi) ) {
+	#else
+		for(25) {
+	#endif
 		// Pick two lists at random
 		unsigned i,j;
@@ -336,6 +350,6 @@
 		#endif
 
-		i %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
-		j %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+		i %= count;
+		j %= count;
 
 		// try popping from the 2 picked lists
@@ -353,4 +367,21 @@
 }
 
+__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
+	/* paranoid */ verify( lanes.count > 0 );
+	unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+	unsigned offset = __tls_rand();
+	for(i; count) {
+		unsigned idx = (offset + i) % count;
+		struct $thread * thrd = try_pop(cltr, idx);
+		if(thrd) {
+			return thrd;
+		}
+	}
+
+	// All lanes were empty, return 0p
+	return 0p;
+}
+
+
 //-----------------------------------------------------------------------
 // Given 2 indexes, pick the list with the oldest push an try to pop from it
@@ -394,8 +425,10 @@
 	/* paranoid */ verify(lane.lock);
 
-	// If this was the last element in the lane
-	if(emptied) {
-		depart( snzi, w );
-	}
+	#ifdef USE_SNZI
+		// If this was the last element in the lane
+		if(emptied) {
+			depart( snzi, w );
+		}
+	#endif
 
 	// Unlock and return
@@ -430,7 +463,9 @@
 
 				removed = true;
-				if(emptied) {
-					depart( snzi, i );
-				}
+				#ifdef USE_SNZI
+					if(emptied) {
+						depart( snzi, i );
+					}
+				#endif
 			}
 		__atomic_unlock(&lane.lock);
@@ -494,5 +529,7 @@
 	// grow the ready queue
 	with( cltr->ready_queue ) {
-		^(snzi){};
+		#ifdef USE_SNZI
+			^(snzi){};
+		#endif
 
 		// Find new count
@@ -516,11 +553,13 @@
 		lanes.count = ncount;
 
-		// Re-create the snzi
-		snzi{ log2( lanes.count / 8 ) };
-		for( idx; (size_t)lanes.count ) {
-			if( !is_empty(lanes.data[idx]) ) {
-				arrive(snzi, idx);
-			}
-		}
+		#ifdef USE_SNZI
+			// Re-create the snzi
+			snzi{ log2( lanes.count / 8 ) };
+			for( idx; (size_t)lanes.count ) {
+				if( !is_empty(lanes.data[idx]) ) {
+					arrive(snzi, idx);
+				}
+			}
+		#endif
 	}
 
@@ -542,5 +581,7 @@
 
 	with( cltr->ready_queue ) {
-		^(snzi){};
+		#ifdef USE_SNZI
+			^(snzi){};
+		#endif
 
 		// Remember old count
@@ -596,11 +637,13 @@
 		}
 
-		// Re-create the snzi
-		snzi{ log2( lanes.count / 8 ) };
-		for( idx; (size_t)lanes.count ) {
-			if( !is_empty(lanes.data[idx]) ) {
-				arrive(snzi, idx);
-			}
-		}
+		#ifdef USE_SNZI
+			// Re-create the snzi
+			snzi{ log2( lanes.count / 8 ) };
+			for( idx; (size_t)lanes.count ) {
+				if( !is_empty(lanes.data[idx]) ) {
+					arrive(snzi, idx);
+				}
+			}
+		#endif
 	}
 
