Index: libcfa/src/concurrency/alarm.hfa
===================================================================
--- libcfa/src/concurrency/alarm.hfa	(revision 88cafe7991e4d4b769f11c34a6c1af0ae4ecffba)
+++ libcfa/src/concurrency/alarm.hfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -23,5 +23,5 @@
 #include "time.hfa"
 
-#include <containers/list.hfa>
+#include "containers/list.hfa"
 
 struct $thread;
Index: libcfa/src/concurrency/coroutine.cfa
===================================================================
--- libcfa/src/concurrency/coroutine.cfa	(revision 88cafe7991e4d4b769f11c34a6c1af0ae4ecffba)
+++ libcfa/src/concurrency/coroutine.cfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -215,4 +215,8 @@
 		return cor;
 	}
+
+	struct $coroutine * __cfactx_cor_active(void) {
+		return active_coroutine();
+	}
 }
 
Index: libcfa/src/concurrency/invoke.c
===================================================================
--- libcfa/src/concurrency/invoke.c	(revision 88cafe7991e4d4b769f11c34a6c1af0ae4ecffba)
+++ libcfa/src/concurrency/invoke.c	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -10,6 +10,6 @@
 // Created On       : Tue Jan 17 12:27:26 2016
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Thu Aug 20 18:54:34 2020
-// Update Count     : 30
+// Last Modified On : Thu Aug 20 23:43:23 2020
+// Update Count     : 31
 //
 
@@ -29,4 +29,5 @@
 // Called from the kernel when starting a coroutine or task so must switch back to user mode.
 
+extern struct $coroutine * __cfactx_cor_active(void);
 extern struct $coroutine * __cfactx_cor_finish(void);
 extern void __cfactx_cor_leave ( struct $coroutine * );
@@ -35,4 +36,8 @@
 extern void disable_interrupts() OPTIONAL_THREAD;
 extern void enable_interrupts( __cfaabi_dbg_ctx_param );
+
+struct exception_context_t * this_exception_context() {
+	return &__get_stack( __cfactx_cor_active() )->exception_context;
+}
 
 void __cfactx_invoke_coroutine(
@@ -146,5 +151,13 @@
 
 #elif defined( __ARM_ARCH_32 )
-#warning ARM needs to be upgrade to use two parameters like X86/X64 (A.K.A. : I broke this and do not know how to fix it)
+#error ARM needs to be upgrade to use two parameters like X86/X64 (A.K.A. : I broke this and do not know how to fix it)
+	// More details about the error:
+	// To avoid the thunk problem, I changed the invoke routine to pass the main explicitly
+	// instead of relying on an assertion. This effectively hoists any required thunk one level
+	// which was enough to get to global scope in most cases.
+	// This means that __cfactx_invoke_... now takes two parameters and the FakeStack needs
+	// to be adjusted as a consequence of that.
+	// I don't know how to do that for ARM, hence the #error
+
 	struct FakeStack {
 		float fpRegs[16];								// floating point registers
Index: libcfa/src/concurrency/invoke.h
===================================================================
--- libcfa/src/concurrency/invoke.h	(revision 88cafe7991e4d4b769f11c34a6c1af0ae4ecffba)
+++ libcfa/src/concurrency/invoke.h	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -26,4 +26,11 @@
 #ifndef _INVOKE_H_
 #define _INVOKE_H_
+
+	struct __cfaehm_try_resume_node;
+	struct __cfaehm_base_exception_t;
+	struct exception_context_t {
+		struct __cfaehm_try_resume_node * top_resume;
+		struct __cfaehm_base_exception_t * current_exception;
+	};
 
 	struct __stack_context_t {
@@ -51,4 +58,7 @@
 		// base of stack
 		void * base;
+
+		// Information for exception handling.
+		struct exception_context_t exception_context;
 	};
 
@@ -84,5 +94,9 @@
 	};
 
-	static inline struct __stack_t * __get_stack( struct $coroutine * cor ) { return (struct __stack_t*)(((uintptr_t)cor->stack.storage) & ((uintptr_t)-2)); }
+	static inline struct __stack_t * __get_stack( struct $coroutine * cor ) {
+		return (struct __stack_t*)(((uintptr_t)cor->stack.storage) & ((uintptr_t)-2));
+	}
+
+	struct exception_context_t * this_exception_context();
 
 	// struct which calls the monitor is accepting
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision 88cafe7991e4d4b769f11c34a6c1af0ae4ecffba)
+++ libcfa/src/concurrency/io.cfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -41,4 +41,67 @@
 	#include "kernel/fwd.hfa"
 	#include "io/types.hfa"
+
+	// returns true if acquired as leader or second leader
+	static inline bool try_lock( __leaderlock_t & this ) {
+		const uintptr_t thrd = 1z | (uintptr_t)active_thread();
+		bool block;
+		disable_interrupts();
+		for() {
+			struct $thread * expected = this.value;
+			if( 1p != expected && 0p != expected ) {
+				/* paranoid */ verify( thrd != (uintptr_t)expected ); // We better not already be the next leader
+				enable_interrupts( __cfaabi_dbg_ctx );
+				return false;
+			}
+			struct $thread * desired;
+			if( 0p == expected ) {
+				// If the lock isn't locked acquire it, no need to block
+				desired = 1p;
+				block = false;
+			}
+			else {
+				// If the lock is already locked try becoming the next leader
+				desired = (struct $thread *)thrd;
+				block = true;
+			}
+			if( __atomic_compare_exchange_n(&this.value, &expected, desired, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) ) break;
+		}
+		if( block ) {
+			enable_interrupts( __cfaabi_dbg_ctx );
+			park( __cfaabi_dbg_ctx );
+			disable_interrupts();
+		}
+		return true;
+	}
+
+	static inline bool next( __leaderlock_t & this ) {
+		/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+		struct $thread * nextt;
+		for() {
+			struct $thread * expected = this.value;
+			/* paranoid */ verify( (1 & (uintptr_t)expected) == 1 ); // The lock better be locked
+
+			struct $thread * desired;
+			if( 1p == expected ) {
+				// No next leader, just unlock
+				desired = 0p;
+				nextt   = 0p;
+			}
+			else {
+				// There is a next leader, remove but keep locked
+				desired = 1p;
+				nextt   = (struct $thread *)(~1z & (uintptr_t)expected);
+			}
+			if( __atomic_compare_exchange_n(&this.value, &expected, desired, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) ) break;
+		}
+
+		if(nextt) {
+			unpark( nextt __cfaabi_dbg_ctx2 );
+			enable_interrupts( __cfaabi_dbg_ctx );
+			return true;
+		}
+		enable_interrupts( __cfaabi_dbg_ctx );
+		return false;
+	}
 
 //=============================================================================================
@@ -93,5 +156,5 @@
 //=============================================================================================
 	static unsigned __collect_submitions( struct __io_data & ring );
-	static uint32_t __release_consumed_submission( struct __io_data & ring );
+	static __u32 __release_consumed_submission( struct __io_data & ring );
 
 	static inline void process(struct io_uring_cqe & cqe ) {
@@ -100,5 +163,5 @@
 
 		data->result = cqe.res;
-		unpark( data->thrd __cfaabi_dbg_ctx2 );
+		post( data->sem );
 	}
 
@@ -136,5 +199,5 @@
 		unsigned head = *ring.completion_q.head;
 		unsigned tail = *ring.completion_q.tail;
-		const uint32_t mask = *ring.completion_q.mask;
+		const __u32 mask = *ring.completion_q.mask;
 
 		// Nothing was new return 0
@@ -143,5 +206,5 @@
 		}
 
-		uint32_t count = tail - head;
+		__u32 count = tail - head;
 		/* paranoid */ verify( count != 0 );
 		for(i; count) {
@@ -182,5 +245,5 @@
 				__STATS__( true,
 					io.complete_q.completed_avg.val += count;
-					io.complete_q.completed_avg.fast_cnt += 1;
+					io.complete_q.completed_avg.cnt += 1;
 				)
 			enable_interrupts( __cfaabi_dbg_ctx );
@@ -192,4 +255,7 @@
 			// We didn't get anything baton pass to the slow poller
 			else {
+				__STATS__( false,
+					io.complete_q.blocks += 1;
+				)
 				__cfadbg_print_safe(io_core, "Kernel I/O : Parking io poller %p\n", &this.self);
 				reset = 0;
@@ -224,5 +290,5 @@
 //
 
-	[* struct io_uring_sqe, uint32_t] __submit_alloc( struct __io_data & ring, uint64_t data ) {
+	[* struct io_uring_sqe, __u32] __submit_alloc( struct __io_data & ring, __u64 data ) {
 		/* paranoid */ verify( data != 0 );
 
@@ -230,9 +296,9 @@
 		__attribute((unused)) int len   = 0;
 		__attribute((unused)) int block = 0;
-		uint32_t cnt = *ring.submit_q.num;
-		uint32_t mask = *ring.submit_q.mask;
+		__u32 cnt = *ring.submit_q.num;
+		__u32 mask = *ring.submit_q.mask;
 
 		disable_interrupts();
-			uint32_t off = __tls_rand();
+			__u32 off = __tls_rand();
 		enable_interrupts( __cfaabi_dbg_ctx );
 
@@ -241,8 +307,8 @@
 			// Look through the list starting at some offset
 			for(i; cnt) {
-				uint64_t expected = 0;
-				uint32_t idx = (i + off) & mask;
+				__u64 expected = 0;
+				__u32 idx = (i + off) & mask;
 				struct io_uring_sqe * sqe = &ring.submit_q.sqes[idx];
-				volatile uint64_t * udata = (volatile uint64_t *)&sqe->user_data;
+				volatile __u64 * udata = &sqe->user_data;
 
 				if( *udata == expected &&
@@ -270,5 +336,5 @@
 	}
 
-	static inline uint32_t __submit_to_ready_array( struct __io_data & ring, uint32_t idx, const uint32_t mask ) {
+	static inline __u32 __submit_to_ready_array( struct __io_data & ring, __u32 idx, const __u32 mask ) {
 		/* paranoid */ verify( idx <= mask   );
 		/* paranoid */ verify( idx != -1ul32 );
@@ -277,15 +343,15 @@
 		__attribute((unused)) int len   = 0;
 		__attribute((unused)) int block = 0;
-		uint32_t ready_mask = ring.submit_q.ready_cnt - 1;
+		__u32 ready_mask = ring.submit_q.ready_cnt - 1;
 
 		disable_interrupts();
-			uint32_t off = __tls_rand();
+			__u32 off = __tls_rand();
 		enable_interrupts( __cfaabi_dbg_ctx );
 
-		uint32_t picked;
+		__u32 picked;
 		LOOKING: for() {
 			for(i; ring.submit_q.ready_cnt) {
 				picked = (i + off) & ready_mask;
-				uint32_t expected = -1ul32;
+				__u32 expected = -1ul32;
 				if( __atomic_compare_exchange_n( &ring.submit_q.ready[picked], &expected, idx, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED ) ) {
 					break LOOKING;
@@ -297,9 +363,7 @@
 
 			block++;
-			if( try_lock(ring.submit_q.lock __cfaabi_dbg_ctx2) ) {
-				__release_consumed_submission( ring );
-				unlock( ring.submit_q.lock );
-			}
-			else {
+
+			__u32 released = __release_consumed_submission( ring );
+			if( released == 0 ) {
 				yield();
 			}
@@ -316,9 +380,9 @@
 	}
 
-	void __submit( struct io_context * ctx, uint32_t idx ) __attribute__((nonnull (1))) {
+	void __submit( struct io_context * ctx, __u32 idx ) __attribute__((nonnull (1))) {
 		__io_data & ring = *ctx->thrd.ring;
 		// Get now the data we definetely need
-		volatile uint32_t * const tail = ring.submit_q.tail;
-		const uint32_t mask  = *ring.submit_q.mask;
+		volatile __u32 * const tail = ring.submit_q.tail;
+		const __u32 mask  = *ring.submit_q.mask;
 
 		// There are 2 submission schemes, check which one we are using
@@ -332,12 +396,8 @@
 		}
 		else if( ring.eager_submits ) {
-			uint32_t picked = __submit_to_ready_array( ring, idx, mask );
-
-			for() {
-				yield();
-
-				// If some one else collected our index, we are done
-				#warning ABA problem
-				if( ring.submit_q.ready[picked] != idx ) {
+			__u32 picked = __submit_to_ready_array( ring, idx, mask );
+
+			#if defined(LEADER_LOCK)
+				if( !try_lock(ring.submit_q.submit_lock) ) {
 					__STATS__( false,
 						io.submit_q.helped += 1;
@@ -345,26 +405,48 @@
 					return;
 				}
-
-				if( try_lock(ring.submit_q.lock __cfaabi_dbg_ctx2) ) {
+				/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+				__STATS__( true,
+					io.submit_q.leader += 1;
+				)
+			#else
+				for() {
+					yield();
+
+					if( try_lock(ring.submit_q.submit_lock __cfaabi_dbg_ctx2) ) {
+						__STATS__( false,
+							io.submit_q.leader += 1;
+						)
+						break;
+					}
+
+					// If someone else collected our index, we are done
+					#warning ABA problem
+					if( ring.submit_q.ready[picked] != idx ) {
+						__STATS__( false,
+							io.submit_q.helped += 1;
+						)
+						return;
+					}
+
 					__STATS__( false,
-						io.submit_q.leader += 1;
+						io.submit_q.busy += 1;
 					)
-					break;
-				}
-
-				__STATS__( false,
-					io.submit_q.busy += 1;
-				)
-			}
+				}
+			#endif
 
 			// We got the lock
+			// Collect the submissions
 			unsigned to_submit = __collect_submitions( ring );
+
+			// Actually submit
 			int ret = __io_uring_enter( ring, to_submit, false );
-			if( ret < 0 ) {
-				unlock(ring.submit_q.lock);
-				return;
-			}
-
-			/* paranoid */ verify( ret > 0 || to_submit == 0 || (ring.ring_flags & IORING_SETUP_SQPOLL) );
+
+			#if defined(LEADER_LOCK)
+				/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+				next(ring.submit_q.submit_lock);
+			#else
+				unlock(ring.submit_q.submit_lock);
+			#endif
+			if( ret < 0 ) return;
 
 			// Release the consumed SQEs
@@ -372,15 +454,17 @@
 
 			// update statistics
-			__STATS__( true,
+			__STATS__( false,
 				io.submit_q.submit_avg.rdy += to_submit;
 				io.submit_q.submit_avg.csm += ret;
 				io.submit_q.submit_avg.cnt += 1;
 			)
-
-			unlock(ring.submit_q.lock);
 		}
 		else {
 			// get mutual exclusion
-			lock(ring.submit_q.lock __cfaabi_dbg_ctx2);
+			#if defined(LEADER_LOCK)
+				while(!try_lock(ring.submit_q.submit_lock));
+			#else
+				lock(ring.submit_q.submit_lock __cfaabi_dbg_ctx2);
+			#endif
 
 			/* paranoid */ verifyf( ring.submit_q.sqes[ idx ].user_data != 0,
@@ -420,5 +504,9 @@
 			__release_consumed_submission( ring );
 
-			unlock(ring.submit_q.lock);
+			#if defined(LEADER_LOCK)
+				next(ring.submit_q.submit_lock);
+			#else
+				unlock(ring.submit_q.submit_lock);
+			#endif
 
 			__cfadbg_print_safe( io, "Kernel I/O : Performed io_submit for %p, returned %d\n", active_thread(), ret );
@@ -426,4 +514,5 @@
 	}
 
+	// #define PARTIAL_SUBMIT 32
 	static unsigned __collect_submitions( struct __io_data & ring ) {
 		/* paranoid */ verify( ring.submit_q.ready != 0p );
@@ -431,11 +520,24 @@
 
 		unsigned to_submit = 0;
-		uint32_t tail = *ring.submit_q.tail;
-		const uint32_t mask = *ring.submit_q.mask;
+		__u32 tail = *ring.submit_q.tail;
+		const __u32 mask = *ring.submit_q.mask;
+		#if defined(PARTIAL_SUBMIT)
+			#if defined(LEADER_LOCK)
+				#error PARTIAL_SUBMIT and LEADER_LOCK cannot co-exist
+			#endif
+			const __u32 cnt = ring.submit_q.ready_cnt > PARTIAL_SUBMIT ? PARTIAL_SUBMIT : ring.submit_q.ready_cnt;
+			const __u32 offset = ring.submit_q.prev_ready;
+			ring.submit_q.prev_ready += cnt;
+		#else
+			const __u32 cnt = ring.submit_q.ready_cnt;
+			const __u32 offset = 0;
+		#endif
 
 		// Go through the list of ready submissions
-		for( i; ring.submit_q.ready_cnt ) {
+		for( c; cnt ) {
+			__u32 i = (offset + c) % ring.submit_q.ready_cnt;
+
 			// replace any submission with the sentinel, to consume it.
-			uint32_t idx = __atomic_exchange_n( &ring.submit_q.ready[i], -1ul32, __ATOMIC_RELAXED);
+			__u32 idx = __atomic_exchange_n( &ring.submit_q.ready[i], -1ul32, __ATOMIC_RELAXED);
 
 			// If it was already the sentinel, then we are done
@@ -453,16 +555,16 @@
 	}
 
-	static uint32_t __release_consumed_submission( struct __io_data & ring ) {
-		const uint32_t smask = *ring.submit_q.mask;
+	static __u32 __release_consumed_submission( struct __io_data & ring ) {
+		const __u32 smask = *ring.submit_q.mask;
 
 		if( !try_lock(ring.submit_q.release_lock __cfaabi_dbg_ctx2) ) return 0;
-		uint32_t chead = *ring.submit_q.head;
-		uint32_t phead = ring.submit_q.prev_head;
+		__u32 chead = *ring.submit_q.head;
+		__u32 phead = ring.submit_q.prev_head;
 		ring.submit_q.prev_head = chead;
 		unlock(ring.submit_q.release_lock);
 
-		uint32_t count = chead - phead;
+		__u32 count = chead - phead;
 		for( i; count ) {
-			uint32_t idx = ring.submit_q.array[ (phead + i) & smask ];
+			__u32 idx = ring.submit_q.array[ (phead + i) & smask ];
 			ring.submit_q.sqes[ idx ].user_data = 0;
 		}
Index: libcfa/src/concurrency/io/setup.cfa
===================================================================
--- libcfa/src/concurrency/io/setup.cfa	(revision 88cafe7991e4d4b769f11c34a6c1af0ae4ecffba)
+++ libcfa/src/concurrency/io/setup.cfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -228,5 +228,5 @@
 		if( cluster_context ) {
 			cluster & cltr = *thrd.curr_cluster;
-			/* paranoid */ verify( cltr.nprocessors == 0 || &cltr == mainCluster );
+			/* paranoid */ verify( cltr.idles.total == 0 || &cltr == mainCluster );
 			/* paranoid */ verify( !ready_mutate_islocked() );
 
@@ -298,5 +298,11 @@
 		if( params_in.poll_complete ) params.flags |= IORING_SETUP_IOPOLL;
 
-		uint32_t nentries = params_in.num_entries;
+		__u32 nentries = params_in.num_entries != 0 ? params_in.num_entries : 256;
+		if( !is_pow2(nentries) ) {
+			abort("ERROR: I/O setup 'num_entries' must be a power of 2\n");
+		}
+		if( params_in.poller_submits && params_in.eager_submits ) {
+			abort("ERROR: I/O setup 'poller_submits' and 'eager_submits' cannot be used together\n");
+		}
 
 		int fd = syscall(__NR_io_uring_setup, nentries, &params );
@@ -356,15 +362,15 @@
 		// Get the pointers from the kernel to fill the structure
 		// submit queue
-		sq.head    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.head);
-		sq.tail    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.tail);
-		sq.mask    = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_mask);
-		sq.num     = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_entries);
-		sq.flags   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.flags);
-		sq.dropped = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.dropped);
-		sq.array   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);
+		sq.head    = (volatile __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.head);
+		sq.tail    = (volatile __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.tail);
+		sq.mask    = (   const __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_mask);
+		sq.num     = (   const __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_entries);
+		sq.flags   = (         __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.flags);
+		sq.dropped = (         __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.dropped);
+		sq.array   = (         __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);
 		sq.prev_head = *sq.head;
 
 		{
-			const uint32_t num = *sq.num;
+			const __u32 num = *sq.num;
 			for( i; num ) {
 				sq.sqes[i].user_data = 0ul64;
@@ -372,5 +378,5 @@
 		}
 
-		(sq.lock){};
+		(sq.submit_lock){};
 		(sq.release_lock){};
 
@@ -382,17 +388,19 @@
 				sq.ready[i] = -1ul32;
 			}
+			sq.prev_ready = 0;
 		}
 		else {
 			sq.ready_cnt = 0;
 			sq.ready = 0p;
+			sq.prev_ready = 0;
 		}
 
 		// completion queue
-		cq.head     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.head);
-		cq.tail     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.tail);
-		cq.mask     = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_mask);
-		cq.num      = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_entries);
-		cq.overflow = (         uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.overflow);
-		cq.cqes   = (struct io_uring_cqe *)(((intptr_t)cq.ring_ptr) + params.cq_off.cqes);
+		cq.head      = (volatile __u32 *)(((intptr_t)cq.ring_ptr) + params.cq_off.head);
+		cq.tail      = (volatile __u32 *)(((intptr_t)cq.ring_ptr) + params.cq_off.tail);
+		cq.mask      = (   const __u32 *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_mask);
+		cq.num       = (   const __u32 *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_entries);
+		cq.overflow  = (         __u32 *)(((intptr_t)cq.ring_ptr) + params.cq_off.overflow);
+		cq.cqes = (struct io_uring_cqe *)(((intptr_t)cq.ring_ptr) + params.cq_off.cqes);
 
 		// some paranoid checks
@@ -442,5 +450,5 @@
 	void __ioctx_register($io_ctx_thread & ctx, struct epoll_event & ev) {
 		ev.events = EPOLLIN | EPOLLONESHOT;
-		ev.data.u64 = (uint64_t)&ctx;
+		ev.data.u64 = (__u64)&ctx;
 		int ret = epoll_ctl(iopoll.epollfd, EPOLL_CTL_ADD, ctx.ring->fd, &ev);
 		if (ret < 0) {
Index: libcfa/src/concurrency/io/types.hfa
===================================================================
--- libcfa/src/concurrency/io/types.hfa	(revision 88cafe7991e4d4b769f11c34a6c1af0ae4ecffba)
+++ libcfa/src/concurrency/io/types.hfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -17,5 +17,16 @@
 
 #if defined(CFA_HAVE_LINUX_IO_URING_H)
+	extern "C" {
+		#include <linux/types.h>
+	}
+
       #include "bits/locks.hfa"
+
+	#define LEADER_LOCK
+	struct __leaderlock_t {
+		struct $thread * volatile value;	// ($thread) next_leader | (bool:1) is_locked
+	};
+
+	static inline void ?{}( __leaderlock_t & this ) { this.value = 0p; }
 
 	//-----------------------------------------------------------------------
@@ -23,28 +34,33 @@
       struct __submition_data {
 		// Head and tail of the ring (associated with array)
-		volatile uint32_t * head;
-		volatile uint32_t * tail;
-		volatile uint32_t prev_head;
+		volatile __u32 * head;
+		volatile __u32 * tail;
+		volatile __u32 prev_head;
 
 		// The actual kernel ring which uses head/tail
 		// indexes into the sqes arrays
-		uint32_t * array;
+		__u32 * array;
 
 		// number of entries and mask to go with it
-		const uint32_t * num;
-		const uint32_t * mask;
+		const __u32 * num;
+		const __u32 * mask;
 
 		// Submission flags (Not sure what for)
-		uint32_t * flags;
+		__u32 * flags;
 
 		// number of sqes not submitted (whatever that means)
-		uint32_t * dropped;
+		__u32 * dropped;
 
 		// Like head/tail but not seen by the kernel
-		volatile uint32_t * ready;
-		uint32_t ready_cnt;
+		volatile __u32 * ready;
+		__u32 ready_cnt;
+		__u32 prev_ready;
 
-		__spinlock_t lock;
-		__spinlock_t release_lock;
+		#if defined(LEADER_LOCK)
+			__leaderlock_t submit_lock;
+		#else
+			__spinlock_t submit_lock;
+		#endif
+		__spinlock_t  release_lock;
 
 		// A buffer of sqes (not the actual ring)
@@ -58,13 +74,13 @@
 	struct __completion_data {
 		// Head and tail of the ring
-		volatile uint32_t * head;
-		volatile uint32_t * tail;
+		volatile __u32 * head;
+		volatile __u32 * tail;
 
 		// number of entries and mask to go with it
-		const uint32_t * mask;
-		const uint32_t * num;
+		const __u32 * mask;
+		const __u32 * num;
 
 		// number of cqes not submitted (whatever that means)
-		uint32_t * overflow;
+		__u32 * overflow;
 
 		// the kernel ring
@@ -79,5 +95,5 @@
 		struct __submition_data submit_q;
 		struct __completion_data completion_q;
-		uint32_t ring_flags;
+		__u32 ring_flags;
 		int fd;
 		bool eager_submits:1;
@@ -89,6 +105,6 @@
 	// IO user data
 	struct __io_user_data_t {
-		int32_t result;
-		$thread * thrd;
+		__s32 result;
+		oneshot sem;
 	};
 
Index: libcfa/src/concurrency/iocall.cfa
===================================================================
--- libcfa/src/concurrency/iocall.cfa	(revision 88cafe7991e4d4b769f11c34a6c1af0ae4ecffba)
+++ libcfa/src/concurrency/iocall.cfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -32,8 +32,8 @@
 	#include "io/types.hfa"
 
-	extern [* struct io_uring_sqe, uint32_t] __submit_alloc( struct __io_data & ring, uint64_t data );
-	extern void __submit( struct io_context * ctx, uint32_t idx ) __attribute__((nonnull (1)));
-
-	static inline void ?{}(struct io_uring_sqe & this, uint8_t opcode, int fd) {
+	extern [* struct io_uring_sqe, __u32] __submit_alloc( struct __io_data & ring, __u64 data );
+	extern void __submit( struct io_context * ctx, __u32 idx ) __attribute__((nonnull (1)));
+
+	static inline void ?{}(struct io_uring_sqe & this, __u8 opcode, int fd) {
 		this.opcode = opcode;
 		#if !defined(IOSQE_ASYNC)
@@ -51,8 +51,8 @@
 	}
 
-	static inline void ?{}(struct io_uring_sqe & this, uint8_t opcode, int fd, void * addr, uint32_t len, uint64_t off ) {
+	static inline void ?{}(struct io_uring_sqe & this, __u8 opcode, int fd, void * addr, __u32 len, __u64 off ) {
 		(this){ opcode, fd };
 		this.off = off;
-		this.addr = (uint64_t)(uintptr_t)addr;
+		this.addr = (__u64)(uintptr_t)addr;
 		this.len = len;
 	}
@@ -101,21 +101,21 @@
 	#endif
 
-
 	#define __submit_prelude \
 		if( 0 != (submit_flags & LINK_FLAGS) ) { errno = ENOTSUP; return -1; } \
 		(void)timeout; (void)cancellation; \
 		if( !context ) context = __get_io_context(); \
-		__io_user_data_t data = { 0, active_thread() }; \
+		__io_user_data_t data = { 0 }; \
 		struct __io_data & ring = *context->thrd.ring; \
 		struct io_uring_sqe * sqe; \
-		uint32_t idx; \
-		[sqe, idx] = __submit_alloc( ring, (uint64_t)(uintptr_t)&data ); \
-		sqe->flags = REGULAR_FLAGS & submit_flags;
+		__u32 idx; \
+		__u8 sflags = REGULAR_FLAGS & submit_flags; \
+		[sqe, idx] = __submit_alloc( ring, (__u64)(uintptr_t)&data ); \
+		sqe->flags = sflags;
 
 	#define __submit_wait \
 		/*__cfaabi_bits_print_safe( STDERR_FILENO, "Preparing user data %p for %p\n", &data, data.thrd );*/ \
-		verify( sqe->user_data == (uint64_t)(uintptr_t)&data ); \
+		verify( sqe->user_data == (__u64)(uintptr_t)&data ); \
 		__submit( context, idx ); \
-		park( __cfaabi_dbg_ctx ); \
+		wait( data.sem ); \
 		if( data.result < 0 ) { \
 			errno = -data.result; \
@@ -149,5 +149,12 @@
 
 	extern int fsync(int fd);
-	extern int sync_file_range(int fd, int64_t offset, int64_t nbytes, unsigned int flags);
+
+	#if __OFF_T_MATCHES_OFF64_T
+		typedef __off64_t off_t;
+	#else
+		typedef __off_t off_t;
+	#endif
+	typedef __off64_t off64_t;
+	extern int sync_file_range(int fd, off64_t offset, off64_t nbytes, unsigned int flags);
 
 	struct msghdr;
@@ -160,6 +167,6 @@
 	extern int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
 
-	extern int fallocate(int fd, int mode, uint64_t offset, uint64_t len);
-	extern int posix_fadvise(int fd, uint64_t offset, uint64_t len, int advice);
+	extern int fallocate(int fd, int mode, off_t offset, off_t len);
+	extern int posix_fadvise(int fd, off_t offset, off_t len, int advice);
 	extern int madvise(void *addr, size_t length, int advice);
 
@@ -186,5 +193,12 @@
 			__submit_prelude
 
-			(*sqe){ IORING_OP_READV, fd, iov, iovcnt, offset };
+			sqe->opcode = IORING_OP_READV;
+			sqe->ioprio = 0;
+			sqe->fd = fd;
+			sqe->off = offset;
+			sqe->addr = (__u64)iov;
+			sqe->len = iovcnt;
+			sqe->rw_flags = 0;
+			sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
 
 			__submit_wait
@@ -200,5 +214,12 @@
 			__submit_prelude
 
-			(*sqe){ IORING_OP_WRITEV, fd, iov, iovcnt, offset };
+			sqe->opcode = IORING_OP_WRITEV;
+			sqe->ioprio = 0;
+			sqe->fd = fd;
+			sqe->off = offset;
+			sqe->addr = (__u64)iov;
+			sqe->len = iovcnt;
+			sqe->rw_flags = 0;
+			sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
 
 			__submit_wait
@@ -213,11 +234,18 @@
 		__submit_prelude
 
-		(*sqe){ IORING_OP_FSYNC, fd };
-
-		__submit_wait
-	#endif
-}
-
-int cfa_sync_file_range(int fd, int64_t offset, int64_t nbytes, unsigned int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
+		sqe->opcode = IORING_OP_FSYNC;
+		sqe->ioprio = 0;
+		sqe->fd = fd;
+		sqe->off = 0;
+		sqe->addr = 0;
+		sqe->len = 0;
+		sqe->rw_flags = 0;
+		sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
+
+		__submit_wait
+	#endif
+}
+
+int cfa_sync_file_range(int fd, off64_t offset, off64_t nbytes, unsigned int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_SYNC_FILE_RANGE)
 		return sync_file_range(fd, offset, nbytes, flags);
@@ -268,5 +296,5 @@
 
 		(*sqe){ IORING_OP_SEND, sockfd };
-		sqe->addr = (uint64_t)buf;
+		sqe->addr = (__u64)buf;
 		sqe->len = len;
 		sqe->msg_flags = flags;
@@ -283,5 +311,5 @@
 
 		(*sqe){ IORING_OP_RECV, sockfd };
-		sqe->addr = (uint64_t)buf;
+		sqe->addr = (__u64)buf;
 		sqe->len = len;
 		sqe->msg_flags = flags;
@@ -298,6 +326,6 @@
 
 		(*sqe){ IORING_OP_ACCEPT, sockfd };
-		sqe->addr = (uint64_t)(uintptr_t)addr;
-		sqe->addr2 = (uint64_t)(uintptr_t)addrlen;
+		sqe->addr  = (__u64)addr;
+		sqe->addr2 = (__u64)addrlen;
 		sqe->accept_flags = flags;
 
@@ -313,12 +341,12 @@
 
 		(*sqe){ IORING_OP_CONNECT, sockfd };
-		sqe->addr = (uint64_t)(uintptr_t)addr;
-		sqe->off  = (uint64_t)(uintptr_t)addrlen;
-
-		__submit_wait
-	#endif
-}
-
-int cfa_fallocate(int fd, int mode, uint64_t offset, uint64_t len, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
+		sqe->addr = (__u64)addr;
+		sqe->off  = (__u64)addrlen;
+
+		__submit_wait
+	#endif
+}
+
+int cfa_fallocate(int fd, int mode, off_t offset, off_t len, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_FALLOCATE)
 		return fallocate( fd, mode, offset, len );
@@ -337,5 +365,5 @@
 }
 
-int cfa_fadvise(int fd, uint64_t offset, uint64_t len, int advice, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
+int cfa_fadvise(int fd, off_t offset, off_t len, int advice, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_FADVISE)
 		return posix_fadvise( fd, offset, len, advice );
@@ -344,5 +372,5 @@
 
 		(*sqe){ IORING_OP_FADVISE, fd };
-		sqe->off = (uint64_t)offset;
+		sqe->off = (__u64)offset;
 		sqe->len = len;
 		sqe->fadvise_advice = advice;
@@ -359,5 +387,5 @@
 
 		(*sqe){ IORING_OP_MADVISE, 0 };
-		sqe->addr = (uint64_t)addr;
+		sqe->addr = (__u64)addr;
 		sqe->len = length;
 		sqe->fadvise_advice = advice;
@@ -374,5 +402,5 @@
 
 		(*sqe){ IORING_OP_OPENAT, dirfd };
-		sqe->addr = (uint64_t)pathname;
+		sqe->addr = (__u64)pathname;
 		sqe->open_flags = flags;
 		sqe->len = mode;
@@ -407,5 +435,5 @@
 		__submit_prelude
 
-		(*sqe){ IORING_OP_STATX, dirfd, pathname, mask, (uint64_t)statxbuf };
+		(*sqe){ IORING_OP_STATX, dirfd, pathname, mask, (__u64)statxbuf };
 		sqe->statx_flags = flags;
 
@@ -449,5 +477,5 @@
 		}
 		else {
-			sqe->off = (uint64_t)-1;
+			sqe->off = (__u64)-1;
 		}
 		sqe->len = len;
@@ -457,5 +485,5 @@
 		}
 		else {
-			sqe->splice_off_in = (uint64_t)-1;
+			sqe->splice_off_in = (__u64)-1;
 		}
 		sqe->splice_flags  = flags | (SPLICE_FLAGS & submit_flags);
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 88cafe7991e4d4b769f11c34a6c1af0ae4ecffba)
+++ libcfa/src/concurrency/kernel.cfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -102,9 +102,12 @@
 // Kernel Scheduling logic
 static $thread * __next_thread(cluster * this);
-static bool __has_next_thread(cluster * this);
+static $thread * __next_thread_slow(cluster * this);
 static void __run_thread(processor * this, $thread * dst);
-static bool __wake_one(struct __processor_id_t * id, cluster * cltr);
-static void __halt(processor * this);
-bool __wake_proc(processor *);
+static void __wake_one(struct __processor_id_t * id, cluster * cltr);
+
+static void push  (__cluster_idles & idles, processor & proc);
+static void remove(__cluster_idles & idles, processor & proc);
+static [unsigned idle, unsigned total, * processor] query( & __cluster_idles idles );
+
 
 //=============================================================================================
@@ -116,4 +119,6 @@
 	// Do it here
 	kernelTLS.rand_seed ^= rdtscl();
+	kernelTLS.ready_rng.fwd_seed = 25214903917_l64u * (rdtscl() ^ (uintptr_t)&runner);
+	__tls_rand_advance_bck();
 
 	processor * this = runner.proc;
@@ -134,27 +139,67 @@
 
 		$thread * readyThread = 0p;
-		for( unsigned int spin_count = 0;; spin_count++ ) {
+		MAIN_LOOP:
+		for() {
 			// Try to get the next thread
 			readyThread = __next_thread( this->cltr );
 
-			// Check if we actually found a thread
-			if( readyThread ) {
-				/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-				/* paranoid */ verifyf( readyThread->state == Ready || readyThread->preempted != __NO_PREEMPTION, "state : %d, preempted %d\n", readyThread->state, readyThread->preempted);
-				/* paranoid */ verifyf( readyThread->link.next == 0p, "Expected null got %p", readyThread->link.next );
-				__builtin_prefetch( readyThread->context.SP );
-
-				// We found a thread run it
-				__run_thread(this, readyThread);
-
-				/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+			if( !readyThread ) {
+				readyThread = __next_thread_slow( this->cltr );
 			}
 
-			if(__atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST)) break;
-
+			HALT:
 			if( !readyThread ) {
-				// Block until a thread is ready
-				__halt(this);
+				// Don't block if we are done
+				if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) break MAIN_LOOP;
+
+				#if !defined(__CFA_NO_STATISTICS__)
+					__tls_stats()->ready.sleep.halts++;
+				#endif
+
+				// Push self to idle stack
+				push(this->cltr->idles, * this);
+
+				// Confirm the ready-queue is empty
+				readyThread = __next_thread_slow( this->cltr );
+				if( readyThread ) {
+					// A thread was found, cancel the halt
+					remove(this->cltr->idles, * this);
+
+					#if !defined(__CFA_NO_STATISTICS__)
+						__tls_stats()->ready.sleep.cancels++;
+					#endif
+
+					// Skip the rest of the halt logic and continue the main loop with the thread that was found
+					break HALT;
+				}
+
+				#if !defined(__CFA_NO_STATISTICS__)
+					if(this->print_halts) {
+						__cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->id, rdtscl());
+					}
+				#endif
+
+				wait( this->idle );
+
+				#if !defined(__CFA_NO_STATISTICS__)
+					if(this->print_halts) {
+						__cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->id, rdtscl());
+					}
+				#endif
+
+				// We were woken up, remove self from idle
+				remove(this->cltr->idles, * this);
+
+				// DON'T just proceed, start looking again
+				continue MAIN_LOOP;
 			}
+
+			/* paranoid */ verify( readyThread );
+
+			// We found a thread run it
+			__run_thread(this, readyThread);
+
+			// Are we done?
+			if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) break MAIN_LOOP;
 		}
 
@@ -181,4 +226,9 @@
 // from the processor coroutine to the target thread
 static void __run_thread(processor * this, $thread * thrd_dst) {
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verifyf( thrd_dst->state == Ready || thrd_dst->preempted != __NO_PREEMPTION, "state : %d, preempted %d\n", thrd_dst->state, thrd_dst->preempted);
+	/* paranoid */ verifyf( thrd_dst->link.next == 0p, "Expected null got %p", thrd_dst->link.next );
+	__builtin_prefetch( thrd_dst->context.SP );
+
 	$coroutine * proc_cor = get_coroutine(this->runner);
 
@@ -260,4 +310,6 @@
 	proc_cor->state = Active;
 	kernelTLS.this_thread = 0p;
+
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
 }
 
@@ -316,13 +368,5 @@
 	ready_schedule_lock  ( id );
 		push( thrd->curr_cluster, thrd );
-
-		#if !defined(__CFA_NO_STATISTICS__)
-			bool woke =
-		#endif
-			__wake_one(id, thrd->curr_cluster);
-
-		#if !defined(__CFA_NO_STATISTICS__)
-			if(woke) __tls_stats()->ready.sleep.wakes++;
-		#endif
+		__wake_one(id, thrd->curr_cluster);
 	ready_schedule_unlock( id );
 
@@ -331,25 +375,25 @@
 
 // KERNEL ONLY
-static $thread * __next_thread(cluster * this) with( *this ) {
+static inline $thread * __next_thread(cluster * this) with( *this ) {
 	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
 
 	ready_schedule_lock  ( (__processor_id_t*)kernelTLS.this_processor );
-		$thread * head = pop( this );
+		$thread * thrd = pop( this );
 	ready_schedule_unlock( (__processor_id_t*)kernelTLS.this_processor );
 
 	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-	return head;
+	return thrd;
 }
 
 // KERNEL ONLY
-static bool __has_next_thread(cluster * this) with( *this ) {
+static inline $thread * __next_thread_slow(cluster * this) with( *this ) {
 	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
 
 	ready_schedule_lock  ( (__processor_id_t*)kernelTLS.this_processor );
-		bool not_empty = query( this );
+		$thread * thrd = pop_slow( this );
 	ready_schedule_unlock( (__processor_id_t*)kernelTLS.this_processor );
 
 	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-	return not_empty;
+	return thrd;
 }
 
@@ -441,21 +485,32 @@
 //=============================================================================================
 // Wake a thread from the front if there are any
-static bool __wake_one(struct __processor_id_t * id, cluster * this) {
+static void __wake_one(struct __processor_id_t * id, cluster * this) {
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
 	/* paranoid */ verify( ready_schedule_islocked( id ) );
 
 	// Check if there is a sleeping processor
-	processor * p = pop(this->idles);
+	processor * p;
+	unsigned idle;
+	unsigned total;
+	[idle, total, p] = query(this->idles);
 
 	// If no one is sleeping, we are done
-	if( 0p == p ) return false;
+	if( idle == 0 ) return;
 
 	// We found a processor, wake it up
 	post( p->idle );
 
-	return true;
+	#if !defined(__CFA_NO_STATISTICS__)
+		__tls_stats()->ready.sleep.wakes++;
+	#endif
+
+	/* paranoid */ verify( ready_schedule_islocked( id ) );
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+
+	return;
 }
 
 // Unconditionnaly wake a thread
-bool __wake_proc(processor * this) {
+void __wake_proc(processor * this) {
 	__cfadbg_print_safe(runtime_core, "Kernel : waking Processor %p\n", this);
 
@@ -464,42 +519,40 @@
 		bool ret = post( this->idle );
 	enable_interrupts( __cfaabi_dbg_ctx );
-
-	return ret;
-}
-
-static void __halt(processor * this) with( *this ) {
-	if( do_terminate ) return;
-
-	#if !defined(__CFA_NO_STATISTICS__)
-		__tls_stats()->ready.sleep.halts++;
-	#endif
-	// Push self to queue
-	push(cltr->idles, *this);
-
-	// Makre sure we don't miss a thread
-	if( __has_next_thread(cltr) ) {
-		// A thread was posted, make sure a processor is woken up
-		struct __processor_id_t *id = (struct __processor_id_t *) this;
-		ready_schedule_lock  ( id );
-			__wake_one( id, cltr );
-		ready_schedule_unlock( id );
-		#if !defined(__CFA_NO_STATISTICS__)
-			__tls_stats()->ready.sleep.cancels++;
-		#endif
-	}
-
-	#if !defined(__CFA_NO_STATISTICS__)
-		if(this->print_halts) {
-			__cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->id, rdtscl());
-		}
-	#endif
-
-	wait( idle );
-
-	#if !defined(__CFA_NO_STATISTICS__)
-		if(this->print_halts) {
-			__cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->id, rdtscl());
-		}
-	#endif
+}
+
+static void push  (__cluster_idles & this, processor & proc) {
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	lock( this );
+		this.idle++;
+		/* paranoid */ verify( this.idle <= this.total );
+
+		insert_first(this.list, proc);
+	unlock( this );
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+}
+
+static void remove(__cluster_idles & this, processor & proc) {
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	lock( this );
+		// idle is unsigned: check before decrementing, a ">= 0" test after the decrement can never fail
+		/* paranoid */ verify( this.idle > 0 );
+		this.idle--;
+		remove(proc);
+	unlock( this );
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+}
+
+static [unsigned idle, unsigned total, * processor] query( & __cluster_idles this ) {
+	for() {
+		uint64_t l = __atomic_load_n(&this.lock, __ATOMIC_SEQ_CST);
+		if( 1 == (l % 2) ) { Pause(); continue; }
+		unsigned idle    = this.idle;
+		unsigned total   = this.total;
+		processor * proc = &this.list`first;
+		// Compiler fence is unnecessary, but gcc-8 and older incorrectly reorder code without it
+		asm volatile("": : :"memory");
+		if(l != __atomic_load_n(&this.lock, __ATOMIC_SEQ_CST)) { Pause(); continue; }
+		return [idle, total, proc];
+	}
 }
 
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 88cafe7991e4d4b769f11c34a6c1af0ae4ecffba)
+++ libcfa/src/concurrency/kernel.hfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -20,5 +20,5 @@
 #include "coroutine.hfa"
 
-#include "containers/stackLockFree.hfa"
+#include "containers/list.hfa"
 
 extern "C" {
@@ -99,5 +99,5 @@
 
 	// Link lists fields
-	Link(processor) link;
+	DLISTED_MGD_IMPL_IN(processor)
 
 	#if !defined(__CFA_NO_STATISTICS__)
@@ -119,5 +119,5 @@
 static inline void  ?{}(processor & this, const char name[]) { this{name, *mainCluster }; }
 
-static inline Link(processor) * ?`next( processor * this ) { return &this->link; }
+DLISTED_MGD_IMPL_OUT(processor)
 
 //-----------------------------------------------------------------------------
@@ -206,4 +206,19 @@
 void ^?{}(__ready_queue_t & this);
 
+// Idle Sleep
+struct __cluster_idles {
+	// Spin lock protecting the queue
+	volatile uint64_t lock;
+
+	// Total number of processors
+	unsigned total;
+
+	// Total number of idle processors
+	unsigned idle;
+
+	// List of idle processors
+	dlist(processor, processor) list;
+};
+
 //-----------------------------------------------------------------------------
 // Cluster
@@ -219,6 +234,5 @@
 
 	// List of idle processors
-	StackLF(processor) idles;
-	volatile unsigned int nprocessors;
+	__cluster_idles idles;
 
 	// List of threads
Index: libcfa/src/concurrency/kernel/fwd.hfa
===================================================================
--- libcfa/src/concurrency/kernel/fwd.hfa	(revision 88cafe7991e4d4b769f11c34a6c1af0ae4ecffba)
+++ libcfa/src/concurrency/kernel/fwd.hfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -50,5 +50,11 @@
 				uint64_t rand_seed;
 			#endif
+			struct {
+				uint64_t fwd_seed;
+				uint64_t bck_seed;
+			} ready_rng;
 		} kernelTLS __attribute__ ((tls_model ( "initial-exec" )));
+
+
 
 		static inline uint64_t __tls_rand() {
@@ -58,4 +64,32 @@
 				return __xorshift64( kernelTLS.rand_seed );
 			#endif
+		}
+
+		#define M  (1_l64u << 48_l64u)
+		#define A  (25214903917_l64u)
+		#define AI (18446708753438544741_l64u)
+		#define C  (11_l64u)
+		#define D  (16_l64u)
+
+		static inline unsigned __tls_rand_fwd() {
+
+			kernelTLS.ready_rng.fwd_seed = (A * kernelTLS.ready_rng.fwd_seed + C) & (M - 1);
+			return kernelTLS.ready_rng.fwd_seed >> D;
+		}
+
+		static inline unsigned __tls_rand_bck() {
+			unsigned int r = kernelTLS.ready_rng.bck_seed >> D;
+			kernelTLS.ready_rng.bck_seed = AI * (kernelTLS.ready_rng.bck_seed - C) & (M - 1);
+			return r;
+		}
+
+		#undef M
+		#undef A
+		#undef AI
+		#undef C
+		#undef D
+
+		static inline void __tls_rand_advance_bck(void) {
+			kernelTLS.ready_rng.bck_seed = kernelTLS.ready_rng.fwd_seed;
 		}
 	}
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision 88cafe7991e4d4b769f11c34a6c1af0ae4ecffba)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -78,4 +78,8 @@
 static void ?{}(processorCtx_t & this, processor * proc, current_stack_info_t * info);
 
+#if defined(__CFA_WITH_VERIFY__)
+	static bool verify_fwd_bck_rng(void);
+#endif
+
 //-----------------------------------------------------------------------------
 // Forward Declarations for other modules
@@ -87,5 +91,5 @@
 //-----------------------------------------------------------------------------
 // Other Forward Declarations
-extern bool __wake_proc(processor *);
+extern void __wake_proc(processor *);
 
 //-----------------------------------------------------------------------------
@@ -158,4 +162,6 @@
 	__cfa_dbg_global_clusters.list{ __get };
 	__cfa_dbg_global_clusters.lock{};
+
+	/* paranoid */ verify( verify_fwd_bck_rng() );
 
 	// Initialize the global scheduler lock
@@ -475,5 +481,7 @@
 	#endif
 
-	int target = __atomic_add_fetch( &cltr->nprocessors, 1u, __ATOMIC_SEQ_CST );
+	lock( this.cltr->idles );
+		int target = this.cltr->idles.total += 1u;
+	unlock( this.cltr->idles );
 
 	id = doregister((__processor_id_t*)&this);
@@ -493,6 +501,7 @@
 // Not a ctor, it just preps the destruction but should not destroy members
 static void deinit(processor & this) {
-
-	int target = __atomic_sub_fetch( &this.cltr->nprocessors, 1u, __ATOMIC_SEQ_CST );
+	lock( this.cltr->idles );
+		int target = this.cltr->idles.total -= 1u;
+	unlock( this.cltr->idles );
 
 	// Lock the RWlock so no-one pushes/pops while we are changing the queue
@@ -501,7 +510,4 @@
 		// Adjust the ready queue size
 		ready_queue_shrink( this.cltr, target );
-
-		// Make sure we aren't on the idle queue
-		unsafe_remove( this.cltr->idles, &this );
 
 	// Unlock the RWlock
@@ -516,5 +522,8 @@
 	( this.terminated ){ 0 };
 	( this.runner ){};
-	init( this, name, _cltr );
+
+	disable_interrupts();
+		init( this, name, _cltr );
+	enable_interrupts( __cfaabi_dbg_ctx );
 
 	__cfadbg_print_safe(runtime_core, "Kernel : Starting core %p\n", &this);
@@ -540,13 +549,21 @@
 	free( this.stack );
 
-	deinit( this );
+	disable_interrupts();
+		deinit( this );
+	enable_interrupts( __cfaabi_dbg_ctx );
 }
 
 //-----------------------------------------------------------------------------
 // Cluster
+static void ?{}(__cluster_idles & this) {
+	this.lock  = 0;
+	this.idle  = 0;
+	this.total = 0;
+	(this.list){};
+}
+
 void ?{}(cluster & this, const char name[], Duration preemption_rate, unsigned num_io, const io_context_params & io_params) with( this ) {
 	this.name = name;
 	this.preemption_rate = preemption_rate;
-	this.nprocessors = 0;
 	ready_queue{};
 
@@ -666,2 +683,23 @@
 	return stack;
 }
+
+#if defined(__CFA_WITH_VERIFY__)
+static bool verify_fwd_bck_rng(void) {
+	kernelTLS.ready_rng.fwd_seed = 25214903917_l64u * (rdtscl() ^ (uintptr_t)&verify_fwd_bck_rng);
+
+	unsigned values[10];
+	for(i; 10) {
+		values[i] = __tls_rand_fwd();
+	}
+
+	__tls_rand_advance_bck();
+
+	for ( i; 9 -~= 0 ) {
+		if(values[i] != __tls_rand_bck()) {
+			return false;
+		}
+	}
+
+	return true;
+}
+#endif
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 88cafe7991e4d4b769f11c34a6c1af0ae4ecffba)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -121,4 +121,22 @@
 void     unregister( struct __processor_id_t * proc );
 
+//-----------------------------------------------------------------------
+// Cluster idle lock/unlock
+static inline void lock(__cluster_idles & this) {
+	for() {
+		uint64_t l = this.lock;
+		if(
+			(0 == (l % 2))
+			&& __atomic_compare_exchange_n(&this.lock, &l, l + 1, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+		) return;
+		Pause();
+	}
+}
+
+static inline void unlock(__cluster_idles & this) {
+	/* paranoid */ verify( 1 == (this.lock % 2) );
+	__atomic_fetch_add( &this.lock, 1, __ATOMIC_SEQ_CST );
+}
+
 //=======================================================================
 // Reader-writer lock implementation
@@ -248,5 +266,12 @@
 // pop thread from the ready queue of a cluster
 // returns 0p if empty
+// May return 0p spuriously
 __attribute__((hot)) struct $thread * pop(struct cluster * cltr);
+
+//-----------------------------------------------------------------------
+// pop thread from the ready queue of a cluster
+// returns 0p if empty
+// guaranteed to find any threads added before this call
+__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr);
 
 //-----------------------------------------------------------------------
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision 88cafe7991e4d4b769f11c34a6c1af0ae4ecffba)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -17,4 +17,6 @@
 // #define __CFA_DEBUG_PRINT_READY_QUEUE__
 
+// #define USE_SNZI
+
 #include "bits/defs.hfa"
 #include "kernel_private.hfa"
@@ -148,4 +150,6 @@
 //  queues or removing them.
 uint_fast32_t ready_mutate_lock( void ) with(*__scheduler_lock) {
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+
 	// Step 1 : lock global lock
 	// It is needed to avoid processors that register mid Critical-Section
@@ -162,8 +166,11 @@
 	}
 
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
 	return s;
 }
 
 void ready_mutate_unlock( uint_fast32_t last_s ) with(*__scheduler_lock) {
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+
 	// Step 1 : release local locks
 	// This must be done while the global lock is held to avoid
@@ -180,4 +187,6 @@
 	/*paranoid*/ assert(true == lock);
 	__atomic_store_n(&lock, (bool)false, __ATOMIC_RELEASE);
+
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
 }
 
@@ -192,5 +201,7 @@
 void ^?{}(__ready_queue_t & this) with (this) {
 	verify( 1 == lanes.count );
-	verify( !query( snzi ) );
+	#ifdef USE_SNZI
+		verify( !query( snzi ) );
+	#endif
 	free(lanes.data);
 }
@@ -198,5 +209,8 @@
 //-----------------------------------------------------------------------
 __attribute__((hot)) bool query(struct cluster * cltr) {
-	return query(cltr->ready_queue.snzi);
+	#ifdef USE_SNZI
+		return query(cltr->ready_queue.snzi);
+	#endif
+	return true;
 }
 
@@ -262,12 +276,14 @@
 	bool lane_first = push(lanes.data[i], thrd);
 
-	// If this lane used to be empty we need to do more
-	if(lane_first) {
-		// Check if the entire queue used to be empty
-		first = !query(snzi);
-
-		// Update the snzi
-		arrive( snzi, i );
-	}
+	#ifdef USE_SNZI
+		// If this lane used to be empty we need to do more
+		if(lane_first) {
+			// Check if the entire queue used to be empty
+			first = !query(snzi);
+
+			// Update the snzi
+			arrive( snzi, i );
+		}
+	#endif
 
 	// Unlock and return
@@ -294,4 +310,5 @@
 __attribute__((hot)) $thread * pop(struct cluster * cltr) with (cltr->ready_queue) {
 	/* paranoid */ verify( lanes.count > 0 );
+	unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
 	#if defined(BIAS)
 		// Don't bother trying locally too much
@@ -300,5 +317,9 @@
 
 	// As long as the list is not empty, try finding a lane that isn't empty and pop from it
-	while( query(snzi) ) {
+	#ifdef USE_SNZI
+		while( query(snzi) ) {
+	#else
+		for(25) {
+	#endif
 		// Pick two lists at random
 		unsigned i,j;
@@ -336,6 +357,6 @@
 		#endif
 
-		i %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
-		j %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+		i %= count;
+		j %= count;
 
 		// try popping from the 2 picked lists
@@ -353,4 +374,21 @@
 }
 
+__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
+	/* paranoid */ verify( lanes.count > 0 );
+	unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+	unsigned offset = __tls_rand();
+	for(i; count) {
+		unsigned idx = (offset + i) % count;
+		struct $thread * thrd = try_pop(cltr, idx);
+		if(thrd) {
+			return thrd;
+		}
+	}
+
+	// All lanes were empty, return 0p
+	return 0p;
+}
+
+
 //-----------------------------------------------------------------------
 // Given 2 indexes, pick the list with the oldest push an try to pop from it
@@ -388,14 +426,15 @@
 	// Actually pop the list
 	struct $thread * thrd;
-	bool emptied;
-	[thrd, emptied] = pop(lane);
+	thrd = pop(lane);
 
 	/* paranoid */ verify(thrd);
 	/* paranoid */ verify(lane.lock);
 
-	// If this was the last element in the lane
-	if(emptied) {
-		depart( snzi, w );
-	}
+	#ifdef USE_SNZI
+		// If this was the last element in the lane
+		if(emptied) {
+			depart( snzi, w );
+		}
+	#endif
 
 	// Unlock and return
@@ -424,13 +463,14 @@
 			if(head(lane)->link.next == thrd) {
 				$thread * pthrd;
-				bool emptied;
-				[pthrd, emptied] = pop(lane);
+				pthrd = pop(lane);
 
 				/* paranoid */ verify( pthrd == thrd );
 
 				removed = true;
-				if(emptied) {
-					depart( snzi, i );
-				}
+				#ifdef USE_SNZI
+					if(emptied) {
+						depart( snzi, i );
+					}
+				#endif
 			}
 		__atomic_unlock(&lane.lock);
@@ -494,5 +534,7 @@
 	// grow the ready queue
 	with( cltr->ready_queue ) {
-		^(snzi){};
+		#ifdef USE_SNZI
+			^(snzi){};
+		#endif
 
 		// Find new count
@@ -516,11 +558,13 @@
 		lanes.count = ncount;
 
-		// Re-create the snzi
-		snzi{ log2( lanes.count / 8 ) };
-		for( idx; (size_t)lanes.count ) {
-			if( !is_empty(lanes.data[idx]) ) {
-				arrive(snzi, idx);
-			}
-		}
+		#ifdef USE_SNZI
+			// Re-create the snzi
+			snzi{ log2( lanes.count / 8 ) };
+			for( idx; (size_t)lanes.count ) {
+				if( !is_empty(lanes.data[idx]) ) {
+					arrive(snzi, idx);
+				}
+			}
+		#endif
 	}
 
@@ -542,5 +586,7 @@
 
 	with( cltr->ready_queue ) {
-		^(snzi){};
+		#ifdef USE_SNZI
+			^(snzi){};
+		#endif
 
 		// Remember old count
@@ -567,6 +613,5 @@
 			while(!is_empty(lanes.data[idx])) {
 				struct $thread * thrd;
-				__attribute__((unused)) bool _;
-				[thrd, _] = pop(lanes.data[idx]);
+				thrd = pop(lanes.data[idx]);
 
 				push(cltr, thrd);
@@ -596,11 +641,13 @@
 		}
 
-		// Re-create the snzi
-		snzi{ log2( lanes.count / 8 ) };
-		for( idx; (size_t)lanes.count ) {
-			if( !is_empty(lanes.data[idx]) ) {
-				arrive(snzi, idx);
-			}
-		}
+		#ifdef USE_SNZI
+			// Re-create the snzi
+			snzi{ log2( lanes.count / 8 ) };
+			for( idx; (size_t)lanes.count ) {
+				if( !is_empty(lanes.data[idx]) ) {
+					arrive(snzi, idx);
+				}
+			}
+		#endif
 	}
 
Index: libcfa/src/concurrency/ready_subqueue.hfa
===================================================================
--- libcfa/src/concurrency/ready_subqueue.hfa	(revision 88cafe7991e4d4b769f11c34a6c1af0ae4ecffba)
+++ libcfa/src/concurrency/ready_subqueue.hfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -144,5 +144,5 @@
 // returns popped
 // returns true of lane was empty before push, false otherwise
-[$thread *, bool] pop(__intrusive_lane_t & this) {
+$thread * pop(__intrusive_lane_t & this) {
 	/* paranoid */ verify(this.lock);
 	/* paranoid */ verify(this.before.link.ts != 0ul);
@@ -162,5 +162,6 @@
 	head->link.next = next;
 	next->link.prev = head;
-	node->link.[next, prev] = 0p;
+	node->link.next = 0p;
+	node->link.prev = 0p;
 
 	// Update head time stamp
@@ -180,5 +181,5 @@
 		/* paranoid */ verify(tail(this)->link.prev == head(this));
 		/* paranoid */ verify(head(this)->link.next == tail(this));
-		return [node, true];
+		return node;
 	}
 	else {
@@ -187,5 +188,5 @@
 		/* paranoid */ verify(head(this)->link.next != tail(this));
 		/* paranoid */ verify(this.before.link.ts != 0);
-		return [node, false];
+		return node;
 	}
 }
Index: libcfa/src/concurrency/stats.cfa
===================================================================
--- libcfa/src/concurrency/stats.cfa	(revision 88cafe7991e4d4b769f11c34a6c1af0ae4ecffba)
+++ libcfa/src/concurrency/stats.cfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -38,6 +38,6 @@
 			stats->io.submit_q.busy   = 0;
 			stats->io.complete_q.completed_avg.val = 0;
-			stats->io.complete_q.completed_avg.slow_cnt = 0;
-			stats->io.complete_q.completed_avg.fast_cnt = 0;
+			stats->io.complete_q.completed_avg.cnt = 0;
+			stats->io.complete_q.blocks = 0;
 		#endif
 	}
@@ -60,20 +60,20 @@
 
 		#if defined(CFA_HAVE_LINUX_IO_URING_H)
-			__atomic_fetch_add( &cltr->io.submit_q.submit_avg.rdy          , proc->io.submit_q.submit_avg.rdy          , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.submit_q.submit_avg.csm          , proc->io.submit_q.submit_avg.csm          , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.submit_q.submit_avg.avl          , proc->io.submit_q.submit_avg.avl          , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.submit_q.submit_avg.cnt          , proc->io.submit_q.submit_avg.cnt          , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.submit_q.look_avg.val            , proc->io.submit_q.look_avg.val            , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.submit_q.look_avg.cnt            , proc->io.submit_q.look_avg.cnt            , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.submit_q.look_avg.block          , proc->io.submit_q.look_avg.block          , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.submit_q.alloc_avg.val           , proc->io.submit_q.alloc_avg.val           , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.submit_q.alloc_avg.cnt           , proc->io.submit_q.alloc_avg.cnt           , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.submit_q.alloc_avg.block         , proc->io.submit_q.alloc_avg.block         , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.submit_q.helped                  , proc->io.submit_q.helped                  , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.submit_q.leader                  , proc->io.submit_q.leader                  , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.submit_q.busy                    , proc->io.submit_q.busy                    , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.complete_q.completed_avg.val     , proc->io.complete_q.completed_avg.val     , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.complete_q.completed_avg.slow_cnt, proc->io.complete_q.completed_avg.slow_cnt, __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.complete_q.completed_avg.fast_cnt, proc->io.complete_q.completed_avg.fast_cnt, __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.submit_avg.rdy     , proc->io.submit_q.submit_avg.rdy     , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.submit_avg.csm     , proc->io.submit_q.submit_avg.csm     , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.submit_avg.avl     , proc->io.submit_q.submit_avg.avl     , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.submit_avg.cnt     , proc->io.submit_q.submit_avg.cnt     , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.look_avg.val       , proc->io.submit_q.look_avg.val       , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.look_avg.cnt       , proc->io.submit_q.look_avg.cnt       , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.look_avg.block     , proc->io.submit_q.look_avg.block     , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.alloc_avg.val      , proc->io.submit_q.alloc_avg.val      , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.alloc_avg.cnt      , proc->io.submit_q.alloc_avg.cnt      , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.alloc_avg.block    , proc->io.submit_q.alloc_avg.block    , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.helped             , proc->io.submit_q.helped             , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.leader             , proc->io.submit_q.leader             , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.busy               , proc->io.submit_q.busy               , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.complete_q.completed_avg.val, proc->io.complete_q.completed_avg.val, __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.complete_q.completed_avg.cnt, proc->io.complete_q.completed_avg.cnt, __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.complete_q.blocks           , proc->io.complete_q.blocks           , __ATOMIC_SEQ_CST );
 		#endif
 	}
@@ -154,6 +154,7 @@
 					"- avg alloc search len   : %'18.2lf\n"
 					"- avg alloc search block : %'18.2lf\n"
-					"- total wait calls       : %'15" PRIu64 "   (%'" PRIu64 " slow, %'" PRIu64 " fast)\n"
+					"- total wait calls       : %'15" PRIu64 "\n"
 					"- avg completion/wait    : %'18.2lf\n"
+					"- total completion blocks: %'15" PRIu64 "\n"
 					"\n"
 					, cluster ? "Cluster" : "Processor",  name, id
@@ -165,7 +166,7 @@
 					, io.submit_q.alloc_avg.cnt
 					, aavgv, aavgb
-					, io.complete_q.completed_avg.slow_cnt + io.complete_q.completed_avg.fast_cnt
-					, io.complete_q.completed_avg.slow_cnt,  io.complete_q.completed_avg.fast_cnt
-					, ((double)io.complete_q.completed_avg.val) / (io.complete_q.completed_avg.slow_cnt + io.complete_q.completed_avg.fast_cnt)
+					, io.complete_q.completed_avg.cnt
+					, ((double)io.complete_q.completed_avg.val) / io.complete_q.completed_avg.cnt
+					, io.complete_q.blocks
 				);
 			}
Index: libcfa/src/concurrency/stats.hfa
===================================================================
--- libcfa/src/concurrency/stats.hfa	(revision 88cafe7991e4d4b769f11c34a6c1af0ae4ecffba)
+++ libcfa/src/concurrency/stats.hfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -90,7 +90,7 @@
 				struct {
 					volatile uint64_t val;
-					volatile uint64_t slow_cnt;
-					volatile uint64_t fast_cnt;
+					volatile uint64_t cnt;
 				} completed_avg;
+				volatile uint64_t blocks;
 			} complete_q;
 		};
