Index: libcfa/src/Makefile.am
===================================================================
--- libcfa/src/Makefile.am	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/Makefile.am	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -44,5 +44,5 @@
 
 headers = common.hfa fstream.hfa heap.hfa iostream.hfa iterator.hfa limits.hfa rational.hfa \
-		time.hfa stdlib.hfa memory.hfa \
+		time.hfa stdlib.hfa parseargs.hfa \
 		containers/maybe.hfa containers/pair.hfa containers/result.hfa containers/vector.hfa
 
Index: libcfa/src/bits/locks.hfa
===================================================================
--- libcfa/src/bits/locks.hfa	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/bits/locks.hfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -217,3 +217,37 @@
 		}
 	}
+
+	// Semaphore which only supports a single waiting thread and a single post,
+	// i.e. a one-shot synchronization primitive: one wait() paired with one post().
+	struct oneshot {
+		struct $thread * volatile ptr;
+	};
+
+	static inline {
+		void  ?{}(oneshot & this) {
+			this.ptr = 0p;
+		}
+
+		void ^?{}(oneshot & this) {}
+
+		bool wait(oneshot & this) {
+			for() {
+				struct $thread * expected = this.ptr;
+				if(expected == 1p) return false;
+				/* paranoid */ verify( expected == 0p );
+				if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+					park( __cfaabi_dbg_ctx );
+					/* paranoid */ verify( this.ptr == 1p );
+					return true;
+				}
+			}
+		}
+
+		bool post(oneshot & this) {
+			struct $thread * got = __atomic_exchange_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
+			if( got == 0p ) return false;
+			unpark( got __cfaabi_dbg_ctx2 );
+			return true;
+		}
+	}
 #endif
Index: libcfa/src/common.hfa
===================================================================
--- libcfa/src/common.hfa	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/common.hfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -10,6 +10,6 @@
 // Created On       : Wed Jul 11 17:54:36 2018
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Thu Jul 12 08:02:18 2018
-// Update Count     : 5
+// Last Modified On : Sat Aug 15 08:51:29 2020
+// Update Count     : 14
 // 
 
@@ -67,7 +67,13 @@
 
 static inline {
+	char min( char t1, char t2 ) { return t1 < t2 ? t1 : t2; } // optimization
+	intptr_t min( intptr_t t1, intptr_t t2 ) { return t1 < t2 ? t1 : t2; } // optimization
+	uintptr_t min( uintptr_t t1, uintptr_t t2 ) { return t1 < t2 ? t1 : t2; } // optimization
 	forall( otype T | { int ?<?( T, T ); } )
 	T min( T t1, T t2 ) { return t1 < t2 ? t1 : t2; }
 
+	char max( char t1, char t2 ) { return t1 > t2 ? t1 : t2; } // optimization
+	intptr_t max( intptr_t t1, intptr_t t2 ) { return t1 > t2 ? t1 : t2; } // optimization
+	uintptr_t max( uintptr_t t1, uintptr_t t2 ) { return t1 > t2 ? t1 : t2; } // optimization
 	forall( otype T | { int ?>?( T, T ); } )
 	T max( T t1, T t2 ) { return t1 > t2 ? t1 : t2; }
Index: libcfa/src/concurrency/alarm.hfa
===================================================================
--- libcfa/src/concurrency/alarm.hfa	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/concurrency/alarm.hfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -23,5 +23,5 @@
 #include "time.hfa"
 
-#include <containers/list.hfa>
+#include "containers/list.hfa"
 
 struct $thread;
Index: libcfa/src/concurrency/coroutine.cfa
===================================================================
--- libcfa/src/concurrency/coroutine.cfa	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/concurrency/coroutine.cfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -215,4 +215,8 @@
 		return cor;
 	}
+
+	struct $coroutine * __cfactx_cor_active(void) {
+		return active_coroutine();
+	}
 }
 
Index: libcfa/src/concurrency/invoke.c
===================================================================
--- libcfa/src/concurrency/invoke.c	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/concurrency/invoke.c	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -10,6 +10,6 @@
 // Created On       : Tue Jan 17 12:27:26 2016
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Thu Aug 20 18:54:34 2020
-// Update Count     : 30
+// Last Modified On : Thu Aug 20 23:43:23 2020
+// Update Count     : 31
 //
 
@@ -29,4 +29,5 @@
 // Called from the kernel when starting a coroutine or task so must switch back to user mode.
 
+extern struct $coroutine * __cfactx_cor_active(void);
 extern struct $coroutine * __cfactx_cor_finish(void);
 extern void __cfactx_cor_leave ( struct $coroutine * );
@@ -35,4 +36,8 @@
 extern void disable_interrupts() OPTIONAL_THREAD;
 extern void enable_interrupts( __cfaabi_dbg_ctx_param );
+
+struct exception_context_t * this_exception_context() {
+	return &__get_stack( __cfactx_cor_active() )->exception_context;
+}
 
 void __cfactx_invoke_coroutine(
@@ -146,5 +151,13 @@
 
 #elif defined( __ARM_ARCH_32 )
-#warning ARM needs to be upgrade to use two parameters like X86/X64 (A.K.A. : I broke this and do not know how to fix it)
+#error ARM needs to be upgraded to use two parameters like X86/X64 (A.K.A. : I broke this and do not know how to fix it)
+	// More details about the error:
+	// To avoid the thunk problem, I changed the invoke routine to pass the main explicitly
+	// instead of relying on an assertion. This effectively hoists any required thunk one level
+	// which was enough to get to global scope in most cases.
+	// This means that __cfactx_invoke_... now takes two parameters and the FakeStack needs
+	// to be adjusted as a consequence of that.
+	// I don't know how to do that for ARM, hence the #error
+
 	struct FakeStack {
 		float fpRegs[16];								// floating point registers
Index: libcfa/src/concurrency/invoke.h
===================================================================
--- libcfa/src/concurrency/invoke.h	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/concurrency/invoke.h	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -26,4 +26,11 @@
 #ifndef _INVOKE_H_
 #define _INVOKE_H_
+
+	struct __cfaehm_try_resume_node;
+	struct __cfaehm_base_exception_t;
+	struct exception_context_t {
+		struct __cfaehm_try_resume_node * top_resume;
+		struct __cfaehm_base_exception_t * current_exception;
+	};
 
 	struct __stack_context_t {
@@ -51,4 +58,7 @@
 		// base of stack
 		void * base;
+
+		// Information for exception handling.
+		struct exception_context_t exception_context;
 	};
 
@@ -84,5 +94,9 @@
 	};
 
-	static inline struct __stack_t * __get_stack( struct $coroutine * cor ) { return (struct __stack_t*)(((uintptr_t)cor->stack.storage) & ((uintptr_t)-2)); }
+	static inline struct __stack_t * __get_stack( struct $coroutine * cor ) {
+		return (struct __stack_t*)(((uintptr_t)cor->stack.storage) & ((uintptr_t)-2));
+	}
+
+	struct exception_context_t * this_exception_context();
 
 	// struct which calls the monitor is accepting
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/concurrency/io.cfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -41,4 +41,67 @@
 	#include "kernel/fwd.hfa"
 	#include "io/types.hfa"
+
+	// returns true if acquired as leader or second leader
+	static inline bool try_lock( __leaderlock_t & this ) {
+		const uintptr_t thrd = 1z | (uintptr_t)active_thread();
+		bool block;
+		disable_interrupts();
+		for() {
+			struct $thread * expected = this.value;
+			if( 1p != expected && 0p != expected ) {
+				/* paranoid */ verify( thrd != (uintptr_t)expected ); // We better not already be the next leader
+				enable_interrupts( __cfaabi_dbg_ctx );
+				return false;
+			}
+			struct $thread * desired;
+			if( 0p == expected ) {
+				// If the lock isn't locked acquire it, no need to block
+				desired = 1p;
+				block = false;
+			}
+			else {
+				// If the lock is already locked try becoming the next leader
+				desired = (struct $thread *)thrd;
+				block = true;
+			}
+			if( __atomic_compare_exchange_n(&this.value, &expected, desired, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) ) break;
+		}
+		if( block ) {
+			enable_interrupts( __cfaabi_dbg_ctx );
+			park( __cfaabi_dbg_ctx );
+			disable_interrupts();
+		}
+		return true;
+	}
+
+	static inline bool next( __leaderlock_t & this ) {
+		/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+		struct $thread * nextt;
+		for() {
+			struct $thread * expected = this.value;
+			/* paranoid */ verify( (1 & (uintptr_t)expected) == 1 ); // The lock better be locked
+
+			struct $thread * desired;
+			if( 1p == expected ) {
+				// No next leader, just unlock
+				desired = 0p;
+				nextt   = 0p;
+			}
+			else {
+				// There is a next leader, remove but keep locked
+				desired = 1p;
+				nextt   = (struct $thread *)(~1z & (uintptr_t)expected);
+			}
+			if( __atomic_compare_exchange_n(&this.value, &expected, desired, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) ) break;
+		}
+
+		if(nextt) {
+			unpark( nextt __cfaabi_dbg_ctx2 );
+			enable_interrupts( __cfaabi_dbg_ctx );
+			return true;
+		}
+		enable_interrupts( __cfaabi_dbg_ctx );
+		return false;
+	}
 
 //=============================================================================================
@@ -93,5 +156,5 @@
 //=============================================================================================
 	static unsigned __collect_submitions( struct __io_data & ring );
-	static uint32_t __release_consumed_submission( struct __io_data & ring );
+	static __u32 __release_consumed_submission( struct __io_data & ring );
 
 	static inline void process(struct io_uring_cqe & cqe ) {
@@ -100,5 +163,5 @@
 
 		data->result = cqe.res;
-		unpark( data->thrd __cfaabi_dbg_ctx2 );
+		post( data->sem );
 	}
 
@@ -136,5 +199,5 @@
 		unsigned head = *ring.completion_q.head;
 		unsigned tail = *ring.completion_q.tail;
-		const uint32_t mask = *ring.completion_q.mask;
+		const __u32 mask = *ring.completion_q.mask;
 
 		// Nothing was new return 0
@@ -143,5 +206,5 @@
 		}
 
-		uint32_t count = tail - head;
+		__u32 count = tail - head;
 		/* paranoid */ verify( count != 0 );
 		for(i; count) {
@@ -182,5 +245,5 @@
 				__STATS__( true,
 					io.complete_q.completed_avg.val += count;
-					io.complete_q.completed_avg.fast_cnt += 1;
+					io.complete_q.completed_avg.cnt += 1;
 				)
 			enable_interrupts( __cfaabi_dbg_ctx );
@@ -192,4 +255,7 @@
 			// We didn't get anything baton pass to the slow poller
 			else {
+				__STATS__( false,
+					io.complete_q.blocks += 1;
+				)
 				__cfadbg_print_safe(io_core, "Kernel I/O : Parking io poller %p\n", &this.self);
 				reset = 0;
@@ -224,5 +290,5 @@
 //
 
-	[* struct io_uring_sqe, uint32_t] __submit_alloc( struct __io_data & ring, uint64_t data ) {
+	[* struct io_uring_sqe, __u32] __submit_alloc( struct __io_data & ring, __u64 data ) {
 		/* paranoid */ verify( data != 0 );
 
@@ -230,9 +296,9 @@
 		__attribute((unused)) int len   = 0;
 		__attribute((unused)) int block = 0;
-		uint32_t cnt = *ring.submit_q.num;
-		uint32_t mask = *ring.submit_q.mask;
+		__u32 cnt = *ring.submit_q.num;
+		__u32 mask = *ring.submit_q.mask;
 
 		disable_interrupts();
-			uint32_t off = __tls_rand();
+			__u32 off = __tls_rand();
 		enable_interrupts( __cfaabi_dbg_ctx );
 
@@ -241,8 +307,8 @@
 			// Look through the list starting at some offset
 			for(i; cnt) {
-				uint64_t expected = 0;
-				uint32_t idx = (i + off) & mask;
+				__u64 expected = 0;
+				__u32 idx = (i + off) & mask;
 				struct io_uring_sqe * sqe = &ring.submit_q.sqes[idx];
-				volatile uint64_t * udata = (volatile uint64_t *)&sqe->user_data;
+				volatile __u64 * udata = &sqe->user_data;
 
 				if( *udata == expected &&
@@ -270,5 +336,5 @@
 	}
 
-	static inline uint32_t __submit_to_ready_array( struct __io_data & ring, uint32_t idx, const uint32_t mask ) {
+	static inline __u32 __submit_to_ready_array( struct __io_data & ring, __u32 idx, const __u32 mask ) {
 		/* paranoid */ verify( idx <= mask   );
 		/* paranoid */ verify( idx != -1ul32 );
@@ -277,15 +343,15 @@
 		__attribute((unused)) int len   = 0;
 		__attribute((unused)) int block = 0;
-		uint32_t ready_mask = ring.submit_q.ready_cnt - 1;
+		__u32 ready_mask = ring.submit_q.ready_cnt - 1;
 
 		disable_interrupts();
-			uint32_t off = __tls_rand();
+			__u32 off = __tls_rand();
 		enable_interrupts( __cfaabi_dbg_ctx );
 
-		uint32_t picked;
+		__u32 picked;
 		LOOKING: for() {
 			for(i; ring.submit_q.ready_cnt) {
 				picked = (i + off) & ready_mask;
-				uint32_t expected = -1ul32;
+				__u32 expected = -1ul32;
 				if( __atomic_compare_exchange_n( &ring.submit_q.ready[picked], &expected, idx, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED ) ) {
 					break LOOKING;
@@ -297,9 +363,7 @@
 
 			block++;
-			if( try_lock(ring.submit_q.lock __cfaabi_dbg_ctx2) ) {
-				__release_consumed_submission( ring );
-				unlock( ring.submit_q.lock );
-			}
-			else {
+
+			__u32 released = __release_consumed_submission( ring );
+			if( released == 0 ) {
 				yield();
 			}
@@ -316,9 +380,9 @@
 	}
 
-	void __submit( struct io_context * ctx, uint32_t idx ) __attribute__((nonnull (1))) {
+	void __submit( struct io_context * ctx, __u32 idx ) __attribute__((nonnull (1))) {
 		__io_data & ring = *ctx->thrd.ring;
 		// Get now the data we definetely need
-		volatile uint32_t * const tail = ring.submit_q.tail;
-		const uint32_t mask  = *ring.submit_q.mask;
+		volatile __u32 * const tail = ring.submit_q.tail;
+		const __u32 mask  = *ring.submit_q.mask;
 
 		// There are 2 submission schemes, check which one we are using
@@ -332,12 +396,8 @@
 		}
 		else if( ring.eager_submits ) {
-			uint32_t picked = __submit_to_ready_array( ring, idx, mask );
-
-			for() {
-				yield();
-
-				// If some one else collected our index, we are done
-				#warning ABA problem
-				if( ring.submit_q.ready[picked] != idx ) {
+			__u32 picked = __submit_to_ready_array( ring, idx, mask );
+
+			#if defined(LEADER_LOCK)
+				if( !try_lock(ring.submit_q.submit_lock) ) {
 					__STATS__( false,
 						io.submit_q.helped += 1;
@@ -345,26 +405,48 @@
 					return;
 				}
-
-				if( try_lock(ring.submit_q.lock __cfaabi_dbg_ctx2) ) {
+				/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+				__STATS__( true,
+					io.submit_q.leader += 1;
+				)
+			#else
+				for() {
+					yield();
+
+					if( try_lock(ring.submit_q.submit_lock __cfaabi_dbg_ctx2) ) {
+						__STATS__( false,
+							io.submit_q.leader += 1;
+						)
+						break;
+					}
+
+					// If some one else collected our index, we are done
+					#warning ABA problem
+					if( ring.submit_q.ready[picked] != idx ) {
+						__STATS__( false,
+							io.submit_q.helped += 1;
+						)
+						return;
+					}
+
 					__STATS__( false,
-						io.submit_q.leader += 1;
+						io.submit_q.busy += 1;
 					)
-					break;
-				}
-
-				__STATS__( false,
-					io.submit_q.busy += 1;
-				)
-			}
+				}
+			#endif
 
 			// We got the lock
+			// Collect the submissions
 			unsigned to_submit = __collect_submitions( ring );
+
+			// Actually submit
 			int ret = __io_uring_enter( ring, to_submit, false );
-			if( ret < 0 ) {
-				unlock(ring.submit_q.lock);
-				return;
-			}
-
-			/* paranoid */ verify( ret > 0 || to_submit == 0 || (ring.ring_flags & IORING_SETUP_SQPOLL) );
+
+			#if defined(LEADER_LOCK)
+				/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+				next(ring.submit_q.submit_lock);
+			#else
+				unlock(ring.submit_q.submit_lock);
+			#endif
+			if( ret < 0 ) return;
 
 			// Release the consumed SQEs
@@ -372,15 +454,17 @@
 
 			// update statistics
-			__STATS__( true,
+			__STATS__( false,
 				io.submit_q.submit_avg.rdy += to_submit;
 				io.submit_q.submit_avg.csm += ret;
 				io.submit_q.submit_avg.cnt += 1;
 			)
-
-			unlock(ring.submit_q.lock);
 		}
 		else {
 			// get mutual exclusion
-			lock(ring.submit_q.lock __cfaabi_dbg_ctx2);
+			#if defined(LEADER_LOCK)
+				while(!try_lock(ring.submit_q.submit_lock));
+			#else
+				lock(ring.submit_q.submit_lock __cfaabi_dbg_ctx2);
+			#endif
 
 			/* paranoid */ verifyf( ring.submit_q.sqes[ idx ].user_data != 0,
@@ -420,5 +504,9 @@
 			__release_consumed_submission( ring );
 
-			unlock(ring.submit_q.lock);
+			#if defined(LEADER_LOCK)
+				next(ring.submit_q.submit_lock);
+			#else
+				unlock(ring.submit_q.submit_lock);
+			#endif
 
 			__cfadbg_print_safe( io, "Kernel I/O : Performed io_submit for %p, returned %d\n", active_thread(), ret );
@@ -426,4 +514,5 @@
 	}
 
+	// #define PARTIAL_SUBMIT 32
 	static unsigned __collect_submitions( struct __io_data & ring ) {
 		/* paranoid */ verify( ring.submit_q.ready != 0p );
@@ -431,11 +520,24 @@
 
 		unsigned to_submit = 0;
-		uint32_t tail = *ring.submit_q.tail;
-		const uint32_t mask = *ring.submit_q.mask;
+		__u32 tail = *ring.submit_q.tail;
+		const __u32 mask = *ring.submit_q.mask;
+		#if defined(PARTIAL_SUBMIT)
+			#if defined(LEADER_LOCK)
+				#error PARTIAL_SUBMIT and LEADER_LOCK cannot co-exist
+			#endif
+			const __u32 cnt = ring.submit_q.ready_cnt > PARTIAL_SUBMIT ? PARTIAL_SUBMIT : ring.submit_q.ready_cnt;
+			const __u32 offset = ring.submit_q.prev_ready;
+			ring.submit_q.prev_ready += cnt;
+		#else
+			const __u32 cnt = ring.submit_q.ready_cnt;
+			const __u32 offset = 0;
+		#endif
 
 		// Go through the list of ready submissions
-		for( i; ring.submit_q.ready_cnt ) {
+		for( c; cnt ) {
+			__u32 i = (offset + c) % ring.submit_q.ready_cnt;
+
 			// replace any submission with the sentinel, to consume it.
-			uint32_t idx = __atomic_exchange_n( &ring.submit_q.ready[i], -1ul32, __ATOMIC_RELAXED);
+			__u32 idx = __atomic_exchange_n( &ring.submit_q.ready[i], -1ul32, __ATOMIC_RELAXED);
 
 			// If it was already the sentinel, then we are done
@@ -453,16 +555,16 @@
 	}
 
-	static uint32_t __release_consumed_submission( struct __io_data & ring ) {
-		const uint32_t smask = *ring.submit_q.mask;
+	static __u32 __release_consumed_submission( struct __io_data & ring ) {
+		const __u32 smask = *ring.submit_q.mask;
 
 		if( !try_lock(ring.submit_q.release_lock __cfaabi_dbg_ctx2) ) return 0;
-		uint32_t chead = *ring.submit_q.head;
-		uint32_t phead = ring.submit_q.prev_head;
+		__u32 chead = *ring.submit_q.head;
+		__u32 phead = ring.submit_q.prev_head;
 		ring.submit_q.prev_head = chead;
 		unlock(ring.submit_q.release_lock);
 
-		uint32_t count = chead - phead;
+		__u32 count = chead - phead;
 		for( i; count ) {
-			uint32_t idx = ring.submit_q.array[ (phead + i) & smask ];
+			__u32 idx = ring.submit_q.array[ (phead + i) & smask ];
 			ring.submit_q.sqes[ idx ].user_data = 0;
 		}
Index: libcfa/src/concurrency/io/setup.cfa
===================================================================
--- libcfa/src/concurrency/io/setup.cfa	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/concurrency/io/setup.cfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -228,5 +228,5 @@
 		if( cluster_context ) {
 			cluster & cltr = *thrd.curr_cluster;
-			/* paranoid */ verify( cltr.nprocessors == 0 || &cltr == mainCluster );
+			/* paranoid */ verify( cltr.idles.total == 0 || &cltr == mainCluster );
 			/* paranoid */ verify( !ready_mutate_islocked() );
 
@@ -298,5 +298,11 @@
 		if( params_in.poll_complete ) params.flags |= IORING_SETUP_IOPOLL;
 
-		uint32_t nentries = params_in.num_entries;
+		__u32 nentries = params_in.num_entries != 0 ? params_in.num_entries : 256;
+		if( !is_pow2(nentries) ) {
+			abort("ERROR: I/O setup 'num_entries' must be a power of 2\n");
+		}
+		if( params_in.poller_submits && params_in.eager_submits ) {
+			abort("ERROR: I/O setup 'poller_submits' and 'eager_submits' cannot be used together\n");
+		}
 
 		int fd = syscall(__NR_io_uring_setup, nentries, &params );
@@ -356,15 +362,15 @@
 		// Get the pointers from the kernel to fill the structure
 		// submit queue
-		sq.head    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.head);
-		sq.tail    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.tail);
-		sq.mask    = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_mask);
-		sq.num     = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_entries);
-		sq.flags   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.flags);
-		sq.dropped = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.dropped);
-		sq.array   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);
+		sq.head    = (volatile __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.head);
+		sq.tail    = (volatile __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.tail);
+		sq.mask    = (   const __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_mask);
+		sq.num     = (   const __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_entries);
+		sq.flags   = (         __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.flags);
+		sq.dropped = (         __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.dropped);
+		sq.array   = (         __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);
 		sq.prev_head = *sq.head;
 
 		{
-			const uint32_t num = *sq.num;
+			const __u32 num = *sq.num;
 			for( i; num ) {
 				sq.sqes[i].user_data = 0ul64;
@@ -372,5 +378,5 @@
 		}
 
-		(sq.lock){};
+		(sq.submit_lock){};
 		(sq.release_lock){};
 
@@ -382,17 +388,19 @@
 				sq.ready[i] = -1ul32;
 			}
+			sq.prev_ready = 0;
 		}
 		else {
 			sq.ready_cnt = 0;
 			sq.ready = 0p;
+			sq.prev_ready = 0;
 		}
 
 		// completion queue
-		cq.head     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.head);
-		cq.tail     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.tail);
-		cq.mask     = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_mask);
-		cq.num      = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_entries);
-		cq.overflow = (         uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.overflow);
-		cq.cqes   = (struct io_uring_cqe *)(((intptr_t)cq.ring_ptr) + params.cq_off.cqes);
+		cq.head      = (volatile __u32 *)(((intptr_t)cq.ring_ptr) + params.cq_off.head);
+		cq.tail      = (volatile __u32 *)(((intptr_t)cq.ring_ptr) + params.cq_off.tail);
+		cq.mask      = (   const __u32 *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_mask);
+		cq.num       = (   const __u32 *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_entries);
+		cq.overflow  = (         __u32 *)(((intptr_t)cq.ring_ptr) + params.cq_off.overflow);
+		cq.cqes = (struct io_uring_cqe *)(((intptr_t)cq.ring_ptr) + params.cq_off.cqes);
 
 		// some paranoid checks
@@ -442,5 +450,5 @@
 	void __ioctx_register($io_ctx_thread & ctx, struct epoll_event & ev) {
 		ev.events = EPOLLIN | EPOLLONESHOT;
-		ev.data.u64 = (uint64_t)&ctx;
+		ev.data.u64 = (__u64)&ctx;
 		int ret = epoll_ctl(iopoll.epollfd, EPOLL_CTL_ADD, ctx.ring->fd, &ev);
 		if (ret < 0) {
Index: libcfa/src/concurrency/io/types.hfa
===================================================================
--- libcfa/src/concurrency/io/types.hfa	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/concurrency/io/types.hfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -17,5 +17,16 @@
 
 #if defined(CFA_HAVE_LINUX_IO_URING_H)
+	extern "C" {
+		#include <linux/types.h>
+	}
+
       #include "bits/locks.hfa"
+
+	#define LEADER_LOCK
+	struct __leaderlock_t {
+		struct $thread * volatile value;	// ($thread) next_leader | (bool:1) is_locked
+	};
+
+	static inline void ?{}( __leaderlock_t & this ) { this.value = 0p; }
 
 	//-----------------------------------------------------------------------
@@ -23,28 +34,33 @@
       struct __submition_data {
 		// Head and tail of the ring (associated with array)
-		volatile uint32_t * head;
-		volatile uint32_t * tail;
-		volatile uint32_t prev_head;
+		volatile __u32 * head;
+		volatile __u32 * tail;
+		volatile __u32 prev_head;
 
 		// The actual kernel ring which uses head/tail
 		// indexes into the sqes arrays
-		uint32_t * array;
+		__u32 * array;
 
 		// number of entries and mask to go with it
-		const uint32_t * num;
-		const uint32_t * mask;
+		const __u32 * num;
+		const __u32 * mask;
 
 		// Submission flags (Not sure what for)
-		uint32_t * flags;
+		__u32 * flags;
 
 		// number of sqes not submitted (whatever that means)
-		uint32_t * dropped;
+		__u32 * dropped;
 
 		// Like head/tail but not seen by the kernel
-		volatile uint32_t * ready;
-		uint32_t ready_cnt;
+		volatile __u32 * ready;
+		__u32 ready_cnt;
+		__u32 prev_ready;
 
-		__spinlock_t lock;
-		__spinlock_t release_lock;
+		#if defined(LEADER_LOCK)
+			__leaderlock_t submit_lock;
+		#else
+			__spinlock_t submit_lock;
+		#endif
+		__spinlock_t  release_lock;
 
 		// A buffer of sqes (not the actual ring)
@@ -58,13 +74,13 @@
 	struct __completion_data {
 		// Head and tail of the ring
-		volatile uint32_t * head;
-		volatile uint32_t * tail;
+		volatile __u32 * head;
+		volatile __u32 * tail;
 
 		// number of entries and mask to go with it
-		const uint32_t * mask;
-		const uint32_t * num;
+		const __u32 * mask;
+		const __u32 * num;
 
 		// number of cqes not submitted (whatever that means)
-		uint32_t * overflow;
+		__u32 * overflow;
 
 		// the kernel ring
@@ -79,5 +95,5 @@
 		struct __submition_data submit_q;
 		struct __completion_data completion_q;
-		uint32_t ring_flags;
+		__u32 ring_flags;
 		int fd;
 		bool eager_submits:1;
@@ -89,6 +105,6 @@
 	// IO user data
 	struct __io_user_data_t {
-		int32_t result;
-		$thread * thrd;
+		__s32 result;
+		oneshot sem;
 	};
 
Index: libcfa/src/concurrency/iocall.cfa
===================================================================
--- libcfa/src/concurrency/iocall.cfa	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/concurrency/iocall.cfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -32,8 +32,8 @@
 	#include "io/types.hfa"
 
-	extern [* struct io_uring_sqe, uint32_t] __submit_alloc( struct __io_data & ring, uint64_t data );
-	extern void __submit( struct io_context * ctx, uint32_t idx ) __attribute__((nonnull (1)));
-
-	static inline void ?{}(struct io_uring_sqe & this, uint8_t opcode, int fd) {
+	extern [* struct io_uring_sqe, __u32] __submit_alloc( struct __io_data & ring, __u64 data );
+	extern void __submit( struct io_context * ctx, __u32 idx ) __attribute__((nonnull (1)));
+
+	static inline void ?{}(struct io_uring_sqe & this, __u8 opcode, int fd) {
 		this.opcode = opcode;
 		#if !defined(IOSQE_ASYNC)
@@ -51,8 +51,8 @@
 	}
 
-	static inline void ?{}(struct io_uring_sqe & this, uint8_t opcode, int fd, void * addr, uint32_t len, uint64_t off ) {
+	static inline void ?{}(struct io_uring_sqe & this, __u8 opcode, int fd, void * addr, __u32 len, __u64 off ) {
 		(this){ opcode, fd };
 		this.off = off;
-		this.addr = (uint64_t)(uintptr_t)addr;
+		this.addr = (__u64)(uintptr_t)addr;
 		this.len = len;
 	}
@@ -101,21 +101,21 @@
 	#endif
 
-
 	#define __submit_prelude \
 		if( 0 != (submit_flags & LINK_FLAGS) ) { errno = ENOTSUP; return -1; } \
 		(void)timeout; (void)cancellation; \
 		if( !context ) context = __get_io_context(); \
-		__io_user_data_t data = { 0, active_thread() }; \
+		__io_user_data_t data = { 0 }; \
 		struct __io_data & ring = *context->thrd.ring; \
 		struct io_uring_sqe * sqe; \
-		uint32_t idx; \
-		[sqe, idx] = __submit_alloc( ring, (uint64_t)(uintptr_t)&data ); \
-		sqe->flags = REGULAR_FLAGS & submit_flags;
+		__u32 idx; \
+		__u8 sflags = REGULAR_FLAGS & submit_flags; \
+		[sqe, idx] = __submit_alloc( ring, (__u64)(uintptr_t)&data ); \
+		sqe->flags = sflags;
 
 	#define __submit_wait \
 		/*__cfaabi_bits_print_safe( STDERR_FILENO, "Preparing user data %p for %p\n", &data, data.thrd );*/ \
-		verify( sqe->user_data == (uint64_t)(uintptr_t)&data ); \
+		verify( sqe->user_data == (__u64)(uintptr_t)&data ); \
 		__submit( context, idx ); \
-		park( __cfaabi_dbg_ctx ); \
+		wait( data.sem ); \
 		if( data.result < 0 ) { \
 			errno = -data.result; \
@@ -149,5 +149,12 @@
 
 	extern int fsync(int fd);
-	extern int sync_file_range(int fd, int64_t offset, int64_t nbytes, unsigned int flags);
+
+	#if __OFF_T_MATCHES_OFF64_T
+		typedef __off64_t off_t;
+	#else
+		typedef __off_t off_t;
+	#endif
+	typedef __off64_t off64_t;
+	extern int sync_file_range(int fd, off64_t offset, off64_t nbytes, unsigned int flags);
 
 	struct msghdr;
@@ -160,6 +167,6 @@
 	extern int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
 
-	extern int fallocate(int fd, int mode, uint64_t offset, uint64_t len);
-	extern int posix_fadvise(int fd, uint64_t offset, uint64_t len, int advice);
+	extern int fallocate(int fd, int mode, off_t offset, off_t len);
+	extern int posix_fadvise(int fd, off_t offset, off_t len, int advice);
 	extern int madvise(void *addr, size_t length, int advice);
 
@@ -186,5 +193,12 @@
 			__submit_prelude
 
-			(*sqe){ IORING_OP_READV, fd, iov, iovcnt, offset };
+			sqe->opcode = IORING_OP_READV;
+			sqe->ioprio = 0;
+			sqe->fd = fd;
+			sqe->off = offset;
+			sqe->addr = (__u64)iov;
+			sqe->len = iovcnt;
+			sqe->rw_flags = 0;
+			sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
 
 			__submit_wait
@@ -200,5 +214,12 @@
 			__submit_prelude
 
-			(*sqe){ IORING_OP_WRITEV, fd, iov, iovcnt, offset };
+			sqe->opcode = IORING_OP_WRITEV;
+			sqe->ioprio = 0;
+			sqe->fd = fd;
+			sqe->off = offset;
+			sqe->addr = (__u64)iov;
+			sqe->len = iovcnt;
+			sqe->rw_flags = 0;
+			sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
 
 			__submit_wait
@@ -213,11 +234,18 @@
 		__submit_prelude
 
-		(*sqe){ IORING_OP_FSYNC, fd };
-
-		__submit_wait
-	#endif
-}
-
-int cfa_sync_file_range(int fd, int64_t offset, int64_t nbytes, unsigned int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
+		sqe->opcode = IORING_OP_FSYNC;
+		sqe->ioprio = 0;
+		sqe->fd = fd;
+		sqe->off = 0;
+		sqe->addr = 0;
+		sqe->len = 0;
+		sqe->rw_flags = 0;
+		sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
+
+		__submit_wait
+	#endif
+}
+
+int cfa_sync_file_range(int fd, off64_t offset, off64_t nbytes, unsigned int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_SYNC_FILE_RANGE)
 		return sync_file_range(fd, offset, nbytes, flags);
@@ -268,5 +296,5 @@
 
 		(*sqe){ IORING_OP_SEND, sockfd };
-		sqe->addr = (uint64_t)buf;
+		sqe->addr = (__u64)buf;
 		sqe->len = len;
 		sqe->msg_flags = flags;
@@ -283,5 +311,5 @@
 
 		(*sqe){ IORING_OP_RECV, sockfd };
-		sqe->addr = (uint64_t)buf;
+		sqe->addr = (__u64)buf;
 		sqe->len = len;
 		sqe->msg_flags = flags;
@@ -298,6 +326,6 @@
 
 		(*sqe){ IORING_OP_ACCEPT, sockfd };
-		sqe->addr = (uint64_t)(uintptr_t)addr;
-		sqe->addr2 = (uint64_t)(uintptr_t)addrlen;
+		sqe->addr  = (__u64)addr;
+		sqe->addr2 = (__u64)addrlen;
 		sqe->accept_flags = flags;
 
@@ -313,12 +341,12 @@
 
 		(*sqe){ IORING_OP_CONNECT, sockfd };
-		sqe->addr = (uint64_t)(uintptr_t)addr;
-		sqe->off  = (uint64_t)(uintptr_t)addrlen;
-
-		__submit_wait
-	#endif
-}
-
-int cfa_fallocate(int fd, int mode, uint64_t offset, uint64_t len, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
+		sqe->addr = (__u64)addr;
+		sqe->off  = (__u64)addrlen;
+
+		__submit_wait
+	#endif
+}
+
+int cfa_fallocate(int fd, int mode, off_t offset, off_t len, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_FALLOCATE)
 		return fallocate( fd, mode, offset, len );
@@ -337,5 +365,5 @@
 }
 
-int cfa_fadvise(int fd, uint64_t offset, uint64_t len, int advice, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
+int cfa_fadvise(int fd, off_t offset, off_t len, int advice, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_FADVISE)
 		return posix_fadvise( fd, offset, len, advice );
@@ -344,5 +372,5 @@
 
 		(*sqe){ IORING_OP_FADVISE, fd };
-		sqe->off = (uint64_t)offset;
+		sqe->off = (__u64)offset;
 		sqe->len = len;
 		sqe->fadvise_advice = advice;
@@ -359,5 +387,5 @@
 
 		(*sqe){ IORING_OP_MADVISE, 0 };
-		sqe->addr = (uint64_t)addr;
+		sqe->addr = (__u64)addr;
 		sqe->len = length;
 		sqe->fadvise_advice = advice;
@@ -374,5 +402,5 @@
 
 		(*sqe){ IORING_OP_OPENAT, dirfd };
-		sqe->addr = (uint64_t)pathname;
+		sqe->addr = (__u64)pathname;
 		sqe->open_flags = flags;
 		sqe->len = mode;
@@ -407,5 +435,5 @@
 		__submit_prelude
 
-		(*sqe){ IORING_OP_STATX, dirfd, pathname, mask, (uint64_t)statxbuf };
+		(*sqe){ IORING_OP_STATX, dirfd, pathname, mask, (__u64)statxbuf };
 		sqe->statx_flags = flags;
 
@@ -449,5 +477,5 @@
 		}
 		else {
-			sqe->off = (uint64_t)-1;
+			sqe->off = (__u64)-1;
 		}
 		sqe->len = len;
@@ -457,5 +485,5 @@
 		}
 		else {
-			sqe->splice_off_in = (uint64_t)-1;
+			sqe->splice_off_in = (__u64)-1;
 		}
 		sqe->splice_flags  = flags | (SPLICE_FLAGS & submit_flags);
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/concurrency/kernel.cfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -102,9 +102,12 @@
 // Kernel Scheduling logic
 static $thread * __next_thread(cluster * this);
-static bool __has_next_thread(cluster * this);
+static $thread * __next_thread_slow(cluster * this);
 static void __run_thread(processor * this, $thread * dst);
-static bool __wake_one(struct __processor_id_t * id, cluster * cltr);
-static void __halt(processor * this);
-bool __wake_proc(processor *);
+static void __wake_one(struct __processor_id_t * id, cluster * cltr);
+
+static void push  (__cluster_idles & idles, processor & proc);
+static void remove(__cluster_idles & idles, processor & proc);
+static [unsigned idle, unsigned total, * processor] query( & __cluster_idles idles );
+
 
 //=============================================================================================
@@ -116,4 +119,6 @@
 	// Do it here
 	kernelTLS.rand_seed ^= rdtscl();
+	kernelTLS.ready_rng.fwd_seed = 25214903917_l64u * (rdtscl() ^ (uintptr_t)&runner);
+	__tls_rand_advance_bck();
 
 	processor * this = runner.proc;
@@ -134,27 +139,67 @@
 
 		$thread * readyThread = 0p;
-		for( unsigned int spin_count = 0;; spin_count++ ) {
+		MAIN_LOOP:
+		for() {
 			// Try to get the next thread
 			readyThread = __next_thread( this->cltr );
 
-			// Check if we actually found a thread
-			if( readyThread ) {
-				/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-				/* paranoid */ verifyf( readyThread->state == Ready || readyThread->preempted != __NO_PREEMPTION, "state : %d, preempted %d\n", readyThread->state, readyThread->preempted);
-				/* paranoid */ verifyf( readyThread->link.next == 0p, "Expected null got %p", readyThread->link.next );
-				__builtin_prefetch( readyThread->context.SP );
-
-				// We found a thread run it
-				__run_thread(this, readyThread);
-
-				/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+			if( !readyThread ) {
+				readyThread = __next_thread_slow( this->cltr );
 			}
 
-			if(__atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST)) break;
-
+			HALT:
 			if( !readyThread ) {
-				// Block until a thread is ready
-				__halt(this);
+				// Don't block if we are done
+				if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) break MAIN_LOOP;
+
+				#if !defined(__CFA_NO_STATISTICS__)
+					__tls_stats()->ready.sleep.halts++;
+				#endif
+
+				// Push self to idle stack
+				push(this->cltr->idles, * this);
+
+				// Confirm the ready-queue is empty
+				readyThread = __next_thread_slow( this->cltr );
+				if( readyThread ) {
+					// A thread was found, cancel the halt
+					remove(this->cltr->idles, * this);
+
+					#if !defined(__CFA_NO_STATISTICS__)
+						__tls_stats()->ready.sleep.cancels++;
+					#endif
+
+					// continue the main loop
+					break HALT;
+				}
+
+				#if !defined(__CFA_NO_STATISTICS__)
+					if(this->print_halts) {
+						__cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->id, rdtscl());
+					}
+				#endif
+
+				wait( this->idle );
+
+				#if !defined(__CFA_NO_STATISTICS__)
+					if(this->print_halts) {
+						__cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->id, rdtscl());
+					}
+				#endif
+
+				// We were woken up, remove self from idle
+				remove(this->cltr->idles, * this);
+
+				// DON'T just proceed, start looking again
+				continue MAIN_LOOP;
 			}
+
+			/* paranoid */ verify( readyThread );
+
+			// We found a thread run it
+			__run_thread(this, readyThread);
+
+			// Are we done?
+			if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) break MAIN_LOOP;
 		}
 
@@ -181,4 +226,9 @@
 // from the processor coroutine to the target thread
 static void __run_thread(processor * this, $thread * thrd_dst) {
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verifyf( thrd_dst->state == Ready || thrd_dst->preempted != __NO_PREEMPTION, "state : %d, preempted %d\n", thrd_dst->state, thrd_dst->preempted);
+	/* paranoid */ verifyf( thrd_dst->link.next == 0p, "Expected null got %p", thrd_dst->link.next );
+	__builtin_prefetch( thrd_dst->context.SP );
+
 	$coroutine * proc_cor = get_coroutine(this->runner);
 
@@ -260,4 +310,6 @@
 	proc_cor->state = Active;
 	kernelTLS.this_thread = 0p;
+
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
 }
 
@@ -316,13 +368,5 @@
 	ready_schedule_lock  ( id );
 		push( thrd->curr_cluster, thrd );
-
-		#if !defined(__CFA_NO_STATISTICS__)
-			bool woke =
-		#endif
-			__wake_one(id, thrd->curr_cluster);
-
-		#if !defined(__CFA_NO_STATISTICS__)
-			if(woke) __tls_stats()->ready.sleep.wakes++;
-		#endif
+		__wake_one(id, thrd->curr_cluster);
 	ready_schedule_unlock( id );
 
@@ -331,25 +375,25 @@
 
 // KERNEL ONLY
-static $thread * __next_thread(cluster * this) with( *this ) {
+static inline $thread * __next_thread(cluster * this) with( *this ) {
 	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
 
 	ready_schedule_lock  ( (__processor_id_t*)kernelTLS.this_processor );
-		$thread * head = pop( this );
+		$thread * thrd = pop( this );
 	ready_schedule_unlock( (__processor_id_t*)kernelTLS.this_processor );
 
 	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-	return head;
+	return thrd;
 }
 
 // KERNEL ONLY
-static bool __has_next_thread(cluster * this) with( *this ) {
+static inline $thread * __next_thread_slow(cluster * this) with( *this ) {
 	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
 
 	ready_schedule_lock  ( (__processor_id_t*)kernelTLS.this_processor );
-		bool not_empty = query( this );
+		$thread * thrd = pop_slow( this );
 	ready_schedule_unlock( (__processor_id_t*)kernelTLS.this_processor );
 
 	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-	return not_empty;
+	return thrd;
 }
 
@@ -441,21 +485,32 @@
 //=============================================================================================
 // Wake a thread from the front if there are any
-static bool __wake_one(struct __processor_id_t * id, cluster * this) {
+static void __wake_one(struct __processor_id_t * id, cluster * this) {
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
 	/* paranoid */ verify( ready_schedule_islocked( id ) );
 
 	// Check if there is a sleeping processor
-	processor * p = pop(this->idles);
+	processor * p;
+	unsigned idle;
+	unsigned total;
+	[idle, total, p] = query(this->idles);
 
 	// If no one is sleeping, we are done
-	if( 0p == p ) return false;
+	if( idle == 0 ) return;
 
 	// We found a processor, wake it up
 	post( p->idle );
 
-	return true;
+	#if !defined(__CFA_NO_STATISTICS__)
+		__tls_stats()->ready.sleep.wakes++;
+	#endif
+
+	/* paranoid */ verify( ready_schedule_islocked( id ) );
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+
+	return;
 }
 
 // Unconditionnaly wake a thread
-bool __wake_proc(processor * this) {
+void __wake_proc(processor * this) {
 	__cfadbg_print_safe(runtime_core, "Kernel : waking Processor %p\n", this);
 
@@ -464,42 +519,40 @@
 		bool ret = post( this->idle );
 	enable_interrupts( __cfaabi_dbg_ctx );
-
-	return ret;
-}
-
-static void __halt(processor * this) with( *this ) {
-	if( do_terminate ) return;
-
-	#if !defined(__CFA_NO_STATISTICS__)
-		__tls_stats()->ready.sleep.halts++;
-	#endif
-	// Push self to queue
-	push(cltr->idles, *this);
-
-	// Makre sure we don't miss a thread
-	if( __has_next_thread(cltr) ) {
-		// A thread was posted, make sure a processor is woken up
-		struct __processor_id_t *id = (struct __processor_id_t *) this;
-		ready_schedule_lock  ( id );
-			__wake_one( id, cltr );
-		ready_schedule_unlock( id );
-		#if !defined(__CFA_NO_STATISTICS__)
-			__tls_stats()->ready.sleep.cancels++;
-		#endif
-	}
-
-	#if !defined(__CFA_NO_STATISTICS__)
-		if(this->print_halts) {
-			__cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->id, rdtscl());
-		}
-	#endif
-
-	wait( idle );
-
-	#if !defined(__CFA_NO_STATISTICS__)
-		if(this->print_halts) {
-			__cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->id, rdtscl());
-		}
-	#endif
+}
+
+static void push  (__cluster_idles & this, processor & proc) {
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	lock( this );
+		this.idle++;
+		/* paranoid */ verify( this.idle <= this.total );
+
+		insert_first(this.list, proc);
+	unlock( this );
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+}
+
+static void remove(__cluster_idles & this, processor & proc) {
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	lock( this );
+		this.idle--;
+		/* paranoid */ verify( this.idle >= 0 );
+
+		remove(proc);
+	unlock( this );
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+}
+
+static [unsigned idle, unsigned total, * processor] query( & __cluster_idles this ) {
+	for() {
+		uint64_t l = __atomic_load_n(&this.lock, __ATOMIC_SEQ_CST);
+		if( 1 == (l % 2) ) { Pause(); continue; }
+		unsigned idle    = this.idle;
+		unsigned total   = this.total;
+		processor * proc = &this.list`first;
+		// Compiler fence is unnecessary, but gcc-8 and older incorrectly reorder code without it
+		asm volatile("": : :"memory");
+		if(l != __atomic_load_n(&this.lock, __ATOMIC_SEQ_CST)) { Pause(); continue; }
+		return [idle, total, proc];
+	}
 }
 
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/concurrency/kernel.hfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -20,5 +20,5 @@
 #include "coroutine.hfa"
 
-#include "containers/stackLockFree.hfa"
+#include "containers/list.hfa"
 
 extern "C" {
@@ -99,5 +99,5 @@
 
 	// Link lists fields
-	Link(processor) link;
+	DLISTED_MGD_IMPL_IN(processor)
 
 	#if !defined(__CFA_NO_STATISTICS__)
@@ -119,5 +119,5 @@
 static inline void  ?{}(processor & this, const char name[]) { this{name, *mainCluster }; }
 
-static inline Link(processor) * ?`next( processor * this ) { return &this->link; }
+DLISTED_MGD_IMPL_OUT(processor)
 
 //-----------------------------------------------------------------------------
@@ -206,4 +206,19 @@
 void ^?{}(__ready_queue_t & this);
 
+// Idle Sleep
+struct __cluster_idles {
+	// Spin lock protecting the queue
+	volatile uint64_t lock;
+
+	// Total number of processors
+	unsigned total;
+
+	// Total number of idle processors
+	unsigned idle;
+
+	// List of idle processors
+	dlist(processor, processor) list;
+};
+
 //-----------------------------------------------------------------------------
 // Cluster
@@ -219,6 +234,5 @@
 
 	// List of idle processors
-	StackLF(processor) idles;
-	volatile unsigned int nprocessors;
+	__cluster_idles idles;
 
 	// List of threads
Index: libcfa/src/concurrency/kernel/fwd.hfa
===================================================================
--- libcfa/src/concurrency/kernel/fwd.hfa	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/concurrency/kernel/fwd.hfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -50,5 +50,11 @@
 				uint64_t rand_seed;
 			#endif
+			struct {
+				uint64_t fwd_seed;
+				uint64_t bck_seed;
+			} ready_rng;
 		} kernelTLS __attribute__ ((tls_model ( "initial-exec" )));
+
+
 
 		static inline uint64_t __tls_rand() {
@@ -58,4 +64,32 @@
 				return __xorshift64( kernelTLS.rand_seed );
 			#endif
+		}
+
+		#define M  (1_l64u << 48_l64u)
+		#define A  (25214903917_l64u)
+		#define AI (18446708753438544741_l64u)
+		#define C  (11_l64u)
+		#define D  (16_l64u)
+
+		static inline unsigned __tls_rand_fwd() {
+
+			kernelTLS.ready_rng.fwd_seed = (A * kernelTLS.ready_rng.fwd_seed + C) & (M - 1);
+			return kernelTLS.ready_rng.fwd_seed >> D;
+		}
+
+		static inline unsigned __tls_rand_bck() {
+			unsigned int r = kernelTLS.ready_rng.bck_seed >> D;
+			kernelTLS.ready_rng.bck_seed = AI * (kernelTLS.ready_rng.bck_seed - C) & (M - 1);
+			return r;
+		}
+
+		#undef M
+		#undef A
+		#undef AI
+		#undef C
+		#undef D
+
+		static inline void __tls_rand_advance_bck(void) {
+			kernelTLS.ready_rng.bck_seed = kernelTLS.ready_rng.fwd_seed;
 		}
 	}
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -78,4 +78,8 @@
 static void ?{}(processorCtx_t & this, processor * proc, current_stack_info_t * info);
 
+#if defined(__CFA_WITH_VERIFY__)
+	static bool verify_fwd_bck_rng(void);
+#endif
+
 //-----------------------------------------------------------------------------
 // Forward Declarations for other modules
@@ -87,5 +91,5 @@
 //-----------------------------------------------------------------------------
 // Other Forward Declarations
-extern bool __wake_proc(processor *);
+extern void __wake_proc(processor *);
 
 //-----------------------------------------------------------------------------
@@ -158,4 +162,6 @@
 	__cfa_dbg_global_clusters.list{ __get };
 	__cfa_dbg_global_clusters.lock{};
+
+	/* paranoid */ verify( verify_fwd_bck_rng() );
 
 	// Initialize the global scheduler lock
@@ -475,5 +481,7 @@
 	#endif
 
-	int target = __atomic_add_fetch( &cltr->nprocessors, 1u, __ATOMIC_SEQ_CST );
+	lock( this.cltr->idles );
+		int target = this.cltr->idles.total += 1u;
+	unlock( this.cltr->idles );
 
 	id = doregister((__processor_id_t*)&this);
@@ -493,6 +501,7 @@
 // Not a ctor, it just preps the destruction but should not destroy members
 static void deinit(processor & this) {
-
-	int target = __atomic_sub_fetch( &this.cltr->nprocessors, 1u, __ATOMIC_SEQ_CST );
+	lock( this.cltr->idles );
+		int target = this.cltr->idles.total -= 1u;
+	unlock( this.cltr->idles );
 
 	// Lock the RWlock so no-one pushes/pops while we are changing the queue
@@ -501,7 +510,4 @@
 		// Adjust the ready queue size
 		ready_queue_shrink( this.cltr, target );
-
-		// Make sure we aren't on the idle queue
-		unsafe_remove( this.cltr->idles, &this );
 
 	// Unlock the RWlock
@@ -516,5 +522,8 @@
 	( this.terminated ){ 0 };
 	( this.runner ){};
-	init( this, name, _cltr );
+
+	disable_interrupts();
+		init( this, name, _cltr );
+	enable_interrupts( __cfaabi_dbg_ctx );
 
 	__cfadbg_print_safe(runtime_core, "Kernel : Starting core %p\n", &this);
@@ -540,13 +549,21 @@
 	free( this.stack );
 
-	deinit( this );
+	disable_interrupts();
+		deinit( this );
+	enable_interrupts( __cfaabi_dbg_ctx );
 }
 
 //-----------------------------------------------------------------------------
 // Cluster
+static void ?{}(__cluster_idles & this) {
+	this.lock  = 0;
+	this.idle  = 0;
+	this.total = 0;
+	(this.list){};
+}
+
 void ?{}(cluster & this, const char name[], Duration preemption_rate, unsigned num_io, const io_context_params & io_params) with( this ) {
 	this.name = name;
 	this.preemption_rate = preemption_rate;
-	this.nprocessors = 0;
 	ready_queue{};
 
@@ -666,2 +683,23 @@
 	return stack;
 }
+
+#if defined(__CFA_WITH_VERIFY__)
+static bool verify_fwd_bck_rng(void) {
+	kernelTLS.ready_rng.fwd_seed = 25214903917_l64u * (rdtscl() ^ (uintptr_t)&verify_fwd_bck_rng);
+
+	unsigned values[10];
+	for(i; 10) {
+		values[i] = __tls_rand_fwd();
+	}
+
+	__tls_rand_advance_bck();
+
+	for ( i; 9 -~= 0 ) {
+		if(values[i] != __tls_rand_bck()) {
+			return false;
+		}
+	}
+
+	return true;
+}
+#endif
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -121,4 +121,22 @@
 void     unregister( struct __processor_id_t * proc );
 
+//-----------------------------------------------------------------------
+// Cluster idle lock/unlock
+static inline void lock(__cluster_idles & this) {
+	for() {
+		uint64_t l = this.lock;
+		if(
+			(0 == (l % 2))
+			&& __atomic_compare_exchange_n(&this.lock, &l, l + 1, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+		) return;
+		Pause();
+	}
+}
+
+static inline void unlock(__cluster_idles & this) {
+	/* paranoid */ verify( 1 == (this.lock % 2) );
+	__atomic_fetch_add( &this.lock, 1, __ATOMIC_SEQ_CST );
+}
+
 //=======================================================================
 // Reader-writer lock implementation
@@ -248,5 +266,12 @@
 // pop thread from the ready queue of a cluster
 // returns 0p if empty
+// May return 0p spuriously
 __attribute__((hot)) struct $thread * pop(struct cluster * cltr);
+
+//-----------------------------------------------------------------------
+// pop thread from the ready queue of a cluster
+// returns 0p if empty
+// guaranteed to find any threads added before this call
+__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr);
 
 //-----------------------------------------------------------------------
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -17,4 +17,6 @@
 // #define __CFA_DEBUG_PRINT_READY_QUEUE__
 
+// #define USE_SNZI
+
 #include "bits/defs.hfa"
 #include "kernel_private.hfa"
@@ -148,4 +150,6 @@
 //  queues or removing them.
 uint_fast32_t ready_mutate_lock( void ) with(*__scheduler_lock) {
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+
 	// Step 1 : lock global lock
 	// It is needed to avoid processors that register mid Critical-Section
@@ -162,8 +166,11 @@
 	}
 
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
 	return s;
 }
 
 void ready_mutate_unlock( uint_fast32_t last_s ) with(*__scheduler_lock) {
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+
 	// Step 1 : release local locks
 	// This must be done while the global lock is held to avoid
@@ -180,4 +187,6 @@
 	/*paranoid*/ assert(true == lock);
 	__atomic_store_n(&lock, (bool)false, __ATOMIC_RELEASE);
+
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
 }
 
@@ -192,5 +201,7 @@
 void ^?{}(__ready_queue_t & this) with (this) {
 	verify( 1 == lanes.count );
-	verify( !query( snzi ) );
+	#ifdef USE_SNZI
+		verify( !query( snzi ) );
+	#endif
 	free(lanes.data);
 }
@@ -198,5 +209,8 @@
 //-----------------------------------------------------------------------
 __attribute__((hot)) bool query(struct cluster * cltr) {
-	return query(cltr->ready_queue.snzi);
+	#ifdef USE_SNZI
+		return query(cltr->ready_queue.snzi);
+	#endif
+	return true;
 }
 
@@ -262,12 +276,14 @@
 	bool lane_first = push(lanes.data[i], thrd);
 
-	// If this lane used to be empty we need to do more
-	if(lane_first) {
-		// Check if the entire queue used to be empty
-		first = !query(snzi);
-
-		// Update the snzi
-		arrive( snzi, i );
-	}
+	#ifdef USE_SNZI
+		// If this lane used to be empty we need to do more
+		if(lane_first) {
+			// Check if the entire queue used to be empty
+			first = !query(snzi);
+
+			// Update the snzi
+			arrive( snzi, i );
+		}
+	#endif
 
 	// Unlock and return
@@ -294,4 +310,5 @@
 __attribute__((hot)) $thread * pop(struct cluster * cltr) with (cltr->ready_queue) {
 	/* paranoid */ verify( lanes.count > 0 );
+	unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
 	#if defined(BIAS)
 		// Don't bother trying locally too much
@@ -300,5 +317,9 @@
 
 	// As long as the list is not empty, try finding a lane that isn't empty and pop from it
-	while( query(snzi) ) {
+	#ifdef USE_SNZI
+		while( query(snzi) ) {
+	#else
+		for(25) {
+	#endif
 		// Pick two lists at random
 		unsigned i,j;
@@ -336,6 +357,6 @@
 		#endif
 
-		i %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
-		j %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+		i %= count;
+		j %= count;
 
 		// try popping from the 2 picked lists
@@ -353,4 +374,21 @@
 }
 
+__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
+	/* paranoid */ verify( lanes.count > 0 );
+	unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+	unsigned offset = __tls_rand();
+	for(i; count) {
+		unsigned idx = (offset + i) % count;
+		struct $thread * thrd = try_pop(cltr, idx);
+		if(thrd) {
+			return thrd;
+		}
+	}
+
+	// All lanes were empty, return 0p
+	return 0p;
+}
+
+
 //-----------------------------------------------------------------------
 // Given 2 indexes, pick the list with the oldest push an try to pop from it
@@ -388,14 +426,15 @@
 	// Actually pop the list
 	struct $thread * thrd;
-	bool emptied;
-	[thrd, emptied] = pop(lane);
+	thrd = pop(lane);
 
 	/* paranoid */ verify(thrd);
 	/* paranoid */ verify(lane.lock);
 
-	// If this was the last element in the lane
-	if(emptied) {
-		depart( snzi, w );
-	}
+	#ifdef USE_SNZI
+		// If this was the last element in the lane
+		if(emptied) {
+			depart( snzi, w );
+		}
+	#endif
 
 	// Unlock and return
@@ -424,13 +463,14 @@
 			if(head(lane)->link.next == thrd) {
 				$thread * pthrd;
-				bool emptied;
-				[pthrd, emptied] = pop(lane);
+				pthrd = pop(lane);
 
 				/* paranoid */ verify( pthrd == thrd );
 
 				removed = true;
-				if(emptied) {
-					depart( snzi, i );
-				}
+				#ifdef USE_SNZI
+					if(emptied) {
+						depart( snzi, i );
+					}
+				#endif
 			}
 		__atomic_unlock(&lane.lock);
@@ -494,5 +534,7 @@
 	// grow the ready queue
 	with( cltr->ready_queue ) {
-		^(snzi){};
+		#ifdef USE_SNZI
+			^(snzi){};
+		#endif
 
 		// Find new count
@@ -516,11 +558,13 @@
 		lanes.count = ncount;
 
-		// Re-create the snzi
-		snzi{ log2( lanes.count / 8 ) };
-		for( idx; (size_t)lanes.count ) {
-			if( !is_empty(lanes.data[idx]) ) {
-				arrive(snzi, idx);
-			}
-		}
+		#ifdef USE_SNZI
+			// Re-create the snzi
+			snzi{ log2( lanes.count / 8 ) };
+			for( idx; (size_t)lanes.count ) {
+				if( !is_empty(lanes.data[idx]) ) {
+					arrive(snzi, idx);
+				}
+			}
+		#endif
 	}
 
@@ -542,5 +586,7 @@
 
 	with( cltr->ready_queue ) {
-		^(snzi){};
+		#ifdef USE_SNZI
+			^(snzi){};
+		#endif
 
 		// Remember old count
@@ -567,6 +613,5 @@
 			while(!is_empty(lanes.data[idx])) {
 				struct $thread * thrd;
-				__attribute__((unused)) bool _;
-				[thrd, _] = pop(lanes.data[idx]);
+				thrd = pop(lanes.data[idx]);
 
 				push(cltr, thrd);
@@ -596,11 +641,13 @@
 		}
 
-		// Re-create the snzi
-		snzi{ log2( lanes.count / 8 ) };
-		for( idx; (size_t)lanes.count ) {
-			if( !is_empty(lanes.data[idx]) ) {
-				arrive(snzi, idx);
-			}
-		}
+		#ifdef USE_SNZI
+			// Re-create the snzi
+			snzi{ log2( lanes.count / 8 ) };
+			for( idx; (size_t)lanes.count ) {
+				if( !is_empty(lanes.data[idx]) ) {
+					arrive(snzi, idx);
+				}
+			}
+		#endif
 	}
 
Index: libcfa/src/concurrency/ready_subqueue.hfa
===================================================================
--- libcfa/src/concurrency/ready_subqueue.hfa	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/concurrency/ready_subqueue.hfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -144,5 +144,5 @@
 // returns popped
 // returns true of lane was empty before push, false otherwise
-[$thread *, bool] pop(__intrusive_lane_t & this) {
+$thread * pop(__intrusive_lane_t & this) {
 	/* paranoid */ verify(this.lock);
 	/* paranoid */ verify(this.before.link.ts != 0ul);
@@ -162,5 +162,6 @@
 	head->link.next = next;
 	next->link.prev = head;
-	node->link.[next, prev] = 0p;
+	node->link.next = 0p;
+	node->link.prev = 0p;
 
 	// Update head time stamp
@@ -180,5 +181,5 @@
 		/* paranoid */ verify(tail(this)->link.prev == head(this));
 		/* paranoid */ verify(head(this)->link.next == tail(this));
-		return [node, true];
+		return node;
 	}
 	else {
@@ -187,5 +188,5 @@
 		/* paranoid */ verify(head(this)->link.next != tail(this));
 		/* paranoid */ verify(this.before.link.ts != 0);
-		return [node, false];
+		return node;
 	}
 }
Index: libcfa/src/concurrency/stats.cfa
===================================================================
--- libcfa/src/concurrency/stats.cfa	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/concurrency/stats.cfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -38,6 +38,6 @@
 			stats->io.submit_q.busy   = 0;
 			stats->io.complete_q.completed_avg.val = 0;
-			stats->io.complete_q.completed_avg.slow_cnt = 0;
-			stats->io.complete_q.completed_avg.fast_cnt = 0;
+			stats->io.complete_q.completed_avg.cnt = 0;
+			stats->io.complete_q.blocks = 0;
 		#endif
 	}
@@ -60,20 +60,20 @@
 
 		#if defined(CFA_HAVE_LINUX_IO_URING_H)
-			__atomic_fetch_add( &cltr->io.submit_q.submit_avg.rdy          , proc->io.submit_q.submit_avg.rdy          , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.submit_q.submit_avg.csm          , proc->io.submit_q.submit_avg.csm          , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.submit_q.submit_avg.avl          , proc->io.submit_q.submit_avg.avl          , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.submit_q.submit_avg.cnt          , proc->io.submit_q.submit_avg.cnt          , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.submit_q.look_avg.val            , proc->io.submit_q.look_avg.val            , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.submit_q.look_avg.cnt            , proc->io.submit_q.look_avg.cnt            , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.submit_q.look_avg.block          , proc->io.submit_q.look_avg.block          , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.submit_q.alloc_avg.val           , proc->io.submit_q.alloc_avg.val           , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.submit_q.alloc_avg.cnt           , proc->io.submit_q.alloc_avg.cnt           , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.submit_q.alloc_avg.block         , proc->io.submit_q.alloc_avg.block         , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.submit_q.helped                  , proc->io.submit_q.helped                  , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.submit_q.leader                  , proc->io.submit_q.leader                  , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.submit_q.busy                    , proc->io.submit_q.busy                    , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.complete_q.completed_avg.val     , proc->io.complete_q.completed_avg.val     , __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.complete_q.completed_avg.slow_cnt, proc->io.complete_q.completed_avg.slow_cnt, __ATOMIC_SEQ_CST );
-			__atomic_fetch_add( &cltr->io.complete_q.completed_avg.fast_cnt, proc->io.complete_q.completed_avg.fast_cnt, __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.submit_avg.rdy     , proc->io.submit_q.submit_avg.rdy     , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.submit_avg.csm     , proc->io.submit_q.submit_avg.csm     , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.submit_avg.avl     , proc->io.submit_q.submit_avg.avl     , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.submit_avg.cnt     , proc->io.submit_q.submit_avg.cnt     , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.look_avg.val       , proc->io.submit_q.look_avg.val       , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.look_avg.cnt       , proc->io.submit_q.look_avg.cnt       , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.look_avg.block     , proc->io.submit_q.look_avg.block     , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.alloc_avg.val      , proc->io.submit_q.alloc_avg.val      , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.alloc_avg.cnt      , proc->io.submit_q.alloc_avg.cnt      , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.alloc_avg.block    , proc->io.submit_q.alloc_avg.block    , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.helped             , proc->io.submit_q.helped             , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.leader             , proc->io.submit_q.leader             , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.busy               , proc->io.submit_q.busy               , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.complete_q.completed_avg.val, proc->io.complete_q.completed_avg.val, __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.complete_q.completed_avg.cnt, proc->io.complete_q.completed_avg.cnt, __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.complete_q.blocks           , proc->io.complete_q.blocks           , __ATOMIC_SEQ_CST );
 		#endif
 	}
@@ -154,6 +154,7 @@
 					"- avg alloc search len   : %'18.2lf\n"
 					"- avg alloc search block : %'18.2lf\n"
-					"- total wait calls       : %'15" PRIu64 "   (%'" PRIu64 " slow, %'" PRIu64 " fast)\n"
+					"- total wait calls       : %'15" PRIu64 "\n"
 					"- avg completion/wait    : %'18.2lf\n"
+					"- total completion blocks: %'15" PRIu64 "\n"
 					"\n"
 					, cluster ? "Cluster" : "Processor",  name, id
@@ -165,7 +166,7 @@
 					, io.submit_q.alloc_avg.cnt
 					, aavgv, aavgb
-					, io.complete_q.completed_avg.slow_cnt + io.complete_q.completed_avg.fast_cnt
-					, io.complete_q.completed_avg.slow_cnt,  io.complete_q.completed_avg.fast_cnt
-					, ((double)io.complete_q.completed_avg.val) / (io.complete_q.completed_avg.slow_cnt + io.complete_q.completed_avg.fast_cnt)
+					, io.complete_q.completed_avg.cnt
+					, ((double)io.complete_q.completed_avg.val) / io.complete_q.completed_avg.cnt
+					, io.complete_q.blocks
 				);
 			}
Index: libcfa/src/concurrency/stats.hfa
===================================================================
--- libcfa/src/concurrency/stats.hfa	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/concurrency/stats.hfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -90,7 +90,7 @@
 				struct {
 					volatile uint64_t val;
-					volatile uint64_t slow_cnt;
-					volatile uint64_t fast_cnt;
+					volatile uint64_t cnt;
 				} completed_avg;
+				volatile uint64_t blocks;
 			} complete_q;
 		};
Index: libcfa/src/containers/list.hfa
===================================================================
--- libcfa/src/containers/list.hfa	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/containers/list.hfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -13,4 +13,6 @@
 // Update Count     : 1
 //
+
+#pragma once
 
 #include <assert.h>
Index: libcfa/src/exception.c
===================================================================
--- libcfa/src/exception.c	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/exception.c	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -10,6 +10,6 @@
 // Created On       : Mon Jun 26 15:13:00 2017
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Sat Aug 15 07:17:19 2020
-// Update Count     : 26
+// Last Modified On : Thu Aug 20 23:45:45 2020
+// Update Count     : 27
 //
 
@@ -31,4 +31,5 @@
 #include <unwind.h>
 #include <bits/debug.hfa>
+#include "concurrency/invoke.h"
 #include "stdhdr/assert.h"
 
@@ -61,16 +62,9 @@
 
 
-// Temperary global exception context. Does not work with concurency.
-struct exception_context_t {
-	struct __cfaehm_try_resume_node * top_resume;
-
-	exception_t * current_exception;
-	int current_handler_index;
-} static shared_stack = {NULL, NULL, 0};
-
 // Get the current exception context.
 // There can be a single global until multithreading occurs, then each stack
-// needs its own. It will have to be updated to handle that.
-struct exception_context_t * this_exception_context() {
+// needs its own. We get this from libcfathreads (no weak attribute).
+__attribute__((weak)) struct exception_context_t * this_exception_context() {
+	static struct exception_context_t shared_stack = {NULL, NULL};
 	return &shared_stack;
 }
@@ -125,4 +119,15 @@
 
 // MEMORY MANAGEMENT =========================================================
+
+struct __cfaehm_node {
+	struct _Unwind_Exception unwind_exception;
+	struct __cfaehm_node * next;
+	int handler_index;
+};
+
+#define NODE_TO_EXCEPT(node) ((exception_t *)(1 + (node)))
+#define EXCEPT_TO_NODE(except) ((struct __cfaehm_node *)(except) - 1)
+#define UNWIND_TO_NODE(unwind) ((struct __cfaehm_node *)(unwind))
+#define NULL_MAP(map, ptr) ((ptr) ? (map(ptr)) : NULL)
 
 // How to clean up an exception in various situations.
@@ -140,15 +145,4 @@
 }
 
-// We need a piece of storage to raise the exception, for now its a single
-// piece.
-static struct _Unwind_Exception this_exception_storage;
-
-struct __cfaehm_node {
-	struct __cfaehm_node * next;
-};
-
-#define NODE_TO_EXCEPT(node) ((exception_t *)(1 + (node)))
-#define EXCEPT_TO_NODE(except) ((struct __cfaehm_node *)(except) - 1)
-
 // Creates a copy of the indicated exception and sets current_exception to it.
 static void __cfaehm_allocate_exception( exception_t * except ) {
@@ -164,14 +158,14 @@
 	}
 
+	// Initialize the node:
+	exception_t * except_store = NODE_TO_EXCEPT(store);
+	store->unwind_exception.exception_class = __cfaehm_exception_class;
+	store->unwind_exception.exception_cleanup = __cfaehm_exception_cleanup;
+	store->handler_index = 0;
+	except->virtual_table->copy( except_store, except );
+
 	// Add the node to the list:
-	store->next = EXCEPT_TO_NODE(context->current_exception);
-	context->current_exception = NODE_TO_EXCEPT(store);
-
-	// Copy the exception to storage.
-	except->virtual_table->copy( context->current_exception, except );
-
-	// Set up the exception storage.
-	this_exception_storage.exception_class = __cfaehm_exception_class;
-	this_exception_storage.exception_cleanup = __cfaehm_exception_cleanup;
+	store->next = NULL_MAP(EXCEPT_TO_NODE, context->current_exception);
+	context->current_exception = except_store;
 }
 
@@ -188,5 +182,5 @@
 	if ( context->current_exception == except ) {
 		node = to_free->next;
-		context->current_exception = (node) ? NODE_TO_EXCEPT(node) : 0;
+		context->current_exception = NULL_MAP(NODE_TO_EXCEPT, node);
 	} else {
 		node = EXCEPT_TO_NODE(context->current_exception);
@@ -216,5 +210,5 @@
 	// Verify actions follow the rules we expect.
 	verify((actions & _UA_CLEANUP_PHASE) && (actions & _UA_FORCE_UNWIND));
-	verify(!(actions & (_UA_SEARCH_PHASE | _UA_HANDER_FRAME)));
+	verify(!(actions & (_UA_SEARCH_PHASE | _UA_HANDLER_FRAME)));
 
 	if ( actions & _UA_END_OF_STACK ) {
@@ -225,9 +219,11 @@
 }
 
+static struct _Unwind_Exception cancel_exception_storage;
+
 // Cancel the current stack, prefroming approprate clean-up and messaging.
 void __cfaehm_cancel_stack( exception_t * exception ) {
 	// TODO: Detect current stack and pick a particular stop-function.
 	_Unwind_Reason_Code ret;
-	ret = _Unwind_ForcedUnwind( &this_exception_storage, _Stop_Fn, (void*)0x22 );
+	ret = _Unwind_ForcedUnwind( &cancel_exception_storage, _Stop_Fn, (void*)0x22 );
 	printf("UNWIND ERROR %d after force unwind\n", ret);
 	abort();
@@ -250,9 +246,10 @@
 static void __cfaehm_begin_unwind(void(*defaultHandler)(exception_t *)) {
 	struct exception_context_t * context = this_exception_context();
-	struct _Unwind_Exception * storage = &this_exception_storage;
 	if ( NULL == context->current_exception ) {
 		printf("UNWIND ERROR missing exception in begin unwind\n");
 		abort();
 	}
+	struct _Unwind_Exception * storage =
+		&EXCEPT_TO_NODE(context->current_exception)->unwind_exception;
 
 	// Call stdlibc to raise the exception
@@ -426,5 +423,5 @@
 				_Unwind_Reason_Code ret = (0 == index)
 					? _URC_CONTINUE_UNWIND : _URC_HANDLER_FOUND;
-				context->current_handler_index = index;
+				UNWIND_TO_NODE(unwind_exception)->handler_index = index;
 
 				// Based on the return value, check if we matched the exception
@@ -432,4 +429,5 @@
 					__cfadbg_print_safe(exception, " handler found\n");
 				} else {
+					// TODO: Continue the search if there is more in the table.
 					__cfadbg_print_safe(exception, " no handler\n");
 				}
@@ -523,5 +521,5 @@
 	// Exception handler
 	// Note: Saving the exception context on the stack breaks termination exceptions.
-	catch_block( this_exception_context()->current_handler_index,
+	catch_block( EXCEPT_TO_NODE( this_exception_context()->current_exception )->handler_index,
 	             this_exception_context()->current_exception );
 }
Index: libcfa/src/heap.cfa
===================================================================
--- libcfa/src/heap.cfa	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/heap.cfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -10,6 +10,6 @@
 // Created On       : Tue Dec 19 21:58:35 2017
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Sun Aug  9 12:23:20 2020
-// Update Count     : 894
+// Last Modified On : Wed Aug 12 16:43:38 2020
+// Update Count     : 902
 //
 
@@ -650,5 +650,8 @@
 		for ( HeapManager.Storage * p = freeLists[i].freeList; p != 0p; p = p->header.kind.real.next ) {
 		#else
-		for ( HeapManager.Storage * p = top( freeLists[i].freeList ); p != 0p; p = (p)`next->top ) {
+		// for ( HeapManager.Storage * p = top( freeLists[i].freeList ); p != 0p; p = (p)`next->top ) {
+		for ( HeapManager.Storage * p = top( freeLists[i].freeList ); p != 0p; /* p = getNext( p )->top */) {
+			typeof(p) temp = (( p )`next)->top;			// FIX ME: direct assignment fails, initialization works
+			p = temp;
 		#endif // BUCKETLOCK
 			total += size;
@@ -1162,5 +1165,5 @@
 		choose( option ) {
 		  case M_TOP_PAD:
-			heapExpand = ceiling( value, pageSize ); return 1;
+			heapExpand = ceiling2( value, pageSize ); return 1;
 		  case M_MMAP_THRESHOLD:
 			if ( setMmapStart( value ) ) return 1;
Index: libcfa/src/iostream.cfa
===================================================================
--- libcfa/src/iostream.cfa	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/iostream.cfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -10,6 +10,6 @@
 // Created On       : Wed May 27 17:56:53 2015
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Mon Aug 10 09:32:14 2020
-// Update Count     : 1126
+// Last Modified On : Tue Aug 11 22:16:33 2020
+// Update Count     : 1128
 //
 
@@ -37,22 +37,4 @@
 
 forall( dtype ostype | ostream( ostype ) ) {
-	ostype & ?|?( ostype & os, zero_t ) {
-		if ( $sepPrt( os ) ) fmt( os, "%s", $sepGetCur( os ) );
-		fmt( os, "%d", 0n );
-		return os;
-	} // ?|?
-	void ?|?( ostype & os, zero_t z ) {
-		(ostype &)(os | z); ends( os );
-	} // ?|?
-
-	ostype & ?|?( ostype & os, one_t ) {
-		if ( $sepPrt( os ) ) fmt( os, "%s", $sepGetCur( os ) );
-		fmt( os, "%d", 1n );
-		return os;
-	} // ?|?
-	void ?|?( ostype & os, one_t o ) {
-		(ostype &)(os | o); ends( os );
-	} // ?|?
-
 	ostype & ?|?( ostype & os, bool b ) {
 		if ( $sepPrt( os ) ) fmt( os, "%s", $sepGetCur( os ) );
Index: libcfa/src/iostream.hfa
===================================================================
--- libcfa/src/iostream.hfa	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/iostream.hfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -10,6 +10,6 @@
 // Created On       : Wed May 27 17:56:53 2015
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Thu Jul 16 07:43:32 2020
-// Update Count     : 348
+// Last Modified On : Tue Aug 11 22:16:14 2020
+// Update Count     : 350
 //
 
@@ -67,9 +67,4 @@
 
 forall( dtype ostype | ostream( ostype ) ) {
-	ostype & ?|?( ostype &, zero_t );
-	void ?|?( ostype &, zero_t );
-	ostype & ?|?( ostype &, one_t );
-	void ?|?( ostype &, one_t );
-
 	ostype & ?|?( ostype &, bool );
 	void ?|?( ostype &, bool );
Index: libcfa/src/parseargs.cfa
===================================================================
--- libcfa/src/parseargs.cfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
+++ libcfa/src/parseargs.cfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -0,0 +1,242 @@
+#include "parseargs.hfa"
+
+#include <stdint.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+extern "C" {
+	#include <getopt.h>
+	#include <sys/ioctl.h>
+
+	struct FILE;
+	extern FILE * stderr;
+	extern FILE * stdout;
+
+	extern int fileno(FILE *stream);
+
+	extern int fprintf ( FILE * stream, const char * format, ... );
+
+	extern          long long int strtoll (const char* str, char** endptr, int base);
+	extern unsigned long long int strtoull(const char* str, char** endptr, int base);
+	extern                 double strtod  (const char* str, char** endptr);
+}
+
+#include "common.hfa"
+#include "limits.hfa"
+
+extern int cfa_args_argc;
+extern char ** cfa_args_argv;
+extern char ** cfa_args_envp;
+
+static void usage(char * cmd, cfa_option options[], size_t opt_count, const char * usage, FILE * out)  __attribute__ ((noreturn));
+
+void parse_args( cfa_option options[], size_t opt_count, const char * usage, char ** & left ) {
+	parse_args(cfa_args_argc, cfa_args_argv, options, opt_count, usage, left );
+}
+
+//-----------------------------------------------------------------------------
+// getopt_long wrapping
+void parse_args(
+	int argc,
+	char * argv[],
+	cfa_option options[],
+	size_t opt_count,
+	const char * usage,
+	char ** & left
+) {
+	struct option optarr[opt_count + 2];
+	{
+		int idx = 0;
+		for(i; opt_count) {
+			if(options[i].long_name) {
+				optarr[idx].name = options[i].long_name;
+				optarr[idx].flag = 0p;
+				optarr[idx].val  = options[i].short_name;
+				if(    ((intptr_t)options[i].parse) == ((intptr_t)parse_settrue)
+				    || ((intptr_t)options[i].parse) == ((intptr_t)parse_setfalse) ) {
+					optarr[idx].has_arg = no_argument;
+				} else {
+					optarr[idx].has_arg = required_argument;
+				}
+				idx++;
+			}
+		}
+		optarr[idx+0].[name, has_arg, flag, val] = ["help", no_argument, 0, 'h'];
+		optarr[idx+1].[name, has_arg, flag, val] = [0, no_argument, 0, 0];
+	}
+
+	char optstring[opt_count * 3] = { '\0' };
+	{
+		int idx = 0;
+		for(i; opt_count) {
+			optstring[idx] = options[i].short_name;
+			idx++;
+			if(    ((intptr_t)options[i].parse) != ((intptr_t)parse_settrue)
+			    && ((intptr_t)options[i].parse) != ((intptr_t)parse_setfalse) ) {
+				optstring[idx] = ':';
+				idx++;
+			}
+		}
+		optstring[idx+0] = 'h';
+		optstring[idx+1] = '\0';
+	}
+
+	FILE * out = stderr;
+	NEXT_ARG:
+	for() {
+		int idx = 0;
+		int opt = getopt_long(argc, argv, optstring, optarr, &idx);
+		switch(opt) {
+			case -1:
+				if(&left != 0p) left = argv + optind;
+				return;
+			case 'h':
+				out = stdout;
+			case '?':
+				usage(argv[0], options, opt_count, usage, out);
+			default:
+				for(i; opt_count) {
+					if(opt == options[i].short_name) {
+						const char * arg = optarg ? optarg : "";
+						bool success = options[i].parse( arg, options[i].variable );
+						if(success) continue NEXT_ARG;
+
+						fprintf(out, "Argument '%s' for option %c could not be parsed\n\n", arg, (char)opt);
+						usage(argv[0], options, opt_count, usage, out);
+					}
+				}
+				abort("Internal parse arg error\n");
+		}
+
+	}
+}
+
+//-----------------------------------------------------------------------------
+// Print usage
+static void printopt(FILE * out, int width, int max, char sn, const char * ln, const char * help) {
+	int hwidth = max - (11 + width);
+	if(hwidth <= 0) hwidth = max;
+
+	fprintf(out, "  -%c, --%-*s   %.*s\n", sn, width, ln, hwidth, help);
+	for() {
+		help += min(strlen(help), hwidth);
+		if('\0' == *help) break;
+		fprintf(out, "%*s%.*s\n", width + 11, "", hwidth, help);
+	}
+}
+
+void print_args_usage(cfa_option options[], size_t opt_count, const char * usage, bool error)  __attribute__ ((noreturn)) {
+	usage(cfa_args_argv[0], options, opt_count, usage, error ? stderr : stdout);
+}
+
+void print_args_usage(int , char * argv[], cfa_option options[], size_t opt_count, const char * usage, bool error)  __attribute__ ((noreturn)) {
+	usage(argv[0], options, opt_count, usage, error ? stderr : stdout);
+}
+
+static void usage(char * cmd, cfa_option options[], size_t opt_count, const char * help, FILE * out) __attribute__((noreturn)) {
+	int width = 0;
+	{
+		for(i; opt_count) {
+			if(options[i].long_name) {
+				int w = strlen(options[i].long_name);
+				if(w > width) width = w;
+			}
+		}
+	}
+
+	int max_width = 1_000_000;
+	int outfd = fileno(out);
+	if(isatty(outfd)) {
+		struct winsize size;
+		int ret = ioctl(outfd, TIOCGWINSZ, &size);
+		if(ret < 0) abort( "ioctl error: (%d) %s\n", (int)errno, strerror(errno) );
+		max_width = size.ws_col;
+	}
+
+	fprintf(out, "Usage:\n  %s %s\n", cmd, help);
+
+	for(i; opt_count) {
+		printopt(out, width, max_width, options[i].short_name, options[i].long_name, options[i].help);
+	}
+	fprintf(out, "  -%c, --%-*s   %s\n", 'h', width, "help", "print this help message");
+	exit(out == stdout ? 0 : 1);
+}
+
+//-----------------------------------------------------------------------------
+// Typed argument parsing
+bool parse_yesno(const char * arg, bool & value ) {
+	if(strcmp(arg, "yes") == 0) {
+		value = true;
+		return true;
+	}
+
+	if(strcmp(arg, "no") == 0) {
+		value = false;
+		return true;
+	}
+
+	return false;
+}
+
+bool parse_settrue (const char *, bool & value ) {
+	value = true;
+	return true;
+}
+
+bool parse_setfalse(const char *, bool & value )  {
+	value = false;
+	return true;
+}
+
+bool parse(const char * arg, const char * & value ) {
+	value = arg;
+	return true;
+}
+
+bool parse(const char * arg, int & value) {
+	char * end;
+	int r = strtoll(arg, &end, 10);
+	if(*end != '\0') return false;
+
+	value = r;
+	return true;
+}
+
+bool parse(const char * arg, unsigned & value) {
+	char * end;
+	unsigned long long int r = strtoull(arg, &end, 10);
+	if(*end != '\0') return false;
+	if(r > (unsigned)MAX) return false;
+
+	value = r;
+	return true;
+}
+
+bool parse(const char * arg, unsigned long & value) {
+	char * end;
+	unsigned long long int r = strtoull(arg, &end, 10);
+	if(*end != '\0') return false;
+	if(r > (unsigned long)MAX) return false;
+
+	value = r;
+	return true;
+}
+
+bool parse(const char * arg, unsigned long long & value) {
+        char * end;
+        unsigned long long int r = strtoull(arg, &end, 10);
+        if(*end != '\0') return false;
+        if(r > (unsigned long long)MAX) return false;
+
+        value = r;
+        return true;
+}
+
+bool parse(const char * arg, double & value) {
+	char * end;
+	double r = strtod(arg, &end);
+	if(*end != '\0') return false;
+
+	value = r;
+	return true;
+}
Index: libcfa/src/parseargs.hfa
===================================================================
--- libcfa/src/parseargs.hfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
+++ libcfa/src/parseargs.hfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -0,0 +1,48 @@
+#pragma once
+
+struct cfa_option {
+      char short_name;
+      const char * long_name;
+      const char * help;
+      void * variable;
+      bool (*parse)(const char *, void * );
+};
+
+extern cfa_option last_option;
+
+static inline void ?{}( cfa_option & this ) {}
+
+forall(dtype T | { bool parse(const char *, T & ); })
+static inline void ?{}( cfa_option & this, char short_name, const char * long_name, const char * help, T & variable ) {
+      this.short_name = short_name;
+      this.long_name  = long_name;
+      this.help       = help;
+      this.variable   = (void*)&variable;
+      this.parse      = (bool (*)(const char *, void * ))parse;
+}
+
+forall(dtype T)
+static inline void ?{}( cfa_option & this, char short_name, const char * long_name, const char * help, T & variable, bool (*parse)(const char *, T & )) {
+      this.short_name = short_name;
+      this.long_name  = long_name;
+      this.help       = help;
+      this.variable   = (void*)&variable;
+      this.parse      = (bool (*)(const char *, void * ))parse;
+}
+
+void parse_args( cfa_option options[], size_t opt_count, const char * usage, char ** & left );
+void parse_args( int argc, char * argv[], cfa_option options[], size_t opt_count, const char * usage, char ** & left );
+
+void print_args_usage(cfa_option options[], size_t opt_count, const char * usage, bool error)  __attribute__ ((noreturn));
+void print_args_usage(int argc, char * argv[], cfa_option options[], size_t opt_count, const char * usage, bool error)  __attribute__ ((noreturn));
+
+bool parse_yesno   (const char *, bool & );
+bool parse_settrue (const char *, bool & );
+bool parse_setfalse(const char *, bool & );
+
+bool parse(const char *, const char * & );
+bool parse(const char *, int & );
+bool parse(const char *, unsigned & );
+bool parse(const char *, unsigned long & );
+bool parse(const char *, unsigned long long & );
+bool parse(const char *, double & );
Index: libcfa/src/stdlib.hfa
===================================================================
--- libcfa/src/stdlib.hfa	(revision 67ca73e71dbea21372d807389df49e401624ec3b)
+++ libcfa/src/stdlib.hfa	(revision e67a82d79e32a55496140aa883716d9123ccc7f7)
@@ -10,6 +10,6 @@
 // Created On       : Thu Jan 28 17:12:35 2016
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Thu Jul 30 16:14:58 2020
-// Update Count     : 490
+// Last Modified On : Fri Aug 14 23:38:50 2020
+// Update Count     : 504
 //
 
@@ -39,4 +39,8 @@
 //---------------------------------------
 
+#include "common.hfa"
+
+//---------------------------------------
+
 // Macro because of returns
 #define $VAR_ALLOC( allocation, alignment ) \
@@ -136,23 +140,23 @@
 	T * alloc_set( char fill ) {
 		return (T *)memset( (T *)alloc(), (int)fill, sizeof(T) ); // initialize with fill value
-	} // alloc
-
-	T * alloc_set( T fill ) {
+	} // alloc_set
+
+	T * alloc_set( const T & fill ) {
 		return (T *)memcpy( (T *)alloc(), &fill, sizeof(T) ); // initialize with fill value
-	} // alloc
+	} // alloc_set
 
 	T * alloc_set( size_t dim, char fill ) {
 		return (T *)memset( (T *)alloc( dim ), (int)fill, dim * sizeof(T) ); // initialize with fill value
-	} // alloc
-
-	T * alloc_set( size_t dim, T fill ) {
+	} // alloc_set
+
+	T * alloc_set( size_t dim, const T & fill ) {
 		T * r = (T *)alloc( dim );
 		for ( i; dim ) { memcpy( &r[i], &fill, sizeof(T) ); } // initialize with fill value
 		return r;
-	} // alloc
-
-	T * alloc_set( size_t dim, const T fill[] ) {
-		return (T *)memcpy( (T *)alloc( dim ), fill, dim * sizeof(T) ); // initialize with fill value
-	} // alloc
+	} // alloc_set
+
+	T * alloc_set( size_t dimNew, const T fill[], size_t dimOld ) {
+		return (T *)memcpy( (T *)alloc( dimNew ), fill, min( dimNew, dimOld ) * sizeof(T) ); // initialize with fill value
+	} // alloc_set
 
 	T * alloc_set( T ptr[], size_t dim, char fill ) {	// realloc array with fill
@@ -166,5 +170,5 @@
 	} // alloc_set
 
-	T * alloc_set( T ptr[], size_t dim, T & fill ) {	// realloc array with fill
+	T * alloc_set( T ptr[], size_t dim, const T & fill ) {	// realloc array with fill
 		size_t odim = malloc_size( ptr ) / sizeof(T);	// current dimension
 		size_t nsize = dim * sizeof(T);					// new allocation
@@ -177,5 +181,5 @@
 		} // if
 		return nptr;
-	} // alloc_align_set
+	} // alloc_set
 } // distribution
 
@@ -204,23 +208,23 @@
 	T * alloc_align_set( size_t align, char fill ) {
 		return (T *)memset( (T *)alloc_align( align ), (int)fill, sizeof(T) ); // initialize with fill value
-	} // alloc_align
-
-	T * alloc_align_set( size_t align, T fill ) {
+	} // alloc_align_set
+
+	T * alloc_align_set( size_t align, const T & fill ) {
 		return (T *)memcpy( (T *)alloc_align( align ), &fill, sizeof(T) ); // initialize with fill value
-	} // alloc_align
+	} // alloc_align_set
 
 	T * alloc_align_set( size_t align, size_t dim, char fill ) {
 		return (T *)memset( (T *)alloc_align( align, dim ), (int)fill, dim * sizeof(T) ); // initialize with fill value
-	} // alloc_align
-
-	T * alloc_align_set( size_t align, size_t dim, T fill ) {
+	} // alloc_align_set
+
+	T * alloc_align_set( size_t align, size_t dim, const T & fill ) {
 		T * r = (T *)alloc_align( align, dim );
 		for ( i; dim ) { memcpy( &r[i], &fill, sizeof(T) ); } // initialize with fill value
 		return r;
-	} // alloc_align
-
-	T * alloc_align_set( size_t align, size_t dim, const T fill[] ) {
-		return (T *)memcpy( (T *)alloc_align( align, dim ), fill, dim * sizeof(T) );
-	} // alloc_align
+	} // alloc_align_set
+
+	T * alloc_align_set( size_t align, size_t dimNew, const T fill[], size_t dimOld ) {
+		return (T *)memcpy( (T *)alloc_align( align, dimNew ), fill, min( dimNew, dimOld ) * sizeof(T) );
+	} // alloc_align_set
 
 	T * alloc_align_set( T ptr[], size_t align, size_t dim, char fill ) {
@@ -234,5 +238,5 @@
 	} // alloc_align_set
 
-	T * alloc_align_set( T ptr[], size_t align, size_t dim, T & fill ) {
+	T * alloc_align_set( T ptr[], size_t align, size_t dim, const T & fill ) {
 		size_t odim = malloc_size( ptr ) / sizeof(T);	// current dimension
 		size_t nsize = dim * sizeof(T);					// new allocation
@@ -374,8 +378,4 @@
 //---------------------------------------
 
-#include "common.hfa"
-
-//---------------------------------------
-
 extern bool threading_enabled(void) OPTIONAL_THREAD;
 
