Index: libcfa/src/Makefile.am
===================================================================
--- libcfa/src/Makefile.am	(revision e2853eb86b033c1a5f683bbe898f3733235771dc)
+++ libcfa/src/Makefile.am	(revision 6c53a93454697f8b549b141d01803272ff074931)
@@ -84,4 +84,5 @@
 	time.hfa \
 	bits/weakso_locks.hfa \
+	algorithms/range_iterator.hfa \
 	containers/maybe.hfa \
 	containers/pair.hfa \
Index: libcfa/src/algorithms/range_iterator.cfa
===================================================================
--- libcfa/src/algorithms/range_iterator.cfa	(revision 6c53a93454697f8b549b141d01803272ff074931)
+++ libcfa/src/algorithms/range_iterator.cfa	(revision 6c53a93454697f8b549b141d01803272ff074931)
@@ -0,0 +1,62 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// range_iterator.cfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Tue Nov 30 13:06:22 2021
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#include "range_iterator.hfa"
+
+#include <stdio.h>
+
+#include <fstream.hfa>
+
+void main(RangeIter & this) {
+	for() {
+		this._start = -1;
+		this._stop = -1;
+		int start_len = -1, stop_len = -1;
+		int ret = sscanf(this.text, "%u%n-%u%n", &this._start, &start_len, &this._stop, &stop_len);
+		switch(ret) {
+		case 0:
+			// Not a range, maybe a comma?
+			if(this.text[0] == ',') {
+				this.text ++;
+				continue;
+			}
+
+			serr | "Error: unexpected character in next range: '" | this.text |"'";
+			exit(2);
+		case 1:
+			this.text += start_len;
+			// Only one value, push it!
+			this.com = this._start;
+			suspend;
+			break;
+		case 2:
+			if(this._start > this._stop) {
+				serr | "Error: next range out of order '" | this.text |"'";
+				exit(2);
+			}
+			this.text += stop_len;
+			for(this.com = this._start; this.com <= this._stop; this.com++) {
+				suspend;
+			}
+			break;
+		default:
+			serr | "Error reading next block: '" | this.text |"', returned" | ret;
+			exit(2);
+		}
+
+		if(this.text[0] == '\0') break;
+	}
+	this.com = -1;
+}
Index: libcfa/src/algorithms/range_iterator.hfa
===================================================================
--- libcfa/src/algorithms/range_iterator.hfa	(revision 6c53a93454697f8b549b141d01803272ff074931)
+++ libcfa/src/algorithms/range_iterator.hfa	(revision 6c53a93454697f8b549b141d01803272ff074931)
@@ -0,0 +1,27 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// range_iterator.hfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Tue Nov 30 13:06:22 2021
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+generator RangeIter {
+	const char * text;
+	int com;
+	int _start;
+	int _stop;
+};
+
+static inline void ?{}(RangeIter & this, const char * text) {
+	this.text = text;
+}
+
+static inline bool moveNext(RangeIter & this) { resume(this); return this.com >= 0; }
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision e2853eb86b033c1a5f683bbe898f3733235771dc)
+++ libcfa/src/concurrency/io.cfa	(revision 6c53a93454697f8b549b141d01803272ff074931)
@@ -33,4 +33,5 @@
 		#include <sys/syscall.h>
 		#include <sys/eventfd.h>
+		#include <sys/uio.h>
 
 		#include <linux/io_uring.h>
@@ -133,5 +134,5 @@
 	}
 
-	void __cfa_io_flush( processor * proc ) {
+	bool __cfa_io_flush( processor * proc, int min_comp ) {
 		/* paranoid */ verify( ! __preemption_enabled() );
 		/* paranoid */ verify( proc );
@@ -141,13 +142,8 @@
 		$io_context & ctx = *proc->io.ctx;
 
-		// for(i; 2) {
-		// 	unsigned idx = proc->rdq.id + i;
-		// 	cltr->ready_queue.lanes.tscs[idx].tv = -1ull;
-		// }
-
 		__ioarbiter_flush( ctx );
 
 		__STATS__( true, io.calls.flush++; )
-		int ret = syscall( __NR_io_uring_enter, ctx.fd, ctx.sq.to_submit, 0, 0, (sigset_t *)0p, _NSIG / 8);
+		int ret = syscall( __NR_io_uring_enter, ctx.fd, ctx.sq.to_submit, min_comp, min_comp > 0 ? IORING_ENTER_GETEVENTS : 0, (sigset_t *)0p, _NSIG / 8);
 		if( ret < 0 ) {
 			switch((int)errno) {
@@ -157,9 +153,5 @@
 				// Update statistics
 				__STATS__( false, io.calls.errors.busy ++; )
-				// for(i; 2) {
-				// 	unsigned idx = proc->rdq.id + i;
-				// 	cltr->ready_queue.lanes.tscs[idx].tv = rdtscl();
-				// }
-				return;
+				return false;
 			default:
 				abort( "KERNEL ERROR: IO_URING SYSCALL - (%d) %s\n", (int)errno, strerror(errno) );
@@ -182,10 +174,8 @@
 
 		ctx.proc->io.pending = false;
-
-		__cfa_io_drain( proc );
-		// for(i; 2) {
-		// 	unsigned idx = proc->rdq.id + i;
-		// 	cltr->ready_queue.lanes.tscs[idx].tv = rdtscl();
-		// }
+		ready_schedule_lock();
+		bool ret = __cfa_io_drain( proc );
+		ready_schedule_unlock();
+		return ret;
 	}
 
@@ -291,5 +281,4 @@
 	}
 
-
 	//=============================================================================================
 	// submission
@@ -314,7 +303,5 @@
 		ctx->proc->io.dirty   = true;
 		if(sq.to_submit > 30 || !lazy) {
-			ready_schedule_lock();
-			__cfa_io_flush( ctx->proc );
-			ready_schedule_unlock();
+			__cfa_io_flush( ctx->proc, 0 );
 		}
 	}
@@ -515,3 +502,52 @@
 		}
 	}
+
+	#if defined(CFA_WITH_IO_URING_IDLE)
+		bool __kernel_read(processor * proc, io_future_t & future, iovec & iov, int fd) {
+			$io_context * ctx = proc->io.ctx;
+			/* paranoid */ verify( ! __preemption_enabled() );
+			/* paranoid */ verify( proc == __cfaabi_tls.this_processor );
+			/* paranoid */ verify( ctx );
+
+			__u32 idx;
+			struct io_uring_sqe * sqe;
+
+			// We can proceed to the fast path
+			if( !__alloc(ctx, &idx, 1) ) return false;
+
+			// Allocation was successful
+			__fill( &sqe, 1, &idx, ctx );
+
+			sqe->user_data = (uintptr_t)&future;
+			sqe->flags = 0;
+			sqe->fd = fd;
+			sqe->off = 0;
+			sqe->ioprio = 0;
+			sqe->fsync_flags = 0;
+			sqe->__pad2[0] = 0;
+			sqe->__pad2[1] = 0;
+			sqe->__pad2[2] = 0;
+
+			#if defined(CFA_HAVE_IORING_OP_READ)
+				sqe->opcode = IORING_OP_READ;
+				sqe->addr = (uint64_t)iov.iov_base;
+				sqe->len = iov.iov_len;
+			#elif defined(CFA_HAVE_READV) && defined(CFA_HAVE_IORING_OP_READV)
+				sqe->opcode = IORING_OP_READV;
+				sqe->addr = (uintptr_t)&iov;
+				sqe->len = 1;
+			#else
+				#error CFA_WITH_IO_URING_IDLE but none of CFA_HAVE_READV, CFA_HAVE_IORING_OP_READV or CFA_HAVE_IORING_OP_READ defined
+			#endif
+
+			asm volatile("": : :"memory");
+
+			/* paranoid */ verify( sqe->user_data == (uintptr_t)&future );
+			__submit( ctx, &idx, 1, true );
+
+			/* paranoid */ verify( proc == __cfaabi_tls.this_processor );
+			/* paranoid */ verify( ! __preemption_enabled() );
+			return true;
+		}
+	#endif
 #endif
Index: libcfa/src/concurrency/io/setup.cfa
===================================================================
--- libcfa/src/concurrency/io/setup.cfa	(revision e2853eb86b033c1a5f683bbe898f3733235771dc)
+++ libcfa/src/concurrency/io/setup.cfa	(revision 6c53a93454697f8b549b141d01803272ff074931)
@@ -32,5 +32,5 @@
 
 	void __cfa_io_start( processor * proc ) {}
-	void __cfa_io_flush( processor * proc ) {}
+	bool __cfa_io_flush( processor * proc, int ) { return false; }
 	void __cfa_io_stop ( processor * proc ) {}
 
@@ -111,5 +111,5 @@
 		this.ext_sq.empty = true;
 		(this.ext_sq.queue){};
-		__io_uring_setup( this, cl.io.params, proc->idle );
+		__io_uring_setup( this, cl.io.params, proc->idle_fd );
 		__cfadbg_print_safe(io_core, "Kernel I/O : Created ring for io_context %u (%p)\n", this.fd, &this);
 	}
@@ -220,19 +220,21 @@
 		cq.cqes = (struct io_uring_cqe *)(((intptr_t)cq.ring_ptr) + params.cq_off.cqes);
 
-		// Step 4 : eventfd
-		// io_uring_register is so f*cking slow on some machine that it
-		// will never succeed if preemption isn't hard blocked
-		__cfadbg_print_safe(io_core, "Kernel I/O : registering %d for completion with ring %d\n", procfd, fd);
-
-		__disable_interrupts_hard();
-
-		int ret = syscall( __NR_io_uring_register, fd, IORING_REGISTER_EVENTFD, &procfd, 1);
-		if (ret < 0) {
-			abort("KERNEL ERROR: IO_URING EVENTFD REGISTER - %s\n", strerror(errno));
-		}
-
-		__enable_interrupts_hard();
-
-		__cfadbg_print_safe(io_core, "Kernel I/O : registered %d for completion with ring %d\n", procfd, fd);
+		#if !defined(CFA_WITH_IO_URING_IDLE)
+			// Step 4 : eventfd
+			// io_uring_register is so f*cking slow on some machine that it
+			// will never succeed if preemption isn't hard blocked
+			__cfadbg_print_safe(io_core, "Kernel I/O : registering %d for completion with ring %d\n", procfd, fd);
+
+			__disable_interrupts_hard();
+
+			int ret = syscall( __NR_io_uring_register, fd, IORING_REGISTER_EVENTFD, &procfd, 1);
+			if (ret < 0) {
+				abort("KERNEL ERROR: IO_URING EVENTFD REGISTER - %s\n", strerror(errno));
+			}
+
+			__enable_interrupts_hard();
+
+			__cfadbg_print_safe(io_core, "Kernel I/O : registered %d for completion with ring %d\n", procfd, fd);
+		#endif
 
 		// some paranoid checks
Index: libcfa/src/concurrency/io/types.hfa
===================================================================
--- libcfa/src/concurrency/io/types.hfa	(revision e2853eb86b033c1a5f683bbe898f3733235771dc)
+++ libcfa/src/concurrency/io/types.hfa	(revision 6c53a93454697f8b549b141d01803272ff074931)
@@ -185,10 +185,6 @@
 
 	// Wait for the future to be fulfilled
-	bool wait( io_future_t & this ) {
-		return wait(this.self);
-	}
-
-	void reset( io_future_t & this ) {
-		return reset(this.self);
-	}
+	bool wait     ( io_future_t & this ) { return wait     (this.self); }
+	void reset    ( io_future_t & this ) { return reset    (this.self); }
+	bool available( io_future_t & this ) { return available(this.self); }
 }
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision e2853eb86b033c1a5f683bbe898f3733235771dc)
+++ libcfa/src/concurrency/kernel.cfa	(revision 6c53a93454697f8b549b141d01803272ff074931)
@@ -27,4 +27,5 @@
 extern "C" {
 	#include <sys/eventfd.h>
+	#include <sys/uio.h>
 }
 
@@ -34,4 +35,5 @@
 #include "strstream.hfa"
 #include "device/cpu.hfa"
+#include "io/types.hfa"
 
 //Private includes
@@ -124,13 +126,17 @@
 static void __wake_one(cluster * cltr);
 
-static void mark_idle (__cluster_proc_list & idles, processor & proc);
+static void idle_sleep(processor * proc, io_future_t & future, iovec & iov);
+static bool mark_idle (__cluster_proc_list & idles, processor & proc);
 static void mark_awake(__cluster_proc_list & idles, processor & proc);
-static [unsigned idle, unsigned total, * processor] query_idles( & __cluster_proc_list idles );
 
 extern void __cfa_io_start( processor * );
 extern bool __cfa_io_drain( processor * );
-extern void __cfa_io_flush( processor * );
+extern bool __cfa_io_flush( processor *, int min_comp );
 extern void __cfa_io_stop ( processor * );
 static inline bool __maybe_io_drain( processor * );
+
+#if defined(CFA_WITH_IO_URING_IDLE)
+	extern bool __kernel_read(processor * proc, io_future_t & future, iovec &, int fd);
+#endif
 
 extern void __disable_interrupts_hard();
@@ -148,4 +154,5 @@
 	/* paranoid */ verify( __preemption_enabled() );
 }
+
 
 //=============================================================================================
@@ -163,4 +170,9 @@
 	verify(this);
 
+	io_future_t future; // used for idle sleep when io_uring is present
+	future.self.ptr = 1p;  // mark it as already fulfilled so we know if there is a pending request or not
+	eventfd_t idle_val;
+	iovec idle_iovec = { &idle_val, sizeof(idle_val) };
+
 	__cfa_io_start( this );
 
@@ -196,7 +208,5 @@
 
 			if( !readyThread ) {
-				ready_schedule_lock();
-				__cfa_io_flush( this );
-				ready_schedule_unlock();
+				__cfa_io_flush( this, 0 );
 
 				readyThread = __next_thread_slow( this->cltr );
@@ -213,5 +223,5 @@
 
 				// Push self to idle stack
-				mark_idle(this->cltr->procs, * this);
+				if(!mark_idle(this->cltr->procs, * this)) continue MAIN_LOOP;
 
 				// Confirm the ready-queue is empty
@@ -229,15 +239,98 @@
 				}
 
-				#if !defined(__CFA_NO_STATISTICS__)
-					if(this->print_halts) {
-						__cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->unique_id, rdtscl());
+				idle_sleep( this, future, idle_iovec );
+
+				// We were woken up, remove self from idle
+				mark_awake(this->cltr->procs, * this);
+
+				// DON'T just proceed, start looking again
+				continue MAIN_LOOP;
+			}
+
+			/* paranoid */ verify( readyThread );
+
+			// Reset io dirty bit
+			this->io.dirty = false;
+
+			// We found a thread run it
+			__run_thread(this, readyThread);
+
+			// Are we done?
+			if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) break MAIN_LOOP;
+
+			if(this->io.pending && !this->io.dirty) {
+				__cfa_io_flush( this, 0 );
+			}
+
+			#else
+				#warning new kernel loop
+			SEARCH: {
+				/* paranoid */ verify( ! __preemption_enabled() );
+
+				// First, lock the scheduler since we are searching for a thread
+				ready_schedule_lock();
+
+				// Try to get the next thread
+				readyThread = pop_fast( this->cltr );
+				if(readyThread) { ready_schedule_unlock(); break SEARCH; }
+
+				// If we can't find a thread, might as well flush any outstanding I/O
+				if(this->io.pending) { __cfa_io_flush( this, 0 ); }
+
+				// Spin a little on I/O, just in case
+				for(5) {
+					__maybe_io_drain( this );
+					readyThread = pop_fast( this->cltr );
+					if(readyThread) { ready_schedule_unlock(); break SEARCH; }
+				}
+
+				// no luck, try stealing a few times
+				for(5) {
+					if( __maybe_io_drain( this ) ) {
+						readyThread = pop_fast( this->cltr );
+					} else {
+						readyThread = pop_slow( this->cltr );
 					}
-				#endif
-
-				__cfadbg_print_safe(runtime_core, "Kernel : core %p waiting on eventfd %d\n", this, this->idle);
+					if(readyThread) { ready_schedule_unlock(); break SEARCH; }
+				}
+
+				// still no luck, search for a thread
+				readyThread = pop_search( this->cltr );
+				if(readyThread) { ready_schedule_unlock(); break SEARCH; }
+
+				// Don't block if we are done
+				if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) {
+					ready_schedule_unlock();
+					break MAIN_LOOP;
+				}
+
+				__STATS( __tls_stats()->ready.sleep.halts++; )
+
+				// Push self to idle stack
+				ready_schedule_unlock();
+				if(!mark_idle(this->cltr->procs, * this)) goto SEARCH;
+				ready_schedule_lock();
+
+				// Confirm the ready-queue is empty
+				__maybe_io_drain( this );
+				readyThread = pop_search( this->cltr );
+				ready_schedule_unlock();
+
+				if( readyThread ) {
+					// A thread was found, cancel the halt
+					mark_awake(this->cltr->procs, * this);
+
+					__STATS( __tls_stats()->ready.sleep.cancels++; )
+
+					// continue the main loop
+					break SEARCH;
+				}
+
+				__STATS( if(this->print_halts) __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->unique_id, rdtscl()); )
+				__cfadbg_print_safe(runtime_core, "Kernel : core %p waiting on eventfd %d\n", this, this->idle_fd);
 
 				{
 					eventfd_t val;
-					ssize_t ret = read( this->idle, &val, sizeof(val) );
+					ssize_t ret = read( this->idle_fd, &val, sizeof(val) );
 					if(ret < 0) {
 						switch((int)errno) {
@@ -255,9 +348,5 @@
 				}
 
-				#if !defined(__CFA_NO_STATISTICS__)
-					if(this->print_halts) {
-						__cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->unique_id, rdtscl());
-					}
-				#endif
+					__STATS( if(this->print_halts) __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->unique_id, rdtscl()); )
 
 				// We were woken up, remove self from idle
@@ -268,115 +357,4 @@
 			}
 
-			/* paranoid */ verify( readyThread );
-
-			// Reset io dirty bit
-			this->io.dirty = false;
-
-			// We found a thread run it
-			__run_thread(this, readyThread);
-
-			// Are we done?
-			if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) break MAIN_LOOP;
-
-			if(this->io.pending && !this->io.dirty) {
-				ready_schedule_lock();
-				__cfa_io_flush( this );
-				ready_schedule_unlock();
-			}
-
-			#else
-				#warning new kernel loop
-			SEARCH: {
-				/* paranoid */ verify( ! __preemption_enabled() );
-
-				// First, lock the scheduler since we are searching for a thread
-				ready_schedule_lock();
-
-				// Try to get the next thread
-				readyThread = pop_fast( this->cltr );
-				if(readyThread) { ready_schedule_unlock(); break SEARCH; }
-
-				// If we can't find a thread, might as well flush any outstanding I/O
-				if(this->io.pending) { __cfa_io_flush( this ); }
-
-				// Spin a little on I/O, just in case
-				for(5) {
-					__maybe_io_drain( this );
-					readyThread = pop_fast( this->cltr );
-					if(readyThread) { ready_schedule_unlock(); break SEARCH; }
-				}
-
-				// no luck, try stealing a few times
-				for(5) {
-					if( __maybe_io_drain( this ) ) {
-						readyThread = pop_fast( this->cltr );
-					} else {
-						readyThread = pop_slow( this->cltr );
-					}
-					if(readyThread) { ready_schedule_unlock(); break SEARCH; }
-				}
-
-				// still no luck, search for a thread
-				readyThread = pop_search( this->cltr );
-				if(readyThread) { ready_schedule_unlock(); break SEARCH; }
-
-				// Don't block if we are done
-				if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) {
-					ready_schedule_unlock();
-					break MAIN_LOOP;
-				}
-
-				__STATS( __tls_stats()->ready.sleep.halts++; )
-
-				// Push self to idle stack
-				ready_schedule_unlock();
-				mark_idle(this->cltr->procs, * this);
-				ready_schedule_lock();
-
-				// Confirm the ready-queue is empty
-				__maybe_io_drain( this );
-				readyThread = pop_search( this->cltr );
-				ready_schedule_unlock();
-
-				if( readyThread ) {
-					// A thread was found, cancel the halt
-					mark_awake(this->cltr->procs, * this);
-
-					__STATS( __tls_stats()->ready.sleep.cancels++; )
-
-					// continue the main loop
-					break SEARCH;
-				}
-
-				__STATS( if(this->print_halts) __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->unique_id, rdtscl()); )
-				__cfadbg_print_safe(runtime_core, "Kernel : core %p waiting on eventfd %d\n", this, this->idle);
-
-				{
-					eventfd_t val;
-					ssize_t ret = read( this->idle, &val, sizeof(val) );
-					if(ret < 0) {
-						switch((int)errno) {
-						case EAGAIN:
-						#if EAGAIN != EWOULDBLOCK
-							case EWOULDBLOCK:
-						#endif
-						case EINTR:
-							// No need to do anything special here, just assume it's a legitimate wake-up
-							break;
-						default:
-							abort( "KERNEL : internal error, read failure on idle eventfd, error(%d) %s.", (int)errno, strerror( (int)errno ) );
-						}
-					}
-				}
-
-					__STATS( if(this->print_halts) __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->unique_id, rdtscl()); )
-
-				// We were woken up, remove self from idle
-				mark_awake(this->cltr->procs, * this);
-
-				// DON'T just proceed, start looking again
-				continue MAIN_LOOP;
-			}
-
 		RUN_THREAD:
 			/* paranoid */ verify( ! __preemption_enabled() );
@@ -393,5 +371,5 @@
 
 			if(this->io.pending && !this->io.dirty) {
-				__cfa_io_flush( this );
+				__cfa_io_flush( this, 0 );
 			}
 
@@ -403,4 +381,9 @@
 
 		__cfadbg_print_safe(runtime_core, "Kernel : core %p stopping\n", this);
+	}
+
+	for(int i = 0; !available(future); i++) {
+		if(i > 1000) __cfaabi_dbg_write( "ERROR: kernel has been spinning on a flush after exit loop.\n", 61);
+		__cfa_io_flush( this, 1 );
 	}
 
@@ -766,16 +749,13 @@
 
 	// Check if there is a sleeping processor
-	processor * p;
-	unsigned idle;
-	unsigned total;
-	[idle, total, p] = query_idles(this->procs);
+	int fd = __atomic_load_n(&this->procs.fd, __ATOMIC_SEQ_CST);
 
 	// If no one is sleeping, we are done
-	if( idle == 0 ) return;
+	if( fd == 0 ) return;
 
 	// We found a processor, wake it up
 	eventfd_t val;
 	val = 1;
-	eventfd_write( p->idle, val );
+	eventfd_write( fd, val );
 
 	#if !defined(__CFA_NO_STATISTICS__)
@@ -802,17 +782,67 @@
 		eventfd_t val;
 		val = 1;
-		eventfd_write( this->idle, val );
+		eventfd_write( this->idle_fd, val );
 	__enable_interrupts_checked();
 }
 
-static void mark_idle(__cluster_proc_list & this, processor & proc) {
-	/* paranoid */ verify( ! __preemption_enabled() );
-	lock( this );
+static void idle_sleep(processor * this, io_future_t & future, iovec & iov) {
+	#if !defined(CFA_WITH_IO_URING_IDLE)
+		#if !defined(__CFA_NO_STATISTICS__)
+			if(this->print_halts) {
+				__cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->unique_id, rdtscl());
+			}
+		#endif
+
+		__cfadbg_print_safe(runtime_core, "Kernel : core %p waiting on eventfd %d\n", this, this->idle_fd);
+
+		{
+			eventfd_t val;
+			ssize_t ret = read( this->idle_fd, &val, sizeof(val) );
+			if(ret < 0) {
+				switch((int)errno) {
+				case EAGAIN:
+				#if EAGAIN != EWOULDBLOCK
+					case EWOULDBLOCK:
+				#endif
+				case EINTR:
+					// No need to do anything special here, just assume it's a legitimate wake-up
+					break;
+				default:
+					abort( "KERNEL : internal error, read failure on idle eventfd, error(%d) %s.", (int)errno, strerror( (int)errno ) );
+				}
+			}
+		}
+
+		#if !defined(__CFA_NO_STATISTICS__)
+			if(this->print_halts) {
+				__cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->unique_id, rdtscl());
+			}
+		#endif
+	#else
+		// Do we already have a pending read
+		if(available(future)) {
+			// There is no pending read, we need to add one
+			reset(future);
+
+			__kernel_read(this, future, iov, this->idle_fd );
+		}
+
+		__cfa_io_flush( this, 1 );
+	#endif
+}
+
+static bool mark_idle(__cluster_proc_list & this, processor & proc) {
+	/* paranoid */ verify( ! __preemption_enabled() );
+	if(!try_lock( this )) return false;
 		this.idle++;
 		/* paranoid */ verify( this.idle <= this.total );
 		remove(proc);
 		insert_first(this.idles, proc);
+
+		__atomic_store_n(&this.fd, proc.idle_fd, __ATOMIC_SEQ_CST);
 	unlock( this );
 	/* paranoid */ verify( ! __preemption_enabled() );
+
+	return true;
 }
 
@@ -824,25 +854,12 @@
 		remove(proc);
 		insert_last(this.actives, proc);
+
+		{
+			int fd = 0;
+			if(!this.idles`isEmpty) fd = this.idles`first.idle_fd;
+			__atomic_store_n(&this.fd, fd, __ATOMIC_SEQ_CST);
+		}
+
 	unlock( this );
-	/* paranoid */ verify( ! __preemption_enabled() );
-}
-
-static [unsigned idle, unsigned total, * processor] query_idles( & __cluster_proc_list this ) {
-	/* paranoid */ verify( ! __preemption_enabled() );
-	/* paranoid */ verify( ready_schedule_islocked() );
-
-	for() {
-		uint64_t l = __atomic_load_n(&this.lock, __ATOMIC_SEQ_CST);
-		if( 1 == (l % 2) ) { Pause(); continue; }
-		unsigned idle    = this.idle;
-		unsigned total   = this.total;
-		processor * proc = &this.idles`first;
-		// Compiler fence is unnecessary, but gcc-8 and older incorrectly reorder code without it
-		asm volatile("": : :"memory");
-		if(l != __atomic_load_n(&this.lock, __ATOMIC_SEQ_CST)) { Pause(); continue; }
-		return [idle, total, proc];
-	}
-
-	/* paranoid */ verify( ready_schedule_islocked() );
 	/* paranoid */ verify( ! __preemption_enabled() );
 }
@@ -906,10 +923,10 @@
 		if(head == tail) return false;
 		#if OLD_MAIN
-		ready_schedule_lock();
-		ret = __cfa_io_drain( proc );
-		ready_schedule_unlock();
+			ready_schedule_lock();
+			ret = __cfa_io_drain( proc );
+			ready_schedule_unlock();
 		#else
 			ret = __cfa_io_drain( proc );
-	#endif
+		#endif
 	#endif
 	return ret;
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision e2853eb86b033c1a5f683bbe898f3733235771dc)
+++ libcfa/src/concurrency/kernel.hfa	(revision 6c53a93454697f8b549b141d01803272ff074931)
@@ -100,5 +100,5 @@
 
 	// Idle lock (kernel semaphore)
-	int idle;
+	int idle_fd;
 
 	// Termination synchronisation (user semaphore)
@@ -195,5 +195,8 @@
 struct __cluster_proc_list {
 	// Spin lock protecting the queue
-	volatile uint64_t lock;
+	__spinlock_t lock;
+
+	// FD to use to wake a processor
+	volatile int fd;
 
 	// Total number of processors
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision e2853eb86b033c1a5f683bbe898f3733235771dc)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision 6c53a93454697f8b549b141d01803272ff074931)
@@ -527,6 +527,6 @@
 	this.local_data = 0p;
 
-	this.idle = eventfd(0, 0);
-	if (idle < 0) {
+	this.idle_fd = eventfd(0, 0);
+	if (idle_fd < 0) {
 		abort("KERNEL ERROR: PROCESSOR EVENTFD - %s\n", strerror(errno));
 	}
@@ -542,5 +542,5 @@
 // Not a ctor, it just preps the destruction but should not destroy members
 static void deinit(processor & this) {
-	close(this.idle);
+	close(this.idle_fd);
 }
 
@@ -584,5 +584,5 @@
 // Cluster
 static void ?{}(__cluster_proc_list & this) {
-	this.lock  = 0;
+	this.fd    = 0;
 	this.idle  = 0;
 	this.total = 0;
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision e2853eb86b033c1a5f683bbe898f3733235771dc)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision 6c53a93454697f8b549b141d01803272ff074931)
@@ -39,4 +39,14 @@
 }
 
+// Defines whether or not we *want* to use io_uring_enter as the idle_sleep blocking call
+#define CFA_WANT_IO_URING_IDLE
+
+// Defines whether or not we *can* use io_uring_enter as the idle_sleep blocking call
+#if defined(CFA_WANT_IO_URING_IDLE) && defined(CFA_HAVE_LINUX_IO_URING_H)
+	#if defined(CFA_HAVE_IORING_OP_READ) || (defined(CFA_HAVE_READV) && defined(CFA_HAVE_IORING_OP_READV))
+		#define CFA_WITH_IO_URING_IDLE
+	#endif
+#endif
+
 //-----------------------------------------------------------------------------
 // Scheduler
@@ -149,8 +159,4 @@
 	__atomic_store_n(ll, (bool)false, __ATOMIC_RELEASE);
 }
-
-
-
-
 
 //-----------------------------------------------------------------------
@@ -268,16 +274,27 @@
 	ready_schedule_lock();
 
-	// Simple counting lock, acquired, acquired by incrementing the counter
-	// to an odd number
-	for() {
-		uint64_t l = this.lock;
-		if(
-			(0 == (l % 2))
-			&& __atomic_compare_exchange_n(&this.lock, &l, l + 1, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
-		) return;
-		Pause();
-	}
-
-	/* paranoid */ verify( ! __preemption_enabled() );
+	lock( this.lock __cfaabi_dbg_ctx2 );
+
+	/* paranoid */ verify( ! __preemption_enabled() );
+}
+
+static inline bool try_lock(__cluster_proc_list & this) {
+	/* paranoid */ verify( ! __preemption_enabled() );
+
+	// Start by locking the global RWlock so that we know no-one is
+	// adding/removing processors while we mess with the idle lock
+	ready_schedule_lock();
+
+	if(try_lock( this.lock __cfaabi_dbg_ctx2 )) {
+		// success
+		/* paranoid */ verify( ! __preemption_enabled() );
+		return true;
+	}
+
+	// failed to lock
+	ready_schedule_unlock();
+
+	/* paranoid */ verify( ! __preemption_enabled() );
+	return false;
 }
 
@@ -285,7 +302,5 @@
 	/* paranoid */ verify( ! __preemption_enabled() );
 
-	/* paranoid */ verify( 1 == (this.lock % 2) );
-	// Simple couting lock, release by incrementing to an even number
-	__atomic_fetch_add( &this.lock, 1, __ATOMIC_SEQ_CST );
+	unlock(this.lock);
 
 	// Release the global lock, which we acquired when locking
Index: libcfa/src/device/cpu.cfa
===================================================================
--- libcfa/src/device/cpu.cfa	(revision e2853eb86b033c1a5f683bbe898f3733235771dc)
+++ libcfa/src/device/cpu.cfa	(revision 6c53a93454697f8b549b141d01803272ff074931)
@@ -30,4 +30,6 @@
 	#include <fcntl.h>
 }
+
+#include "algorithms/range_iterator.hfa"
 
 // search a string for character 'character' but looking atmost at len
@@ -135,5 +137,6 @@
 		count++;
 	}
-	iterate_dir(path, lambda);
+	int ret = iterate_dir(path, lambda);
+	if(ret == ENOTDIR) return 0;
 
 	/* paranoid */ verifyf(count == max + 1, "Inconsistent %s count, counted %d, but max %s was %d", prefix, count, prefix, (int)max);
@@ -143,5 +146,5 @@
 
 // Count number of cpus in the system
-static int count_cpus(void) {
+static [int, const char *] count_cpus(void) {
 	const char * fpath = "/sys/devices/system/cpu/online";
 	int fd = open(fpath, 0, O_RDONLY);
@@ -159,7 +162,5 @@
 
 	const char * _;
-	int cnt = read_width(buff, r - 1, &_);
-	/* paranoid */ verify(cnt == count_prefix_dirs("/sys/devices/system/cpu", "cpu"));
-	return cnt;
+	return [read_width(buff, r - 1, &_), strndup(buff, r - 1)];
 }
 
@@ -226,7 +227,7 @@
 
 struct raw_cache_instance {
-	idx_range_t range;
-	unsigned width;
-	unsigned char level;
+	idx_range_t range;	// A text description of the cpus covered
+	unsigned width;		// The number of cpus covered
+	unsigned char level;	// the cache level
 	// FIXME add at least size and type
 };
@@ -235,8 +236,13 @@
 static void ^?{}(raw_cache_instance & this) { free(this.range);}
 
-raw_cache_instance ** build_raw_cache_table(unsigned cpus, unsigned idxs, unsigned cache_levels)
+// Returns a 2D array of instances of size [cpu count][cache levels]
+// where cache level doesn't include instruction caches
+raw_cache_instance ** build_raw_cache_table(unsigned cpus_c, idx_range_t cpus, unsigned idxs, unsigned cache_levels)
 {
-	raw_cache_instance ** raw = alloc(cpus);
-	for(i; cpus) {
+	raw_cache_instance ** raw = alloc(cpus_c, '\0'`fill);
+
+	RangeIter rc = { cpus };
+	while(moveNext(rc)) {
+		unsigned i = rc.com;
 		raw[i] = alloc(cache_levels);
 		void addcache(unsigned fidx, unsigned char level, idx_range_t range, size_t len) {
@@ -263,16 +269,23 @@
 
 // returns an allocate list of all the different distinct last level caches
-static [*llc_map_t, size_t cnt] distinct_llcs(unsigned cpus, unsigned llc_idx, raw_cache_instance ** raw) {
+static [*llc_map_t, size_t cnt] distinct_llcs(idx_range_t cpus, unsigned llc_idx, raw_cache_instance ** raw) {
 	// Allocate at least one element
 	llc_map_t* ranges = alloc();
 	size_t range_cnt = 1;
 
+	RangeIter rc = { cpus };
+	__attribute__((unused)) bool ret =
+	moveNext(rc);
+	/* paranoid */ verify( ret );
+	/* paranoid */ verify( rc.com >= 0 );
+
 	// Initialize with element 0
-	ranges->raw = &raw[0][llc_idx];
+	ranges->raw = &raw[rc.com][llc_idx];
 	ranges->count = 0;
 	ranges->start = -1u;
 
 	// Go over all other cpus
-	CPU_LOOP: for(i; 1~cpus) {
+	CPU_LOOP: while(moveNext(rc)) {
+		unsigned i = rc.com;
 		// Check if the range is already there
 		raw_cache_instance * candidate = &raw[i][llc_idx];
@@ -304,8 +317,10 @@
 }
 
-static [[]cpu_pairing_t] get_cpu_pairings(unsigned cpus, raw_cache_instance ** raw, llc_map_t * maps, size_t map_cnt) {
-	cpu_pairing_t * pairings = alloc(cpus);
-
-	CPU_LOOP: for(i; cpus) {
+static [[]cpu_pairing_t] get_cpu_pairings(unsigned cpus_c, idx_range_t cpus, raw_cache_instance ** raw, llc_map_t * maps, size_t map_cnt) {
+	cpu_pairing_t * pairings = alloc(cpus_c);
+
+	RangeIter rc = { cpus };
+	CPU_LOOP: while(moveNext(rc)) {
+		unsigned i = rc.com;
 		pairings[i].cpu = i;
 		idx_range_t want = raw[i][0].range;
@@ -327,5 +342,18 @@
 extern "C" {
 	void __cfaabi_device_startup( void ) {
-		int cpus = count_cpus();
+		int cpus_c;
+		const char * cpus;
+		[cpus_c, cpus] = count_cpus();
+		#if defined(__CFA_WITH_VERIFY__)
+		// Verify that the mapping is self consistent.
+		{
+			RangeIter rc = { cpus };
+			while(moveNext(rc)) {
+				unsigned i = rc.com;
+				verify(cpus_c > i);
+			}
+		}
+		#endif
+
 		int idxs = count_cache_indexes();
 
@@ -333,5 +361,5 @@
 		unsigned cache_levels = 0;
 		unsigned llc = 0;
-		{
+		if (idxs != 0) {
 			unsigned char prev = -1u;
 			void first(unsigned idx, unsigned char level, const char * map, size_t len) {
@@ -345,5 +373,5 @@
 
 		// Read in raw data
-		raw_cache_instance ** raw = build_raw_cache_table(cpus, idxs, cache_levels);
+		raw_cache_instance ** raw = build_raw_cache_table(cpus_c, cpus, idxs, cache_levels);
 
 		// Find number of distinct cache instances
@@ -362,18 +390,20 @@
 				width2 += maps[i].raw->width;
 			}
-			verify(width1 == cpus);
-			verify(width2 == cpus);
+			verify(width1 == cpus_c);
+			verify(width2 == cpus_c);
 		}
 		#endif
 
 		// Get mappings from cpu to cache instance
-		cpu_pairing_t * pairings = get_cpu_pairings(cpus, raw, maps, map_cnt);
+		cpu_pairing_t * pairings = get_cpu_pairings(cpus_c, cpus, raw, maps, map_cnt);
 
 		// Sort by cache instance
-		qsort(pairings, cpus);
+		qsort(pairings, cpus_c);
 
 		{
 			unsigned it = 0;
-			for(i; cpus) {
+			RangeIter rc = { cpus };
+			while(moveNext(rc)) {
+				unsigned i = rc.com;
 				unsigned llc_id = pairings[i].id;
 				if(maps[llc_id].start == -1u) {
@@ -384,11 +414,14 @@
 				}
 			}
-			/* paranoid */ verify(it == cpus);
+			/* paranoid */ verify(it == cpus_c);
 		}
 
 		// From the mappings build the actual cpu map we want
-		struct cpu_map_entry_t * entries = alloc(cpus);
-		for(i; cpus) { entries[i].count = 0; }
-		for(i; cpus) {
+		struct cpu_map_entry_t * entries = alloc(cpus_c);
+		for(i; cpus_c) { entries[i].count = 0; }
+
+		RangeIter rc = { cpus };
+		while(moveNext(rc)) {
+			unsigned i = rc.com;
 			/* paranoid */ verify(pairings[i].id < map_cnt);
 			unsigned c = pairings[i].cpu;
@@ -406,6 +439,6 @@
 		free(pairings);
 
-		for(i; cpus) {
-			for(j; cache_levels) {
+		for(i; cpus_c) {
+			if( raw[i] ) for(j; cache_levels) {
 				^(raw[i][j]){};
 			}
@@ -415,5 +448,6 @@
 
 		cpu_info.llc_map = entries;
-		cpu_info.hthrd_count = cpus;
+		cpu_info.hthrd_count = cpus_c;
+		cpu_info.llc_count = map_cnt;
 	}
 
Index: libcfa/src/device/cpu.hfa
===================================================================
--- libcfa/src/device/cpu.hfa	(revision e2853eb86b033c1a5f683bbe898f3733235771dc)
+++ libcfa/src/device/cpu.hfa	(revision 6c53a93454697f8b549b141d01803272ff074931)
@@ -23,9 +23,12 @@
 
 struct cpu_info_t {
-	 // array of size [hthrd_count]
+	// Array of size [hthrd_count]
 	const cpu_map_entry_t * llc_map;
 
-	 // Number of _hardware_ threads present in the system
+	// Number of _hardware_ threads present in the system
 	size_t hthrd_count;
+
+	// Number of distinct last level caches
+	size_t llc_count;
 };
 
Index: libcfa/src/heap.cfa
===================================================================
--- libcfa/src/heap.cfa	(revision e2853eb86b033c1a5f683bbe898f3733235771dc)
+++ libcfa/src/heap.cfa	(revision 6c53a93454697f8b549b141d01803272ff074931)
@@ -10,6 +10,6 @@
 // Created On       : Tue Dec 19 21:58:35 2017
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Mon Aug  9 19:03:02 2021
-// Update Count     : 1040
+// Last Modified On : Sun Jan  2 23:29:41 2022
+// Update Count     : 1058
 //
 
@@ -263,26 +263,26 @@
 #ifdef __STATISTICS__
 // Heap statistics counters.
-static unsigned int malloc_zero_calls, malloc_calls;
-static unsigned long long int malloc_storage;
-static unsigned int aalloc_zero_calls, aalloc_calls;
-static unsigned long long int aalloc_storage;
-static unsigned int calloc_zero_calls, calloc_calls;
-static unsigned long long int calloc_storage;
-static unsigned int memalign_zero_calls, memalign_calls;
-static unsigned long long int memalign_storage;
-static unsigned int amemalign_zero_calls, amemalign_calls;
-static unsigned long long int amemalign_storage;
-static unsigned int cmemalign_zero_calls, cmemalign_calls;
-static unsigned long long int cmemalign_storage;
-static unsigned int resize_zero_calls, resize_calls;
-static unsigned long long int resize_storage;
-static unsigned int realloc_zero_calls, realloc_calls;
-static unsigned long long int realloc_storage;
-static unsigned int free_zero_calls, free_calls;
-static unsigned long long int free_storage;
+static unsigned int malloc_calls, malloc_0_calls;
+static unsigned long long int malloc_storage_request, malloc_storage_alloc;
+static unsigned int aalloc_calls, aalloc_0_calls;
+static unsigned long long int aalloc_storage_request, aalloc_storage_alloc;
+static unsigned int calloc_calls, calloc_0_calls;
+static unsigned long long int calloc_storage_request, calloc_storage_alloc;
+static unsigned int memalign_calls, memalign_0_calls;
+static unsigned long long int memalign_storage_request, memalign_storage_alloc;
+static unsigned int amemalign_calls, amemalign_0_calls;
+static unsigned long long int amemalign_storage_request, amemalign_storage_alloc;
+static unsigned int cmemalign_calls, cmemalign_0_calls;
+static unsigned long long int cmemalign_storage_request, cmemalign_storage_alloc;
+static unsigned int resize_calls, resize_0_calls;
+static unsigned long long int resize_storage_request, resize_storage_alloc;
+static unsigned int realloc_calls, realloc_0_calls;
+static unsigned long long int realloc_storage_request, realloc_storage_alloc;
+static unsigned int free_calls, free_null_calls;
+static unsigned long long int free_storage_request, free_storage_alloc;
 static unsigned int mmap_calls;
-static unsigned long long int mmap_storage;
+static unsigned long long int mmap_storage_request, mmap_storage_alloc;
 static unsigned int munmap_calls;
-static unsigned long long int munmap_storage;
+static unsigned long long int munmap_storage_request, munmap_storage_alloc;
 static unsigned int sbrk_calls;
 static unsigned long long int sbrk_storage;
@@ -294,29 +294,29 @@
 	char helpText[1024];
 	__cfaabi_bits_print_buffer( STDERR_FILENO, helpText, sizeof(helpText),
-								"\nHeap statistics:\n"
-								"  malloc    0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
-								"  aalloc    0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
-								"  calloc    0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
-								"  memalign  0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
-								"  amemalign 0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
-								"  cmemalign 0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
-								"  resize    0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
-								"  realloc   0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
-								"  free      0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
-								"  mmap      calls %'u; storage %'llu bytes\n"
-								"  munmap    calls %'u; storage %'llu bytes\n"
-								"  sbrk      calls %'u; storage %'llu bytes\n",
-								malloc_zero_calls, malloc_calls, malloc_storage,
-								aalloc_zero_calls, aalloc_calls, aalloc_storage,
-								calloc_zero_calls, calloc_calls, calloc_storage,
-								memalign_zero_calls, memalign_calls, memalign_storage,
-								amemalign_zero_calls, amemalign_calls, amemalign_storage,
-								cmemalign_zero_calls, cmemalign_calls, cmemalign_storage,
-								resize_zero_calls, resize_calls, resize_storage,
-								realloc_zero_calls, realloc_calls, realloc_storage,
-								free_zero_calls, free_calls, free_storage,
-								mmap_calls, mmap_storage,
-								munmap_calls, munmap_storage,
-								sbrk_calls, sbrk_storage
+								"\nHeap statistics: (storage request / allocation + header)\n"
+								"  malloc    >0 calls %'u; 0 calls %'u; storage %'llu / %'llu bytes\n"
+								"  aalloc    >0 calls %'u; 0 calls %'u; storage %'llu / %'llu bytes\n"
+								"  calloc    >0 calls %'u; 0 calls %'u; storage %'llu / %'llu bytes\n"
+								"  memalign  >0 calls %'u; 0 calls %'u; storage %'llu / %'llu bytes\n"
+								"  amemalign >0 calls %'u; 0 calls %'u; storage %'llu / %'llu bytes\n"
+								"  cmemalign >0 calls %'u; 0 calls %'u; storage %'llu / %'llu bytes\n"
+								"  resize    >0 calls %'u; 0 calls %'u; storage %'llu / %'llu bytes\n"
+								"  realloc   >0 calls %'u; 0 calls %'u; storage %'llu / %'llu bytes\n"
+								"  free      !null calls %'u; null calls %'u; storage %'llu / %'llu bytes\n"
+								"  sbrk      calls %'u; storage %'llu bytes\n"
+								"  mmap      calls %'u; storage %'llu / %'llu bytes\n"
+								"  munmap    calls %'u; storage %'llu / %'llu bytes\n",
+								malloc_calls, malloc_0_calls, malloc_storage_request, malloc_storage_alloc,
+								aalloc_calls, aalloc_0_calls, aalloc_storage_request, aalloc_storage_alloc,
+								calloc_calls, calloc_0_calls, calloc_storage_request, calloc_storage_alloc,
+								memalign_calls, memalign_0_calls, memalign_storage_request, memalign_storage_alloc,
+								amemalign_calls, amemalign_0_calls, amemalign_storage_request, amemalign_storage_alloc,
+								cmemalign_calls, cmemalign_0_calls, cmemalign_storage_request, cmemalign_storage_alloc,
+								resize_calls, resize_0_calls, resize_storage_request, resize_storage_alloc,
+								realloc_calls, realloc_0_calls, realloc_storage_request, realloc_storage_alloc,
+								free_calls, free_null_calls, free_storage_request, free_storage_alloc,
+								sbrk_calls, sbrk_storage,
+								mmap_calls, mmap_storage_request, mmap_storage_alloc,
+								munmap_calls, munmap_storage_request, munmap_storage_alloc
 		);
 } // printStats
@@ -329,29 +329,29 @@
 						"<sizes>\n"
 						"</sizes>\n"
-						"<total type=\"malloc\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
-						"<total type=\"aalloc\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
-						"<total type=\"calloc\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
-						"<total type=\"memalign\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
-						"<total type=\"amemalign\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
-						"<total type=\"cmemalign\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
-						"<total type=\"resize\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
-						"<total type=\"realloc\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
-						"<total type=\"free\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
-						"<total type=\"mmap\" count=\"%'u;\" size=\"%'llu\"/> bytes\n"
-						"<total type=\"munmap\" count=\"%'u;\" size=\"%'llu\"/> bytes\n"
+						"<total type=\"malloc\" >0 count=\"%'u;\" 0 count=\"%'u;\" size=\"%'llu / %'llu\"/> bytes\n"
+						"<total type=\"aalloc\" >0 count=\"%'u;\" 0 count=\"%'u;\" size=\"%'llu / %'llu\"/> bytes\n"
+						"<total type=\"calloc\" >0 count=\"%'u;\" 0 count=\"%'u;\" size=\"%'llu / %'llu\"/> bytes\n"
+						"<total type=\"memalign\" >0 count=\"%'u;\" 0 count=\"%'u;\" size=\"%'llu / %'llu\"/> bytes\n"
+						"<total type=\"amemalign\" >0 count=\"%'u;\" 0 count=\"%'u;\" size=\"%'llu / %'llu\"/> bytes\n"
+						"<total type=\"cmemalign\" >0 count=\"%'u;\" 0 count=\"%'u;\" size=\"%'llu / %'llu\"/> bytes\n"
+						"<total type=\"resize\" >0 count=\"%'u;\" 0 count=\"%'u;\" size=\"%'llu / %'llu\"/> bytes\n"
+						"<total type=\"realloc\" >0 count=\"%'u;\" 0 count=\"%'u;\" size=\"%'llu / %'llu\"/> bytes\n"
+						"<total type=\"free\" !null=\"%'u;\" 0 null=\"%'u;\" size=\"%'llu / %'llu\"/> bytes\n"
 						"<total type=\"sbrk\" count=\"%'u;\" size=\"%'llu\"/> bytes\n"
+						"<total type=\"mmap\" count=\"%'u;\" size=\"%'llu / %'llu\"/> bytes\n"
+						"<total type=\"munmap\" count=\"%'u;\" size=\"%'llu / %'llu\"/> bytes\n"
 						"</malloc>",
-						malloc_zero_calls, malloc_calls, malloc_storage,
-						aalloc_zero_calls, aalloc_calls, aalloc_storage,
-						calloc_zero_calls, calloc_calls, calloc_storage,
-						memalign_zero_calls, memalign_calls, memalign_storage,
-						amemalign_zero_calls, amemalign_calls, amemalign_storage,
-						cmemalign_zero_calls, cmemalign_calls, cmemalign_storage,
-						resize_zero_calls, resize_calls, resize_storage,
-						realloc_zero_calls, realloc_calls, realloc_storage,
-						free_zero_calls, free_calls, free_storage,
-						mmap_calls, mmap_storage,
-						munmap_calls, munmap_storage,
-						sbrk_calls, sbrk_storage
+						malloc_calls, malloc_0_calls, malloc_storage_request, malloc_storage_alloc,
+						aalloc_calls, aalloc_0_calls, aalloc_storage_request, aalloc_storage_alloc,
+						calloc_calls, calloc_0_calls, calloc_storage_request, calloc_storage_alloc,
+						memalign_calls, memalign_0_calls, memalign_storage_request, memalign_storage_alloc,
+						amemalign_calls, amemalign_0_calls, amemalign_storage_request, amemalign_storage_alloc,
+						cmemalign_calls, cmemalign_0_calls, cmemalign_storage_request, cmemalign_storage_alloc,
+						resize_calls, resize_0_calls, resize_storage_request, resize_storage_alloc,
+						realloc_calls, realloc_0_calls, realloc_storage_request, realloc_storage_alloc,
+						free_calls, free_null_calls, free_storage_request, free_storage_alloc,
+						sbrk_calls, sbrk_storage,
+						mmap_calls, mmap_storage_request, mmap_storage_alloc,
+						munmap_calls, munmap_storage_request, munmap_storage_alloc
 		);
 	__cfaabi_bits_write( fileno( stream ), helpText, len );	// ensures all bytes written or exit
@@ -577,5 +577,6 @@
 		#ifdef __STATISTICS__
 		__atomic_add_fetch( &mmap_calls, 1, __ATOMIC_SEQ_CST );
-		__atomic_add_fetch( &mmap_storage, tsize, __ATOMIC_SEQ_CST );
+		__atomic_add_fetch( &mmap_storage_request, size, __ATOMIC_SEQ_CST );
+		__atomic_add_fetch( &mmap_storage_alloc, tsize, __ATOMIC_SEQ_CST );
 		#endif // __STATISTICS__
 
@@ -626,5 +627,6 @@
 		#ifdef __STATISTICS__
 		__atomic_add_fetch( &munmap_calls, 1, __ATOMIC_SEQ_CST );
-		__atomic_add_fetch( &munmap_storage, size, __ATOMIC_SEQ_CST );
+		__atomic_add_fetch( &munmap_storage_request, header->kind.real.size, __ATOMIC_SEQ_CST );
+		__atomic_add_fetch( &munmap_storage_alloc, size, __ATOMIC_SEQ_CST );
 		#endif // __STATISTICS__
 		if ( munmap( header, size ) == -1 ) {
@@ -642,5 +644,6 @@
 		#ifdef __STATISTICS__
 		__atomic_add_fetch( &free_calls, 1, __ATOMIC_SEQ_CST );
-		__atomic_add_fetch( &free_storage, size, __ATOMIC_SEQ_CST );
+		__atomic_add_fetch( &free_storage_request, header->kind.real.size, __ATOMIC_SEQ_CST );
+		__atomic_add_fetch( &free_storage_alloc, size, __ATOMIC_SEQ_CST );
 		#endif // __STATISTICS__
 
@@ -819,7 +822,7 @@
 		if ( likely( size > 0 ) ) {
 			__atomic_add_fetch( &malloc_calls, 1, __ATOMIC_SEQ_CST );
-			__atomic_add_fetch( &malloc_storage, size, __ATOMIC_SEQ_CST );
+			__atomic_add_fetch( &malloc_storage_request, size, __ATOMIC_SEQ_CST );
 		} else {
-			__atomic_add_fetch( &malloc_zero_calls, 1, __ATOMIC_SEQ_CST );
+			__atomic_add_fetch( &malloc_0_calls, 1, __ATOMIC_SEQ_CST );
 		} // if
 		#endif // __STATISTICS__
@@ -835,7 +838,7 @@
 		if ( likely( size > 0 ) ) {
 			__atomic_add_fetch( &aalloc_calls, 1, __ATOMIC_SEQ_CST );
-			__atomic_add_fetch( &aalloc_storage, size, __ATOMIC_SEQ_CST );
+			__atomic_add_fetch( &aalloc_storage_request, size, __ATOMIC_SEQ_CST );
 		} else {
-			__atomic_add_fetch( &aalloc_zero_calls, 1, __ATOMIC_SEQ_CST );
+			__atomic_add_fetch( &aalloc_0_calls, 1, __ATOMIC_SEQ_CST );
 		} // if
 		#endif // __STATISTICS__
@@ -850,5 +853,5 @@
 	  if ( unlikely( size ) == 0 ) {			// 0 BYTE ALLOCATION RETURNS NULL POINTER
 			#ifdef __STATISTICS__
-			__atomic_add_fetch( &calloc_zero_calls, 1, __ATOMIC_SEQ_CST );
+			__atomic_add_fetch( &calloc_0_calls, 1, __ATOMIC_SEQ_CST );
 			#endif // __STATISTICS__
 			return 0p;
@@ -856,5 +859,5 @@
 		#ifdef __STATISTICS__
 		__atomic_add_fetch( &calloc_calls, 1, __ATOMIC_SEQ_CST );
-		__atomic_add_fetch( &calloc_storage, dim * elemSize, __ATOMIC_SEQ_CST );
+		__atomic_add_fetch( &calloc_storage_request, dim * elemSize, __ATOMIC_SEQ_CST );
 		#endif // __STATISTICS__
 
@@ -891,5 +894,5 @@
 	  if ( unlikely( size == 0 ) ) {					// special cases
 			#ifdef __STATISTICS__
-			__atomic_add_fetch( &resize_zero_calls, 1, __ATOMIC_SEQ_CST );
+			__atomic_add_fetch( &resize_0_calls, 1, __ATOMIC_SEQ_CST );
 			#endif // __STATISTICS__
 			free( oaddr );
@@ -902,5 +905,5 @@
 	  if ( unlikely( oaddr == 0p ) ) {
 			#ifdef __STATISTICS__
-			__atomic_add_fetch( &resize_storage, size, __ATOMIC_SEQ_CST );
+			__atomic_add_fetch( &resize_storage_request, size, __ATOMIC_SEQ_CST );
 			#endif // __STATISTICS__
 			return mallocNoStats( size );
@@ -921,5 +924,5 @@
 
 		#ifdef __STATISTICS__
-		__atomic_add_fetch( &resize_storage, size, __ATOMIC_SEQ_CST );
+		__atomic_add_fetch( &resize_storage_request, size, __ATOMIC_SEQ_CST );
 		#endif // __STATISTICS__
 
@@ -936,5 +939,5 @@
 	  if ( unlikely( size == 0 ) ) {					// special cases
 			#ifdef __STATISTICS__
-			__atomic_add_fetch( &realloc_zero_calls, 1, __ATOMIC_SEQ_CST );
+			__atomic_add_fetch( &realloc_0_calls, 1, __ATOMIC_SEQ_CST );
 			#endif // __STATISTICS__
 			free( oaddr );
@@ -947,5 +950,5 @@
 	  if ( unlikely( oaddr == 0p ) ) {
 			#ifdef __STATISTICS__
-			__atomic_add_fetch( &realloc_storage, size, __ATOMIC_SEQ_CST );
+			__atomic_add_fetch( &realloc_storage_request, size, __ATOMIC_SEQ_CST );
 			#endif // __STATISTICS__
 			return mallocNoStats( size );
@@ -969,5 +972,5 @@
 
 		#ifdef __STATISTICS__
-	  	__atomic_add_fetch( &realloc_storage, size, __ATOMIC_SEQ_CST );
+	  	__atomic_add_fetch( &realloc_storage_request, size, __ATOMIC_SEQ_CST );
 		#endif // __STATISTICS__
 
@@ -1000,7 +1003,7 @@
 		if ( likely( size > 0 ) ) {
 			__atomic_add_fetch( &memalign_calls, 1, __ATOMIC_SEQ_CST );
-			__atomic_add_fetch( &memalign_storage, size, __ATOMIC_SEQ_CST );
+			__atomic_add_fetch( &memalign_storage_request, size, __ATOMIC_SEQ_CST );
 		} else {
-			__atomic_add_fetch( &memalign_zero_calls, 1, __ATOMIC_SEQ_CST );
+			__atomic_add_fetch( &memalign_0_calls, 1, __ATOMIC_SEQ_CST );
 		} // if
 		#endif // __STATISTICS__
@@ -1016,7 +1019,7 @@
 		if ( likely( size > 0 ) ) {
 			__atomic_add_fetch( &cmemalign_calls, 1, __ATOMIC_SEQ_CST );
-			__atomic_add_fetch( &cmemalign_storage, size, __ATOMIC_SEQ_CST );
+			__atomic_add_fetch( &cmemalign_storage_request, size, __ATOMIC_SEQ_CST );
 		} else {
-			__atomic_add_fetch( &cmemalign_zero_calls, 1, __ATOMIC_SEQ_CST );
+			__atomic_add_fetch( &cmemalign_0_calls, 1, __ATOMIC_SEQ_CST );
 		} // if
 		#endif // __STATISTICS__
@@ -1031,5 +1034,5 @@
 	  if ( unlikely( size ) == 0 ) {					// 0 BYTE ALLOCATION RETURNS NULL POINTER
 			#ifdef __STATISTICS__
-			__atomic_add_fetch( &cmemalign_zero_calls, 1, __ATOMIC_SEQ_CST );
+			__atomic_add_fetch( &cmemalign_0_calls, 1, __ATOMIC_SEQ_CST );
 			#endif // __STATISTICS__
 			return 0p;
@@ -1037,5 +1040,5 @@
 		#ifdef __STATISTICS__
 		__atomic_add_fetch( &cmemalign_calls, 1, __ATOMIC_SEQ_CST );
-		__atomic_add_fetch( &cmemalign_storage, dim * elemSize, __ATOMIC_SEQ_CST );
+		__atomic_add_fetch( &cmemalign_storage_request, dim * elemSize, __ATOMIC_SEQ_CST );
 		#endif // __STATISTICS__
 
@@ -1101,5 +1104,5 @@
 	  if ( unlikely( addr == 0p ) ) {					// special case
 			#ifdef __STATISTICS__
-			__atomic_add_fetch( &free_zero_calls, 1, __ATOMIC_SEQ_CST );
+			__atomic_add_fetch( &free_null_calls, 1, __ATOMIC_SEQ_CST );
 			#endif // __STATISTICS__
 
@@ -1280,5 +1283,5 @@
   if ( unlikely( size == 0 ) ) {						// special cases
 		#ifdef __STATISTICS__
-		__atomic_add_fetch( &resize_zero_calls, 1, __ATOMIC_SEQ_CST );
+		__atomic_add_fetch( &resize_0_calls, 1, __ATOMIC_SEQ_CST );
 		#endif // __STATISTICS__
 		free( oaddr );
@@ -1294,5 +1297,5 @@
 		#ifdef __STATISTICS__
 		__atomic_add_fetch( &resize_calls, 1, __ATOMIC_SEQ_CST );
-		__atomic_add_fetch( &resize_storage, size, __ATOMIC_SEQ_CST );
+		__atomic_add_fetch( &resize_storage_request, size, __ATOMIC_SEQ_CST );
 		#endif // __STATISTICS__
 		return memalignNoStats( nalign, size );
@@ -1329,5 +1332,5 @@
 
 	#ifdef __STATISTICS__
-	__atomic_add_fetch( &resize_storage, size, __ATOMIC_SEQ_CST );
+	__atomic_add_fetch( &resize_storage_request, size, __ATOMIC_SEQ_CST );
 	#endif // __STATISTICS__
 
@@ -1342,5 +1345,5 @@
   if ( unlikely( size == 0 ) ) {						// special cases
 		#ifdef __STATISTICS__
-		__atomic_add_fetch( &realloc_zero_calls, 1, __ATOMIC_SEQ_CST );
+		__atomic_add_fetch( &realloc_0_calls, 1, __ATOMIC_SEQ_CST );
 		#endif // __STATISTICS__
 		free( oaddr );
@@ -1356,5 +1359,5 @@
 		#ifdef __STATISTICS__
 		__atomic_add_fetch( &realloc_calls, 1, __ATOMIC_SEQ_CST );
-		__atomic_add_fetch( &realloc_storage, size, __ATOMIC_SEQ_CST );
+		__atomic_add_fetch( &realloc_storage_request, size, __ATOMIC_SEQ_CST );
 		#endif // __STATISTICS__
 		return memalignNoStats( nalign, size );
@@ -1380,5 +1383,5 @@
 	#ifdef __STATISTICS__
 	__atomic_add_fetch( &realloc_calls, 1, __ATOMIC_SEQ_CST );
-	__atomic_add_fetch( &realloc_storage, size, __ATOMIC_SEQ_CST );
+	__atomic_add_fetch( &realloc_storage_request, size, __ATOMIC_SEQ_CST );
 	#endif // __STATISTICS__
 
Index: libcfa/src/stdlib.cfa
===================================================================
--- libcfa/src/stdlib.cfa	(revision e2853eb86b033c1a5f683bbe898f3733235771dc)
+++ libcfa/src/stdlib.cfa	(revision 6c53a93454697f8b549b141d01803272ff074931)
@@ -10,6 +10,6 @@
 // Created On       : Thu Jan 28 17:10:29 2016
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Thu Nov 12 07:46:09 2020
-// Update Count     : 503
+// Last Modified On : Mon Jan  3 09:36:27 2022
+// Update Count     : 519
 //
 
@@ -221,7 +221,34 @@
 //---------------------------------------
 
-bool threading_enabled(void) __attribute__((weak)) {
-	return false;
-}
+static uint32_t seed = 0;								// current seed
+static thread_local uint32_t state;						// random state
+
+void set_seed( uint32_t seed_ ) { state = seed = seed_; }
+uint32_t get_seed() { return seed; }
+
+#define GENERATOR LCG
+
+inline uint32_t MarsagliaXor( uint32_t & state ) {
+	if ( unlikely( seed == 0 ) ) set_seed( rdtscl() );
+	else if ( unlikely( state == 0 ) ) state = seed;
+	state ^= state << 6;
+	state ^= state >> 21;
+	state ^= state << 7;
+	return state;
+} // MarsagliaXor
+
+inline uint32_t LCG( uint32_t & state ) {				// linear congruential generator
+	if ( unlikely( seed == 0 ) ) set_seed( rdtscl() );
+	else if ( unlikely( state == 0 ) ) state = seed;
+	return state = 36969 * (state & 65535) + (state >> 16); // 36969 is NOT prime!
+} // LCG
+
+uint32_t prng( PRNG & prng ) with( prng ) { callcnt += 1; return GENERATOR( state ); }
+
+uint32_t prng( void ) { return GENERATOR( state ); }
+
+//---------------------------------------
+
+bool threading_enabled( void ) __attribute__(( weak )) { return false; }
 
 // Local Variables: //
Index: libcfa/src/stdlib.hfa
===================================================================
--- libcfa/src/stdlib.hfa	(revision e2853eb86b033c1a5f683bbe898f3733235771dc)
+++ libcfa/src/stdlib.hfa	(revision 6c53a93454697f8b549b141d01803272ff074931)
@@ -10,6 +10,6 @@
 // Created On       : Thu Jan 28 17:12:35 2016
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Tue Apr 20 21:20:03 2021
-// Update Count     : 575
+// Last Modified On : Sun Jan  2 22:53:57 2022
+// Update Count     : 594
 //
 
@@ -43,33 +43,30 @@
 //---------------------------------------
 
-// Macro because of returns
-#define ARRAY_ALLOC$( allocation, alignment, dim ) \
-	if ( _Alignof(T) <= libAlign() ) return (T *)(void *)allocation( dim, (size_t)sizeof(T) ); /* C allocation */ \
-	else return (T *)alignment( _Alignof(T), dim, sizeof(T) )
-
 static inline forall( T & | sized(T) ) {
 	// CFA safe equivalents, i.e., implicit size specification
 
 	T * malloc( void ) {
-		if ( _Alignof(T) <= libAlign() ) return (T *)(void *)malloc( (size_t)sizeof(T) ); // C allocation
+		if ( _Alignof(T) <= libAlign() ) return (T *)malloc( sizeof(T) ); // C allocation
 		else return (T *)memalign( _Alignof(T), sizeof(T) );
 	} // malloc
 
 	T * aalloc( size_t dim ) {
-		ARRAY_ALLOC$( aalloc, amemalign, dim );
+		if ( _Alignof(T) <= libAlign() ) return (T *)aalloc( dim, sizeof(T) ); // C allocation
+		else return (T *)amemalign( _Alignof(T), dim, sizeof(T) );
 	} // aalloc
 
 	T * calloc( size_t dim ) {
-		ARRAY_ALLOC$( calloc, cmemalign, dim );
+		if ( _Alignof(T) <= libAlign() ) return (T *)calloc( dim, sizeof(T) ); // C allocation
+		else return (T *)cmemalign( _Alignof(T), dim, sizeof(T) );
 	} // calloc
 
 	T * resize( T * ptr, size_t size ) {				// CFA resize, eliminate return-type cast
-		if ( _Alignof(T) <= libAlign() ) return (T *)(void *)resize( (void *)ptr, size ); // CFA resize
-		else return (T *)(void *)resize( (void *)ptr, _Alignof(T), size ); // CFA resize
+		if ( _Alignof(T) <= libAlign() ) return (T *)resize( (void *)ptr, size ); // CFA resize
+		else return (T *)resize( (void *)ptr, _Alignof(T), size ); // CFA resize
 	} // resize
 
 	T * realloc( T * ptr, size_t size ) {				// CFA realloc, eliminate return-type cast
-		if ( _Alignof(T) <= libAlign() ) return (T *)(void *)realloc( (void *)ptr, size ); // C realloc
-		else return (T *)(void *)realloc( (void *)ptr, _Alignof(T), size ); // CFA realloc
+		if ( _Alignof(T) <= libAlign() ) return (T *)realloc( (void *)ptr, size ); // C realloc
+		else return (T *)realloc( (void *)ptr, _Alignof(T), size ); // CFA realloc
 	} // realloc
 
@@ -169,6 +166,7 @@
 		return ret;
 	}
+	S_fill(T) 		?`fill ( zero_t ) = void; // FIX ME: remove this once ticket 214 is resolved
+	S_fill(T) 		?`fill ( T * a ) 				{ return (S_fill(T)){ 'T', '0', 0, a }; } // FIX ME: remove this once ticket 214 is resolved
 	S_fill(T) 		?`fill ( char c ) 				{ return (S_fill(T)){ 'c', c };	}
-	S_fill(T) 		?`fill ( T * a ) 				{ return (S_fill(T)){ 'T', '0', 0, a }; }
 	S_fill(T) 		?`fill ( T a[], size_t nmemb ) 	{ return (S_fill(T)){ 'a', '0', nmemb * sizeof(T), a }; }
 
@@ -362,9 +360,9 @@
 
 static inline {
-	long int random( long int l, long int u ) { if ( u < l ) [u, l] = [l, u]; return lrand48() % (u - l) + l; } // [l,u)
-	long int random( long int u ) { if ( u < 0 ) return random( u, 0 ); else return random( 0, u ); } // [0,u)
+	long int random( long int l, long int u ) { if ( u < l ) [u, l] = [l, u]; return lrand48() % (u - l + 1) + l; } // [l,u]
+	long int random( long int u ) { return random( 0, u - 1 ); } // [0,u)
 	unsigned long int random( void ) { return lrand48(); }
 	unsigned long int random( unsigned long int u ) { return lrand48() % u; } // [0,u)
-	unsigned long int random( unsigned long int l, unsigned long int u ) { if ( u < l ) [u, l] = [l, u]; return lrand48() % (u - l) + l; } // [l,u)
+	unsigned long int random( unsigned long int l, unsigned long int u ) { if ( u < l ) [u, l] = [l, u]; return lrand48() % (u - l + 1) + l; } // [l,u]
 
 	char random( void ) { return (unsigned long int)random(); }
@@ -387,4 +385,33 @@
 //---------------------------------------
 
+struct PRNG {
+	uint32_t callcnt;									// call count
+	uint32_t seed;										// current seed
+	uint32_t state;										// random state
+}; // PRNG
+
+extern uint32_t prng( PRNG & prng ) __attribute__(( warn_unused_result )); // [0,UINT_MAX]
+static inline {
+	void set_seed( PRNG & prng, uint32_t seed_ ) with( prng ) { state = seed = seed_; } // set seed
+	void ?{}( PRNG & prng ) { set_seed( prng, rdtscl() ); }	// random seed
+	void ?{}( PRNG & prng, uint32_t seed ) { set_seed( prng, seed ); } // fixed seed
+	uint32_t get_seed( PRNG & prng ) __attribute__(( warn_unused_result )) with( prng ) { return seed; } // get seed
+	uint32_t prng( PRNG & prng, uint32_t u ) __attribute__(( warn_unused_result )) { return prng( prng ) % u; } // [0,u)
+	uint32_t prng( PRNG & prng, uint32_t l, uint32_t u ) __attribute__(( warn_unused_result )) { return prng( prng, u - l + 1 ) + l; } // [l,u]
+	uint32_t calls( PRNG & prng ) __attribute__(( warn_unused_result )) with( prng ) { return callcnt; }
+} // distribution
+
+extern void set_seed( uint32_t seed );					// set per thread seed
+extern uint32_t get_seed();								// get seed
+extern uint32_t prng( void ) __attribute__(( warn_unused_result )); // [0,UINT_MAX]
+static inline {
+	uint32_t prng( uint32_t u ) __attribute__(( warn_unused_result ));
+	uint32_t prng( uint32_t u ) { return prng() % u; }	// [0,u)
+	uint32_t prng( uint32_t l, uint32_t u ) __attribute__(( warn_unused_result ));
+	uint32_t prng( uint32_t l, uint32_t u ) { return prng( u - l + 1 ) + l; } // [l,u]
+} // distribution
+
+//---------------------------------------
+
 extern bool threading_enabled( void ) OPTIONAL_THREAD;
 
