Index: libcfa/prelude/Makefile.am
===================================================================
--- libcfa/prelude/Makefile.am	(revision c1d8cde984cd89087477e50f07a84dafebe8d305)
+++ libcfa/prelude/Makefile.am	(revision 97fed4458f48d4add95cb1d99ccefd2a77340fde)
@@ -11,6 +11,6 @@
 ## Created On       : Sun May 31 08:54:01 2015
 ## Last Modified By : Peter A. Buhr
-## Last Modified On : Mon Feb  3 21:27:18 2020
-## Update Count     : 208
+## Last Modified On : Thu Jan 13 17:06:27 2022
+## Update Count     : 215
 ###############################################################################
 
@@ -37,17 +37,19 @@
 # create extra forward types/declarations to reduce inclusion of library files
 extras.cf : ${srcdir}/extras.regx ${srcdir}/extras.c
-	${AM_V_GEN}gcc ${AM_CFLAGS} -E ${srcdir}/extras.c | grep -f ${srcdir}/extras.regx > extras.cf
-	${AM_V_GEN}gcc ${AM_CFLAGS} -E ${srcdir}/extras.c | grep -zo -f ${srcdir}/extras.regx2 | tr '\0' '\n' >> extras.cf
+	@echo '# 2 "${@}"  // needed for error messages from this file' > ${@}
+	${AM_V_GEN}gcc ${AM_CFLAGS} -E ${srcdir}/extras.c | grep -f ${srcdir}/extras.regx >> ${@}
+	${AM_V_GEN}gcc ${AM_CFLAGS} -E ${srcdir}/extras.c | grep -zo -f ${srcdir}/extras.regx2 | tr '\0' '\n' >> ${@}
 
 # create forward declarations for gcc builtins
 gcc-builtins.cf : gcc-builtins.c ${srcdir}/prototypes.sed
-	${AM_V_GEN}gcc -I${srcdir} -E -P $< | sed -r -f ${srcdir}/prototypes.sed > $@
+	@echo '# 2 "${@}"  // needed for error messages from this file' > ${@}
+	${AM_V_GEN}gcc -I${srcdir} -E -P $< | sed -r -f ${srcdir}/prototypes.sed >> ${@}
 
 gcc-builtins.c : ${srcdir}/builtins.def ${srcdir}/prototypes.awk ${srcdir}/sync-builtins.cf ${srcdir}/prototypes.c
-	${AM_V_GEN}gcc -I${srcdir} -E ${srcdir}/prototypes.c | awk -f ${srcdir}/prototypes.awk > $@
+	${AM_V_GEN}gcc -I${srcdir} -E ${srcdir}/prototypes.c | awk -f ${srcdir}/prototypes.awk > ${@}
 
 prelude.cfa : prelude-gen.cc
 	${AM_V_GEN}${CXX} ${AM_CXXFLAGS} ${CXXFLAGS} ${AM_CFLAGS} ${<} -o prelude-gen -Wall -Wextra -O2 -g -std=c++14
-	@./prelude-gen > $@
+	@./prelude-gen > ${@}
 	@rm ./prelude-gen
 
@@ -58,5 +60,5 @@
 # create forward declarations for cfa builtins
 builtins.cf : builtins.c @LOCAL_CFACC@
-	${AM_V_GEN}gcc ${AM_CFLAGS} -E -P ${<} -o ${@} -MD -MP -MF $(DEPDIR)/builtins.Po -D__cforall
+	${AM_V_GEN}gcc ${AM_CFLAGS} -E ${<} -o ${@} -MD -MP -MF $(DEPDIR)/builtins.Po -D__cforall
 	${AM_V_at}sed -i 's/builtins.o/builtins.cf/g' $(DEPDIR)/builtins.Po
 
@@ -64,5 +66,5 @@
 
 bootloader.c : ${srcdir}/bootloader.cf prelude.cfa extras.cf gcc-builtins.cf builtins.cf @CFACPP@
-	${AM_V_GEN}@CFACPP@ --prelude-dir=${builddir} -tpm ${srcdir}/bootloader.cf $@  # use src/cfa-cpp as not in lib until after install
+	${AM_V_GEN}@CFACPP@ --prelude-dir=${builddir} -tpm ${srcdir}/bootloader.cf ${@}  # use src/cfa-cpp as not in lib until after install
 
 maintainer-clean-local :
Index: libcfa/src/bits/random.hfa
===================================================================
--- libcfa/src/bits/random.hfa	(revision c1d8cde984cd89087477e50f07a84dafebe8d305)
+++ libcfa/src/bits/random.hfa	(revision 97fed4458f48d4add95cb1d99ccefd2a77340fde)
@@ -1,17 +1,34 @@
+// 
+// Cforall Version 1.0.0 Copyright (C) 2022 University of Waterloo
+// 
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// random.hfa -- 
+// 
+// Author           : Peter A. Buhr
+// Created On       : Fri Jan 14 07:18:11 2022
+// Last Modified By : Peter A. Buhr
+// Last Modified On : Fri Jan 14 07:18:58 2022
+// Update Count     : 1
+// 
+
 #pragma once
 
 #include <stdint.h>
 
+// Pipelined to allow out-of-order overlap with reduced dependencies. Critically, the current random state is returned
+// (copied), and then compute and store the next random value.
+
+#if defined(__SIZEOF_INT128__)
 //--------------------------------------------------
-#if defined(__SIZEOF_INT128__)
-	typedef __uint128_t __lehmer64_state_t;
-	static inline uint64_t __lehmer64( __lehmer64_state_t & state ) {
+	static inline uint64_t lehmer64( __uint128_t & state ) {
+		__uint128_t ret = state;
 		state *= 0xda942042e4dd58b5;
-		return state >> 64;
+		return ret >> 64;
 	}
 
 //--------------------------------------------------
-	typedef uint64_t __wyhash64_state_t;
-	static inline uint64_t __wyhash64( __wyhash64_state_t & state ) {
+	static inline uint64_t wyhash64( uint64_t & state ) {
 		state += 0x60bee2bee120fc15;
 		__uint128_t tmp;
@@ -25,12 +42,20 @@
 
 //--------------------------------------------------
-typedef uint64_t __xorshift64_state_t;
-static inline uint64_t __xorshift64( __xorshift64_state_t & state ) {
-	uint64_t x = state;
-	x ^= x << 13;
-	x ^= x >> 7;
-	x ^= x << 17;
-	return state = x;
+static inline uint64_t xorshift_13_7_17( uint64_t & state ) {
+	uint64_t ret = state;
+	state ^= state << 13;
+	state ^= state >> 7;
+	state ^= state << 17;
+	return ret;
 }
+
+//--------------------------------------------------
+static inline uint32_t xorshift_6_21_7( uint32_t & state ) {
+	uint32_t ret = state;
+	state ^= state << 6;
+	state ^= state >> 21;
+	state ^= state << 7;
+	return ret;
+} // xorshift_6_21_7
 
 //--------------------------------------------------
@@ -38,9 +63,10 @@
   uint32_t a, b, c, d;
   uint32_t counter;
-} __xorwow__state_t;
+} xorwow__state_t;
 
-/* The state array must be initialized to not be all zero in the first four words */
-static inline uint32_t __xorwow( __xorwow__state_t & state ) {
-	/* Algorithm "xorwow" from p. 5 of Marsaglia, "Xorshift RNGs" */
+// The state array must be initialized to not be all zero in the first four words.
+static inline uint32_t xorwow( xorwow__state_t & state ) {
+	// Algorithm "xorwow" from p. 5 of Marsaglia, "Xorshift RNGs".
+	uint32_t ret = state.a + state.counter;
 	uint32_t t = state.d;
 
@@ -56,4 +82,36 @@
 
 	state.counter += 362437;
-	return t + state.counter;
+	return ret;
 }
+
+//--------------------------------------------------
+static inline uint32_t LCG( uint32_t & state ) {		// linear congruential generator
+	uint32_t ret = state;
+	state = 36969 * (state & 65535) + (state >> 16);	// 36969 is NOT prime! Do not change it!
+	return ret;
+} // LCG
+
+//--------------------------------------------------
+#define M  (1_l64u << 48_l64u)
+#define A  (25214903917_l64u)
+#define AI (18446708753438544741_l64u)
+#define C  (11_l64u)
+#define D  (16_l64u)
+
+// Bi-directional LCG random-number generator
+static inline uint32_t LCGBI_fwd( uint64_t & state ) {
+	state = (A * state + C) & (M - 1);
+	return state >> D;
+}
+
+static inline uint32_t LCGBI_bck( uint64_t & state ) {
+	unsigned int r = state >> D;
+	state = AI * (state - C) & (M - 1);
+	return r;
+}
+
+#undef M
+#undef A
+#undef AI
+#undef C
+#undef D
Index: libcfa/src/common.hfa
===================================================================
--- libcfa/src/common.hfa	(revision c1d8cde984cd89087477e50f07a84dafebe8d305)
+++ libcfa/src/common.hfa	(revision 97fed4458f48d4add95cb1d99ccefd2a77340fde)
@@ -1,10 +1,10 @@
-// 
+//
 // Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
 //
 // The contents of this file are covered under the licence agreement in the
 // file "LICENCE" distributed with Cforall.
-// 
-// common -- 
-// 
+//
+// common.hfa --
+//
 // Author           : Peter A. Buhr
 // Created On       : Wed Jul 11 17:54:36 2018
@@ -12,5 +12,5 @@
 // Last Modified On : Wed May  5 14:02:04 2021
 // Update Count     : 18
-// 
+//
 
 #pragma once
Index: libcfa/src/concurrency/clib/cfathread.cfa
===================================================================
--- libcfa/src/concurrency/clib/cfathread.cfa	(revision c1d8cde984cd89087477e50f07a84dafebe8d305)
+++ libcfa/src/concurrency/clib/cfathread.cfa	(revision 97fed4458f48d4add95cb1d99ccefd2a77340fde)
@@ -22,4 +22,5 @@
 #include "thread.hfa"
 #include "time.hfa"
+#include "stdlib.hfa"
 
 #include "cfathread.h"
@@ -195,5 +196,5 @@
 				eevent.data.u64 = (uint64_t)active_thread();
 
-				int id = thread_rand() % poller_cnt;
+				int id = prng() % poller_cnt;
 				if(0 != epoll_ctl(poller_fds[id], EPOLL_CTL_ADD, fd, &eevent))
 				{
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision c1d8cde984cd89087477e50f07a84dafebe8d305)
+++ libcfa/src/concurrency/io.cfa	(revision 97fed4458f48d4add95cb1d99ccefd2a77340fde)
@@ -144,34 +144,38 @@
 		__ioarbiter_flush( ctx );
 
-		__STATS__( true, io.calls.flush++; )
-		int ret = syscall( __NR_io_uring_enter, ctx.fd, ctx.sq.to_submit, min_comp, min_comp > 0 ? IORING_ENTER_GETEVENTS : 0, (sigset_t *)0p, _NSIG / 8);
-		if( ret < 0 ) {
-			switch((int)errno) {
-			case EAGAIN:
-			case EINTR:
-			case EBUSY:
-				// Update statistics
-				__STATS__( false, io.calls.errors.busy ++; )
-				return false;
-			default:
-				abort( "KERNEL ERROR: IO_URING SYSCALL - (%d) %s\n", (int)errno, strerror(errno) );
+		if(ctx.sq.to_submit != 0 || min_comp > 0) {
+
+			__STATS__( true, io.calls.flush++; )
+			int ret = syscall( __NR_io_uring_enter, ctx.fd, ctx.sq.to_submit, min_comp, min_comp > 0 ? IORING_ENTER_GETEVENTS : 0, (sigset_t *)0p, _NSIG / 8);
+			if( ret < 0 ) {
+				switch((int)errno) {
+				case EAGAIN:
+				case EINTR:
+				case EBUSY:
+					// Update statistics
+					__STATS__( false, io.calls.errors.busy ++; )
+					return false;
+				default:
+					abort( "KERNEL ERROR: IO_URING SYSCALL - (%d) %s\n", (int)errno, strerror(errno) );
+				}
 			}
-		}
-
-		__cfadbg_print_safe(io, "Kernel I/O : %u submitted to io_uring %d\n", ret, ctx.fd);
-		__STATS__( true, io.calls.submitted += ret; )
-		/* paranoid */ verify( ctx.sq.to_submit <= *ctx.sq.num );
-		/* paranoid */ verify( ctx.sq.to_submit >= ret );
-
-		ctx.sq.to_submit -= ret;
-
-		/* paranoid */ verify( ctx.sq.to_submit <= *ctx.sq.num );
-
-		// Release the consumed SQEs
-		__release_sqes( ctx );
-
-		/* paranoid */ verify( ! __preemption_enabled() );
-
-		ctx.proc->io.pending = false;
+
+			__cfadbg_print_safe(io, "Kernel I/O : %u submitted to io_uring %d\n", ret, ctx.fd);
+			__STATS__( true, io.calls.submitted += ret; )
+			/* paranoid */ verify( ctx.sq.to_submit <= *ctx.sq.num );
+			/* paranoid */ verify( ctx.sq.to_submit >= ret );
+
+			ctx.sq.to_submit -= ret;
+
+			/* paranoid */ verify( ctx.sq.to_submit <= *ctx.sq.num );
+
+			// Release the consumed SQEs
+			__release_sqes( ctx );
+
+			/* paranoid */ verify( ! __preemption_enabled() );
+
+			ctx.proc->io.pending = false;
+		}
+
 		ready_schedule_lock();
 		bool ret = __cfa_io_drain( proc );
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision c1d8cde984cd89087477e50f07a84dafebe8d305)
+++ libcfa/src/concurrency/kernel.cfa	(revision 97fed4458f48d4add95cb1d99ccefd2a77340fde)
@@ -205,8 +205,4 @@
 				// Don't block if we are done
 				if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) break MAIN_LOOP;
-
-				#if !defined(__CFA_NO_STATISTICS__)
-					__tls_stats()->ready.sleep.halts++;
-				#endif
 
 				// Push self to idle stack
@@ -732,30 +728,46 @@
 // Wake a thread from the front if there are any
 static void __wake_one(cluster * this) {
+	eventfd_t val;
+
 	/* paranoid */ verify( ! __preemption_enabled() );
 	/* paranoid */ verify( ready_schedule_islocked() );
 
 	// Check if there is a sleeping processor
-	// int fd = __atomic_load_n(&this->procs.fd, __ATOMIC_SEQ_CST);
-	int fd = 0;
-	if( __atomic_load_n(&this->procs.fd, __ATOMIC_SEQ_CST) != 0 ) {
-		fd = __atomic_exchange_n(&this->procs.fd, 0, __ATOMIC_RELAXED);
-	}
-
-	// If no one is sleeping, we are done
-	if( fd == 0 ) return;
-
-	// We found a processor, wake it up
-	eventfd_t val;
-	val = 1;
-	eventfd_write( fd, val );
-
-	#if !defined(__CFA_NO_STATISTICS__)
-		if( kernelTLS().this_stats ) {
-			__tls_stats()->ready.sleep.wakes++;
-		}
-		else {
-			__atomic_fetch_add(&this->stats->ready.sleep.wakes, 1, __ATOMIC_RELAXED);
-		}
-	#endif
+	struct __fd_waitctx * fdp = __atomic_load_n(&this->procs.fdw, __ATOMIC_SEQ_CST);
+
+	// If no one is sleeping: we are done
+	if( fdp == 0p ) return;
+
+	int fd = 1;
+	if( __atomic_load_n(&fdp->fd, __ATOMIC_SEQ_CST) != 1 ) {
+		fd = __atomic_exchange_n(&fdp->fd, 1, __ATOMIC_RELAXED);
+	}
+
+	switch(fd) {
+	case 0:
+		// If the processor isn't ready to sleep then the exchange will already wake it up
+		#if !defined(__CFA_NO_STATISTICS__)
+			if( kernelTLS().this_stats ) { __tls_stats()->ready.sleep.early++;
+			} else { __atomic_fetch_add(&this->stats->ready.sleep.early, 1, __ATOMIC_RELAXED); }
+		#endif
+		break;
+	case 1:
+		// If someone else already said they will wake them: we are done
+		#if !defined(__CFA_NO_STATISTICS__)
+			if( kernelTLS().this_stats ) { __tls_stats()->ready.sleep.seen++;
+			} else { __atomic_fetch_add(&this->stats->ready.sleep.seen, 1, __ATOMIC_RELAXED); }
+		#endif
+		break;
+	default:
+		// If the processor was ready to sleep, we need to wake it up with an actual write
+		val = 1;
+		eventfd_write( fd, val );
+
+		#if !defined(__CFA_NO_STATISTICS__)
+			if( kernelTLS().this_stats ) { __tls_stats()->ready.sleep.wakes++;
+			} else { __atomic_fetch_add(&this->stats->ready.sleep.wakes, 1, __ATOMIC_RELAXED); }
+		#endif
+		break;
+	}
 
 	/* paranoid */ verify( ready_schedule_islocked() );
@@ -770,4 +782,6 @@
 
 	__cfadbg_print_safe(runtime_core, "Kernel : waking Processor %p\n", this);
+
+	this->idle_wctx.fd = 1;
 
 	eventfd_t val;
@@ -779,4 +793,19 @@
 
 static void idle_sleep(processor * this, io_future_t & future, iovec & iov) {
+	// Tell everyone we are ready to go do sleep
+	for() {
+		int expected = this->idle_wctx.fd;
+
+		// Someone already told us to wake-up! No time for a nap.
+		if(expected == 1) { return; }
+
+		// Try to mark that we are going to sleep
+		if(__atomic_compare_exchange_n(&this->idle_wctx.fd, &expected, this->idle_fd, false,  __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) ) {
+			// Everyone agreed, taking a nap
+			break;
+		}
+	}
+
+
 	#if !defined(CFA_WITH_IO_URING_IDLE)
 		#if !defined(__CFA_NO_STATISTICS__)
@@ -825,4 +854,10 @@
 
 static bool mark_idle(__cluster_proc_list & this, processor & proc) {
+	#if !defined(__CFA_NO_STATISTICS__)
+		__tls_stats()->ready.sleep.halts++;
+	#endif
+
+	proc.idle_wctx.fd = 0;
+
 	/* paranoid */ verify( ! __preemption_enabled() );
 	if(!try_lock( this )) return false;
@@ -832,5 +867,5 @@
 		insert_first(this.idles, proc);
 
-		__atomic_store_n(&this.fd, proc.idle_fd, __ATOMIC_SEQ_CST);
+		__atomic_store_n(&this.fdw, &proc.idle_wctx, __ATOMIC_SEQ_CST);
 	unlock( this );
 	/* paranoid */ verify( ! __preemption_enabled() );
@@ -848,7 +883,7 @@
 
 		{
-			int fd = 0;
-			if(!this.idles`isEmpty) fd = this.idles`first.idle_fd;
-			__atomic_store_n(&this.fd, fd, __ATOMIC_SEQ_CST);
+			struct __fd_waitctx * wctx = 0;
+			if(!this.idles`isEmpty) wctx = &this.idles`first.idle_wctx;
+			__atomic_store_n(&this.fdw, wctx, __ATOMIC_SEQ_CST);
 		}
 
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision c1d8cde984cd89087477e50f07a84dafebe8d305)
+++ libcfa/src/concurrency/kernel.hfa	(revision 97fed4458f48d4add95cb1d99ccefd2a77340fde)
@@ -55,4 +55,9 @@
 };
 
+
+struct __fd_waitctx {
+	volatile int fd;
+};
+
 // Wrapper around kernel threads
 struct __attribute__((aligned(128))) processor {
@@ -67,6 +72,5 @@
 		unsigned target;
 		unsigned last;
-		unsigned cnt;
-		unsigned long long int cutoff;
+		signed   cpu;
 	} rdq;
 
@@ -102,4 +106,7 @@
 	int idle_fd;
 
+	// Idle waitctx
+	struct __fd_waitctx idle_wctx;
+
 	// Termination synchronisation (user semaphore)
 	oneshot terminated;
@@ -152,4 +159,8 @@
 	volatile unsigned long long tv;
 	volatile unsigned long long ma;
+};
+
+struct __attribute__((aligned(16))) __cache_id_t {
+	volatile unsigned id;
 };
 
@@ -164,6 +175,10 @@
 static inline void ^?{}(__timestamp_t & this) {}
 
+struct __attribute__((aligned(128))) __ready_queue_caches_t;
+void  ?{}(__ready_queue_caches_t & this);
+void ^?{}(__ready_queue_caches_t & this);
+
 //TODO adjust cache size to ARCHITECTURE
-// Structure holding the relaxed ready queue
+// Structure holding the ready queue
 struct __ready_queue_t {
 	// Data tracking the actual lanes
@@ -178,4 +193,6 @@
 		__timestamp_t * volatile tscs;
 
+		__cache_id_t * volatile caches;
+
 		// Array of stats
 		__help_cnts_t * volatile help;
@@ -198,5 +215,5 @@
 
 	// FD to use to wake a processor
-	volatile int fd;
+	struct __fd_waitctx * volatile fdw;
 
 	// Total number of processors
Index: libcfa/src/concurrency/kernel/fwd.hfa
===================================================================
--- libcfa/src/concurrency/kernel/fwd.hfa	(revision c1d8cde984cd89087477e50f07a84dafebe8d305)
+++ libcfa/src/concurrency/kernel/fwd.hfa	(revision 97fed4458f48d4add95cb1d99ccefd2a77340fde)
@@ -79,32 +79,17 @@
 			return
 			#if defined(__SIZEOF_INT128__)
-				__lehmer64( kernelTLS().rand_seed );
+				lehmer64( kernelTLS().rand_seed );
 			#else
-				__xorshift64( kernelTLS().rand_seed );
+				xorshift_13_7_17( kernelTLS().rand_seed );
 			#endif
 		}
 
-		#define M  (1_l64u << 48_l64u)
-		#define A  (25214903917_l64u)
-		#define AI (18446708753438544741_l64u)
-		#define C  (11_l64u)
-		#define D  (16_l64u)
-
 		static inline unsigned __tls_rand_fwd() {
-			kernelTLS().ready_rng.fwd_seed = (A * kernelTLS().ready_rng.fwd_seed + C) & (M - 1);
-			return kernelTLS().ready_rng.fwd_seed >> D;
+			return LCGBI_fwd( kernelTLS().ready_rng.fwd_seed );
 		}
 
 		static inline unsigned __tls_rand_bck() {
-			unsigned int r = kernelTLS().ready_rng.bck_seed >> D;
-			kernelTLS().ready_rng.bck_seed = AI * (kernelTLS().ready_rng.bck_seed - C) & (M - 1);
-			return r;
-		}
-
-		#undef M
-		#undef A
-		#undef AI
-		#undef C
-		#undef D
+			return LCGBI_bck( kernelTLS().ready_rng.bck_seed );
+		}
 
 		static inline void __tls_rand_advance_bck(void) {
@@ -140,6 +125,4 @@
 			}
 		}
-
-		extern uint64_t thread_rand();
 
 		// Semaphore which only supports a single thread
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision c1d8cde984cd89087477e50f07a84dafebe8d305)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision 97fed4458f48d4add95cb1d99ccefd2a77340fde)
@@ -34,4 +34,5 @@
 #include "kernel_private.hfa"
 #include "startup.hfa"          // STARTUP_PRIORITY_XXX
+#include "limits.hfa"
 #include "math.hfa"
 
@@ -177,5 +178,4 @@
 
 
-
 //=============================================================================================
 // Kernel Setup logic
@@ -515,8 +515,9 @@
 	this.rdq.its = 0;
 	this.rdq.itr = 0;
-	this.rdq.id  = -1u;
-	this.rdq.target = -1u;
-	this.rdq.last = -1u;
-	this.rdq.cutoff = 0ull;
+	this.rdq.id  = MAX;
+	this.rdq.target = MAX;
+	this.rdq.last = MAX;
+	this.rdq.cpu = 0;
+	// this.rdq.cutoff = 0ull;
 	do_terminate = false;
 	preemption_alarm = 0p;
@@ -536,4 +537,11 @@
 	}
 
+	this.idle_wctx.fd = 0;
+
+	// I'm assuming these two are reserved for standard input and output
+	// so I'm using them as sentinels with idle_wctx.
+	/* paranoid */ verify( this.idle_fd != 0 );
+	/* paranoid */ verify( this.idle_fd != 1 );
+
 	#if !defined(__CFA_NO_STATISTICS__)
 		print_stats = 0;
@@ -589,5 +597,5 @@
 // Cluster
 static void ?{}(__cluster_proc_list & this) {
-	this.fd    = 0;
+	this.fdw   = 0p;
 	this.idle  = 0;
 	this.total = 0;
@@ -686,4 +694,6 @@
 	uint_fast32_t last_size;
 	[this->unique_id, last_size] = ready_mutate_register();
+
+		this->rdq.cpu = __kernel_getcpu();
 
 		this->cltr->procs.total += 1u;
Index: libcfa/src/concurrency/locks.hfa
===================================================================
--- libcfa/src/concurrency/locks.hfa	(revision c1d8cde984cd89087477e50f07a84dafebe8d305)
+++ libcfa/src/concurrency/locks.hfa	(revision 97fed4458f48d4add95cb1d99ccefd2a77340fde)
@@ -29,103 +29,4 @@
 #include "time_t.hfa"
 #include "time.hfa"
-
-//-----------------------------------------------------------------------------
-// Semaphores
-
-// '0-nary' semaphore
-// Similar to a counting semaphore except the value of one is never reached
-// as a consequence, a V() that would bring the value to 1 *spins* until
-// a P consumes it
-struct Semaphore0nary {
-	__spinlock_t lock; // needed to protect
-	mpsc_queue(thread$) queue;
-};
-
-static inline bool P(Semaphore0nary & this, thread$ * thrd) {
-	/* paranoid */ verify(!thrd`next);
-	/* paranoid */ verify(!(&(*thrd)`next));
-
-	push(this.queue, thrd);
-	return true;
-}
-
-static inline bool P(Semaphore0nary & this) {
-    thread$ * thrd = active_thread();
-    P(this, thrd);
-    park();
-    return true;
-}
-
-static inline thread$ * V(Semaphore0nary & this, bool doUnpark = true) {
-	thread$ * next;
-	lock(this.lock __cfaabi_dbg_ctx2);
-		for (;;) {
-			next = pop(this.queue);
-			if (next) break;
-			Pause();
-		}
-	unlock(this.lock);
-
-	if (doUnpark) unpark(next);
-	return next;
-}
-
-// Wrapper used on top of any sempahore to avoid potential locking
-struct BinaryBenaphore {
-	volatile ssize_t counter;
-};
-
-static inline {
-	void ?{}(BinaryBenaphore & this) { this.counter = 0; }
-	void ?{}(BinaryBenaphore & this, zero_t) { this.counter = 0; }
-	void ?{}(BinaryBenaphore & this, one_t ) { this.counter = 1; }
-
-	// returns true if no blocking needed
-	bool P(BinaryBenaphore & this) {
-		return __atomic_fetch_sub(&this.counter, 1, __ATOMIC_SEQ_CST) > 0;
-	}
-
-	bool tryP(BinaryBenaphore & this) {
-		ssize_t c = this.counter;
-		/* paranoid */ verify( c > MIN );
-		return (c >= 1) && __atomic_compare_exchange_n(&this.counter, &c, c-1, false, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
-	}
-
-	// returns true if notify needed
-	bool V(BinaryBenaphore & this) {
-		ssize_t c = 0;
-		for () {
-			/* paranoid */ verify( this.counter < MAX );
-			if (__atomic_compare_exchange_n(&this.counter, &c, c+1, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-				if (c == 0) return true;
-				/* paranoid */ verify(c < 0);
-				return false;
-			} else {
-				if (c == 1) return true;
-				/* paranoid */ verify(c < 1);
-				Pause();
-			}
-		}
-	}
-}
-
-// Binary Semaphore based on the BinaryBenaphore on top of the 0-nary Semaphore
-struct ThreadBenaphore {
-	BinaryBenaphore ben;
-	Semaphore0nary  sem;
-};
-
-static inline void ?{}(ThreadBenaphore & this) {}
-static inline void ?{}(ThreadBenaphore & this, zero_t) { (this.ben){ 0 }; }
-static inline void ?{}(ThreadBenaphore & this, one_t ) { (this.ben){ 1 }; }
-
-static inline bool P(ThreadBenaphore & this)              { return P(this.ben) ? false : P(this.sem); }
-static inline bool tryP(ThreadBenaphore & this)           { return tryP(this.ben); }
-static inline bool P(ThreadBenaphore & this, bool wait)   { return wait ? P(this) : tryP(this); }
-
-static inline thread$ * V(ThreadBenaphore & this, bool doUnpark = true) {
-	if (V(this.ben)) return 0p;
-	return V(this.sem, doUnpark);
-}
 
 //-----------------------------------------------------------------------------
@@ -171,51 +72,4 @@
 static inline void   on_wakeup( owner_lock & this, size_t v ) { on_wakeup ( (blocking_lock &)this, v ); }
 static inline void   on_notify( owner_lock & this, struct thread$ * t ) { on_notify( (blocking_lock &)this, t ); }
-
-struct fast_lock {
-	thread$ * volatile owner;
-	ThreadBenaphore sem;
-};
-
-static inline void ?{}(fast_lock & this) __attribute__((deprecated("use linear_backoff_then_block_lock instead")));
-static inline void ?{}(fast_lock & this) { this.owner = 0p; }
-
-static inline bool $try_lock(fast_lock & this, thread$ * thrd) {
-    thread$ * exp = 0p;
-    return __atomic_compare_exchange_n(&this.owner, &exp, thrd, false, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
-}
-
-static inline void lock( fast_lock & this ) __attribute__((deprecated("use linear_backoff_then_block_lock instead"), artificial));
-static inline void lock( fast_lock & this ) {
-	thread$ * thrd = active_thread();
-	/* paranoid */verify(thrd != this.owner);
-
-	for (;;) {
-		if ($try_lock(this, thrd)) return;
-		P(this.sem);
-	}
-}
-
-static inline bool try_lock( fast_lock & this ) __attribute__((deprecated("use linear_backoff_then_block_lock instead"), artificial));
-static inline bool try_lock ( fast_lock & this ) {
-	thread$ * thrd = active_thread();
-	/* paranoid */ verify(thrd != this.owner);
-	return $try_lock(this, thrd);
-}
-
-static inline thread$ * unlock( fast_lock & this ) __attribute__((deprecated("use linear_backoff_then_block_lock instead"), artificial));
-static inline thread$ * unlock( fast_lock & this ) {
-	/* paranoid */ verify(active_thread() == this.owner);
-
-	// open 'owner' before unlocking anyone
-	// so new and unlocked threads don't park incorrectly.
-	// This may require additional fencing on ARM.
-	this.owner = 0p;
-
-	return V(this.sem);
-}
-
-static inline size_t on_wait( fast_lock & this ) { unlock(this); return 0; }
-static inline void on_wakeup( fast_lock & this, size_t ) { lock(this); }
-static inline void on_notify( fast_lock &, struct thread$ * t ) { unpark(t); }
 
 struct mcs_node {
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision c1d8cde984cd89087477e50f07a84dafebe8d305)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision 97fed4458f48d4add95cb1d99ccefd2a77340fde)
@@ -20,7 +20,8 @@
 
 
-#define USE_RELAXED_FIFO
+// #define USE_RELAXED_FIFO
 // #define USE_WORK_STEALING
 // #define USE_CPU_WORK_STEALING
+#define USE_AWARE_STEALING
 
 #include "bits/defs.hfa"
@@ -29,4 +30,5 @@
 
 #include "stdlib.hfa"
+#include "limits.hfa"
 #include "math.hfa"
 
@@ -54,5 +56,8 @@
 #endif
 
-#if   defined(USE_CPU_WORK_STEALING)
+#if   defined(USE_AWARE_STEALING)
+	#define READYQ_SHARD_FACTOR 2
+	#define SEQUENTIAL_SHARD 2
+#elif defined(USE_CPU_WORK_STEALING)
 	#define READYQ_SHARD_FACTOR 2
 #elif defined(USE_RELAXED_FIFO)
@@ -138,5 +143,4 @@
 	__kernel_rseq_register();
 
-	__cfadbg_print_safe(ready_queue, "Kernel : Registering proc %p for RW-Lock\n", proc);
 	bool * handle = (bool *)&kernelTLS().sched_lock;
 
@@ -174,6 +178,4 @@
 	}
 
-	__cfadbg_print_safe(ready_queue, "Kernel : Registering proc %p done, id %lu\n", proc, n);
-
 	// Return new spot.
 	/* paranoid */ verify(n < ready);
@@ -190,6 +192,4 @@
 
 	__atomic_store_n(cell, 0p, __ATOMIC_RELEASE);
-
-	__cfadbg_print_safe(ready_queue, "Kernel : Unregister proc %p\n", proc);
 
 	__kernel_rseq_unregister();
@@ -201,5 +201,4 @@
 uint_fast32_t ready_mutate_lock( void ) with(*__scheduler_lock) {
 	/* paranoid */ verify( ! __preemption_enabled() );
-	/* paranoid */ verify( ! kernelTLS().sched_lock );
 
 	// Step 1 : lock global lock
@@ -207,4 +206,9 @@
 	//   to simply lock their own lock and enter.
 	__atomic_acquire( &write_lock );
+
+	// Make sure we won't deadlock ourself
+	// Checking before acquiring the writer lock isn't safe
+	// because someone else could have locked us.
+	/* paranoid */ verify( ! kernelTLS().sched_lock );
 
 	// Step 2 : lock per-proc lock
@@ -244,11 +248,45 @@
 
 //=======================================================================
+// caches handling
+
+struct __attribute__((aligned(128))) __ready_queue_caches_t {
+	// Count States:
+	// - 0  : No one is looking after this cache
+	// - 1  : No one is looking after this cache, BUT it's not empty
+	// - 2+ : At least one processor is looking after this cache
+	volatile unsigned count;
+};
+
+void  ?{}(__ready_queue_caches_t & this) { this.count = 0; }
+void ^?{}(__ready_queue_caches_t & this) {}
+
+static inline void depart(__ready_queue_caches_t & cache) {
+	/* paranoid */ verify( cache.count > 1);
+	__atomic_fetch_add(&cache.count, -1, __ATOMIC_SEQ_CST);
+	/* paranoid */ verify( cache.count != 0);
+	/* paranoid */ verify( cache.count < 65536 ); // This verify assumes no cluster will have more than 65000 kernel threads mapped to a single cache, which could be correct but is super weird.
+}
+
+static inline void arrive(__ready_queue_caches_t & cache) {
+	// for() {
+	// 	unsigned expected = cache.count;
+	// 	unsigned desired  = 0 == expected ? 2 : expected + 1;
+	// }
+}
+
+//=======================================================================
 // Cforall Ready Queue used for scheduling
 //=======================================================================
-unsigned long long moving_average(unsigned long long nval, unsigned long long oval) {
-	const unsigned long long tw = 16;
-	const unsigned long long nw = 4;
-	const unsigned long long ow = tw - nw;
-	return ((nw * nval) + (ow * oval)) / tw;
+unsigned long long moving_average(unsigned long long currtsc, unsigned long long instsc, unsigned long long old_avg) {
+	/* paranoid */ verifyf( currtsc < 45000000000000000, "Suspiciously large current time: %'llu (%llx)\n", currtsc, currtsc );
+	/* paranoid */ verifyf( instsc  < 45000000000000000, "Suspiciously large insert time: %'llu (%llx)\n", instsc, instsc );
+	/* paranoid */ verifyf( old_avg < 15000000000000, "Suspiciously large previous average: %'llu (%llx)\n", old_avg, old_avg );
+
+	const unsigned long long new_val = currtsc > instsc ? currtsc - instsc : 0;
+	const unsigned long long total_weight = 16;
+	const unsigned long long new_weight   = 4;
+	const unsigned long long old_weight = total_weight - new_weight;
+	const unsigned long long ret = ((new_weight * new_val) + (old_weight * old_avg)) / total_weight;
+	return ret;
 }
 
@@ -271,8 +309,9 @@
 		}
 	#else
-		lanes.data  = 0p;
-		lanes.tscs  = 0p;
-		lanes.help  = 0p;
-		lanes.count = 0;
+		lanes.data   = 0p;
+		lanes.tscs   = 0p;
+		lanes.caches = 0p;
+		lanes.help   = 0p;
+		lanes.count  = 0;
 	#endif
 }
@@ -285,8 +324,133 @@
 	free(lanes.data);
 	free(lanes.tscs);
+	free(lanes.caches);
 	free(lanes.help);
 }
 
 //-----------------------------------------------------------------------
+#if defined(USE_AWARE_STEALING)
+	__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
+		processor * const proc = kernelTLS().this_processor;
+		const bool external = (!proc) || (cltr != proc->cltr);
+		const bool remote   = hint == UNPARK_REMOTE;
+
+		unsigned i;
+		if( external || remote ) {
+			// Figure out where thread was last time and make sure it's valid
+			/* paranoid */ verify(thrd->preferred >= 0);
+			if(thrd->preferred * READYQ_SHARD_FACTOR < lanes.count) {
+				/* paranoid */ verify(thrd->preferred * READYQ_SHARD_FACTOR < lanes.count);
+				unsigned start = thrd->preferred * READYQ_SHARD_FACTOR;
+				do {
+					unsigned r = __tls_rand();
+					i = start + (r % READYQ_SHARD_FACTOR);
+					/* paranoid */ verify( i < lanes.count );
+					// If we can't lock it retry
+				} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+			} else {
+				do {
+					i = __tls_rand() % lanes.count;
+				} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+			}
+		} else {
+			do {
+				unsigned r = proc->rdq.its++;
+				i = proc->rdq.id + (r % READYQ_SHARD_FACTOR);
+				/* paranoid */ verify( i < lanes.count );
+				// If we can't lock it retry
+			} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+		}
+
+		// Actually push it
+		push(lanes.data[i], thrd);
+
+		// Unlock and return
+		__atomic_unlock( &lanes.data[i].lock );
+
+		#if !defined(__CFA_NO_STATISTICS__)
+			if(unlikely(external || remote)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED);
+			else __tls_stats()->ready.push.local.success++;
+		#endif
+	}
+
+	static inline unsigned long long calc_cutoff(const unsigned long long ctsc, const processor * proc, __ready_queue_t & rdq) {
+		unsigned start = proc->rdq.id;
+		unsigned long long max = 0;
+		for(i; READYQ_SHARD_FACTOR) {
+			unsigned long long ptsc = ts(rdq.lanes.data[start + i]);
+			if(ptsc != -1ull) {
+				/* paranoid */ verify( start + i < rdq.lanes.count );
+				unsigned long long tsc = moving_average(ctsc, ptsc, rdq.lanes.tscs[start + i].ma);
+				if(tsc > max) max = tsc;
+			}
+		}
+		return (max + 2 * max) / 2;
+	}
+
+	__attribute__((hot)) struct thread$ * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
+		/* paranoid */ verify( lanes.count > 0 );
+		/* paranoid */ verify( kernelTLS().this_processor );
+		/* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes.count );
+
+		processor * const proc = kernelTLS().this_processor;
+		unsigned this = proc->rdq.id;
+		/* paranoid */ verify( this < lanes.count );
+		__cfadbg_print_safe(ready_queue, "Kernel : pop from %u\n", this);
+
+		// Figure out the current cpu and make sure it is valid
+		const int cpu = __kernel_getcpu();
+		/* paranoid */ verify(cpu >= 0);
+		/* paranoid */ verify(cpu < cpu_info.hthrd_count);
+		unsigned this_cache = cpu_info.llc_map[cpu].cache;
+
+		// Super important: don't write the same value over and over again
+		// We want to maximise our chances that this particular value stays in cache
+		if(lanes.caches[this / READYQ_SHARD_FACTOR].id != this_cache)
+			__atomic_store_n(&lanes.caches[this / READYQ_SHARD_FACTOR].id, this_cache, __ATOMIC_RELAXED);
+
+		const unsigned long long ctsc = rdtscl();
+
+		if(proc->rdq.target == MAX) {
+			uint64_t chaos = __tls_rand();
+			unsigned ext = chaos & 0xff;
+			unsigned other  = (chaos >> 8) % (lanes.count);
+
+			if(ext < 3 || __atomic_load_n(&lanes.caches[other / READYQ_SHARD_FACTOR].id, __ATOMIC_RELAXED) == this_cache) {
+				proc->rdq.target = other;
+			}
+		}
+		else {
+			const unsigned target = proc->rdq.target;
+			__cfadbg_print_safe(ready_queue, "Kernel : %u considering helping %u, tcsc %llu\n", this, target, lanes.tscs[target].tv);
+			/* paranoid */ verify( lanes.tscs[target].tv != MAX );
+			if(target < lanes.count) {
+				const unsigned long long cutoff = calc_cutoff(ctsc, proc, cltr->ready_queue);
+				const unsigned long long age = moving_average(ctsc, lanes.tscs[target].tv, lanes.tscs[target].ma);
+				__cfadbg_print_safe(ready_queue, "Kernel : Help attempt on %u from %u, age %'llu vs cutoff %'llu, %s\n", target, this, age, cutoff, age > cutoff ? "yes" : "no");
+				if(age > cutoff) {
+					thread$ * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help));
+					if(t) return t;
+				}
+			}
+			proc->rdq.target = MAX;
+		}
+
+		for(READYQ_SHARD_FACTOR) {
+			unsigned i = this + (proc->rdq.itr++ % READYQ_SHARD_FACTOR);
+			if(thread$ * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t;
+		}
+
+		// All lanes were empty, return 0p
+		return 0p;
+
+	}
+	__attribute__((hot)) struct thread$ * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
+		unsigned i = __tls_rand() % lanes.count;
+		return try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.steal));
+	}
+	__attribute__((hot)) struct thread$ * pop_search(struct cluster * cltr) {
+		return search(cltr);
+	}
+#endif
 #if defined(USE_CPU_WORK_STEALING)
 	__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
@@ -350,4 +514,5 @@
 		/* paranoid */ verify( kernelTLS().this_processor );
 
+		processor * const proc = kernelTLS().this_processor;
 		const int cpu = __kernel_getcpu();
 		/* paranoid */ verify(cpu >= 0);
@@ -360,16 +525,15 @@
 		/* paranoid */ verifyf((map.start + map.count) * READYQ_SHARD_FACTOR <= lanes.count, "have %zu lanes but map can go up to %u", lanes.count, (map.start + map.count) * READYQ_SHARD_FACTOR);
 
-		processor * const proc = kernelTLS().this_processor;
 		const int start = map.self * READYQ_SHARD_FACTOR;
 		const unsigned long long ctsc = rdtscl();
 
 		// Did we already have a help target
-		if(proc->rdq.target == -1u) {
+		if(proc->rdq.target == MAX) {
 			unsigned long long max = 0;
 			for(i; READYQ_SHARD_FACTOR) {
-				unsigned long long tsc = moving_average(ctsc - ts(lanes.data[start + i]), lanes.tscs[start + i].ma);
+				unsigned long long tsc = moving_average(ctsc, ts(lanes.data[start + i]), lanes.tscs[start + i].ma);
 				if(tsc > max) max = tsc;
 			}
-			 proc->rdq.cutoff = (max + 2 * max) / 2;
+			//  proc->rdq.cutoff = (max + 2 * max) / 2;
 			/* paranoid */ verify(lanes.count < 65536); // The following code assumes max 65536 cores.
 			/* paranoid */ verify(map.count < 65536); // The following code assumes max 65536 cores.
@@ -384,10 +548,10 @@
 			}
 
-			/* paranoid */ verify(proc->rdq.target != -1u);
+			/* paranoid */ verify(proc->rdq.target != MAX);
 		}
 		else {
 			unsigned long long max = 0;
 			for(i; READYQ_SHARD_FACTOR) {
-				unsigned long long tsc = moving_average(ctsc - ts(lanes.data[start + i]), lanes.tscs[start + i].ma);
+				unsigned long long tsc = moving_average(ctsc, ts(lanes.data[start + i]), lanes.tscs[start + i].ma);
 				if(tsc > max) max = tsc;
 			}
@@ -395,22 +559,21 @@
 			{
 				unsigned target = proc->rdq.target;
-				proc->rdq.target = -1u;
+				proc->rdq.target = MAX;
 				lanes.help[target / READYQ_SHARD_FACTOR].tri++;
-				if(moving_average(ctsc - lanes.tscs[target].tv, lanes.tscs[target].ma) > cutoff) {
+				if(moving_average(ctsc, lanes.tscs[target].tv, lanes.tscs[target].ma) > cutoff) {
 					thread$ * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help));
 					proc->rdq.last = target;
 					if(t) return t;
-					else proc->rdq.target = -1u;
 				}
-				else proc->rdq.target = -1u;
+				proc->rdq.target = MAX;
 			}
 
 			unsigned last = proc->rdq.last;
-			if(last != -1u && lanes.tscs[last].tv < cutoff && ts(lanes.data[last]) < cutoff) {
+			if(last != MAX && moving_average(ctsc, lanes.tscs[last].tv, lanes.tscs[last].ma) > cutoff) {
 				thread$ * t = try_pop(cltr, last __STATS(, __tls_stats()->ready.pop.help));
 				if(t) return t;
 			}
 			else {
-				proc->rdq.last = -1u;
+				proc->rdq.last = MAX;
 			}
 		}
@@ -428,8 +591,8 @@
 		processor * const proc = kernelTLS().this_processor;
 		unsigned last = proc->rdq.last;
-		if(last != -1u) {
+		if(last != MAX) {
 			struct thread$ * t = try_pop(cltr, last __STATS(, __tls_stats()->ready.pop.steal));
 			if(t) return t;
-			proc->rdq.last = -1u;
+			proc->rdq.last = MAX;
 		}
 
@@ -560,5 +723,5 @@
 		#else
 			unsigned preferred = thrd->preferred;
-			const bool external = (hint != UNPARK_LOCAL) || (!kernelTLS().this_processor) || preferred == -1u || thrd->curr_cluster != cltr;
+			const bool external = (hint != UNPARK_LOCAL) || (!kernelTLS().this_processor) || preferred == MAX || thrd->curr_cluster != cltr;
 			/* paranoid */ verifyf(external || preferred < lanes.count, "Invalid preferred queue %u for %u lanes", preferred, lanes.count );
 
@@ -612,5 +775,5 @@
 		processor * proc = kernelTLS().this_processor;
 
-		if(proc->rdq.target == -1u) {
+		if(proc->rdq.target == MAX) {
 			unsigned long long min = ts(lanes.data[proc->rdq.id]);
 			for(int i = 0; i < READYQ_SHARD_FACTOR; i++) {
@@ -623,5 +786,5 @@
 		else {
 			unsigned target = proc->rdq.target;
-			proc->rdq.target = -1u;
+			proc->rdq.target = MAX;
 			const unsigned long long bias = 0; //2_500_000_000;
 			const unsigned long long cutoff = proc->rdq.cutoff > bias ? proc->rdq.cutoff - bias : proc->rdq.cutoff;
@@ -658,4 +821,5 @@
 // try to pop from a lane given by index w
 static inline struct thread$ * try_pop(struct cluster * cltr, unsigned w __STATS(, __stats_readyQ_pop_t & stats)) with (cltr->ready_queue) {
+	/* paranoid */ verify( w < lanes.count );
 	__STATS( stats.attempt++; )
 
@@ -681,5 +845,5 @@
 	// Actually pop the list
 	struct thread$ * thrd;
-	#if defined(USE_WORK_STEALING) || defined(USE_CPU_WORK_STEALING)
+	#if defined(USE_AWARE_STEALING) || defined(USE_WORK_STEALING) || defined(USE_CPU_WORK_STEALING)
 		unsigned long long tsc_before = ts(lane);
 	#endif
@@ -697,11 +861,14 @@
 	__STATS( stats.success++; )
 
-	#if defined(USE_WORK_STEALING) || defined(USE_CPU_WORK_STEALING)
-		unsigned long long now = rdtscl();
-		lanes.tscs[w].tv = tsv;
-		lanes.tscs[w].ma = moving_average(now > tsc_before ? now - tsc_before : 0, lanes.tscs[w].ma);
+	#if defined(USE_AWARE_STEALING) || defined(USE_WORK_STEALING) || defined(USE_CPU_WORK_STEALING)
+		if (tsv != MAX) {
+			unsigned long long now = rdtscl();
+			unsigned long long pma = __atomic_load_n(&lanes.tscs[w].ma, __ATOMIC_RELAXED);
+			__atomic_store_n(&lanes.tscs[w].tv, tsv, __ATOMIC_RELAXED);
+			__atomic_store_n(&lanes.tscs[w].ma, moving_average(now, tsc_before, pma), __ATOMIC_RELAXED);
+		}
 	#endif
 
-	#if defined(USE_CPU_WORK_STEALING)
+	#if defined(USE_AWARE_STEALING) || defined(USE_CPU_WORK_STEALING)
 		thrd->preferred = w / READYQ_SHARD_FACTOR;
 	#else
@@ -802,5 +969,5 @@
 		/* paranoid */ verifyf( it, "Unexpected null iterator, at index %u of %u\n", i, count);
 		it->rdq.id = value;
-		it->rdq.target = -1u;
+		it->rdq.target = MAX;
 		value += READYQ_SHARD_FACTOR;
 		it = &(*it)`next;
@@ -815,10 +982,9 @@
 
 static void fix_times( struct cluster * cltr ) with( cltr->ready_queue ) {
-	#if defined(USE_WORK_STEALING)
+	#if defined(USE_AWARE_STEALING) || defined(USE_WORK_STEALING)
 		lanes.tscs = alloc(lanes.count, lanes.tscs`realloc);
 		for(i; lanes.count) {
-			unsigned long long tsc1 = ts(lanes.data[i]);
-			unsigned long long tsc2 = rdtscl();
-			lanes.tscs[i].tv = min(tsc1, tsc2);
+			lanes.tscs[i].tv = rdtscl();
+			lanes.tscs[i].ma = 0;
 		}
 	#endif
@@ -866,4 +1032,6 @@
 			// Update original
 			lanes.count = ncount;
+
+			lanes.caches = alloc( target, lanes.caches`realloc );
 		}
 
@@ -942,7 +1110,10 @@
 				fix(lanes.data[idx]);
 			}
+
+			lanes.caches = alloc( target, lanes.caches`realloc );
 		}
 
 		fix_times(cltr);
+
 
 		reassign_cltr_id(cltr);
Index: libcfa/src/concurrency/stats.cfa
===================================================================
--- libcfa/src/concurrency/stats.cfa	(revision c1d8cde984cd89087477e50f07a84dafebe8d305)
+++ libcfa/src/concurrency/stats.cfa	(revision 97fed4458f48d4add95cb1d99ccefd2a77340fde)
@@ -31,5 +31,7 @@
 		stats->ready.sleep.halts   = 0;
 		stats->ready.sleep.cancels = 0;
+		stats->ready.sleep.early   = 0;
 		stats->ready.sleep.wakes   = 0;
+		stats->ready.sleep.seen    = 0;
 		stats->ready.sleep.exits   = 0;
 
@@ -91,5 +93,7 @@
 		tally_one( &cltr->ready.sleep.halts       , &proc->ready.sleep.halts        );
 		tally_one( &cltr->ready.sleep.cancels     , &proc->ready.sleep.cancels      );
+		tally_one( &cltr->ready.sleep.early       , &proc->ready.sleep.early        );
 		tally_one( &cltr->ready.sleep.wakes       , &proc->ready.sleep.wakes        );
+	tally_one( &cltr->ready.sleep.seen        , &proc->ready.sleep.seen         );
 		tally_one( &cltr->ready.sleep.exits       , &proc->ready.sleep.exits        );
 
@@ -153,5 +157,7 @@
 			     | " (" | eng3(ready.pop.search.attempt) | " try)";
 
-			sstr | "- Idle Slp : " | eng3(ready.sleep.halts) | "halt," | eng3(ready.sleep.cancels) | "cancel," | eng3(ready.sleep.wakes) | "wake," | eng3(ready.sleep.exits) | "exit";
+			sstr | "- Idle Slp : " | eng3(ready.sleep.halts) | "halt," | eng3(ready.sleep.cancels) | "cancel,"
+			     | eng3(ready.sleep.wakes + ready.sleep.early) | '(' | eng3(ready.sleep.early) | ',' | eng3(ready.sleep.seen) | ')' | " wake(early, seen),"
+			     | eng3(ready.sleep.exits) | "exit";
 			sstr | nl;
 		}
Index: libcfa/src/concurrency/stats.hfa
===================================================================
--- libcfa/src/concurrency/stats.hfa	(revision c1d8cde984cd89087477e50f07a84dafebe8d305)
+++ libcfa/src/concurrency/stats.hfa	(revision 97fed4458f48d4add95cb1d99ccefd2a77340fde)
@@ -69,5 +69,7 @@
 			volatile uint64_t halts;
 			volatile uint64_t cancels;
+			volatile uint64_t early;
 			volatile uint64_t wakes;
+			volatile uint64_t seen;
 			volatile uint64_t exits;
 		} sleep;
Index: libcfa/src/concurrency/thread.cfa
===================================================================
--- libcfa/src/concurrency/thread.cfa	(revision c1d8cde984cd89087477e50f07a84dafebe8d305)
+++ libcfa/src/concurrency/thread.cfa	(revision 97fed4458f48d4add95cb1d99ccefd2a77340fde)
@@ -10,6 +10,6 @@
 // Created On       : Tue Jan 17 12:27:26 2017
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Wed Jan 12 18:46:48 2022
-// Update Count     : 36
+// Last Modified On : Thu Jan 13 20:11:55 2022
+// Update Count     : 42
 //
 
@@ -24,6 +24,4 @@
 #define __CFA_INVOKE_PRIVATE__
 #include "invoke.h"
-
-uint64_t thread_rand();
 
 extern uint32_t __global_random_seed;
@@ -174,26 +172,6 @@
 }
 
-uint64_t thread_rand() {
-	disable_interrupts();
-	uint64_t ret = __tls_rand();
-	enable_interrupts();
-	return ret;
-}
- 
+//-----------------------------------------------------------------------------
 #define GENERATOR LCG
-
-static inline uint32_t MarsagliaXor( uint32_t & state ) {
-	uint32_t ret = state;
-	state ^= state << 6;
-	state ^= state >> 21;
-	state ^= state << 7;
-	return ret;
-} // MarsagliaXor
-
-static inline uint32_t LCG( uint32_t & state ) {		// linear congruential generator
-	uint32_t ret = state;
-	state = 36969 * (state & 65535) + (state >> 16);	// 36969 is NOT prime! No not change it!
-	return ret;
-} // LCG
 
 void set_seed( uint32_t seed ) {
Index: libcfa/src/iostream.cfa
===================================================================
--- libcfa/src/iostream.cfa	(revision c1d8cde984cd89087477e50f07a84dafebe8d305)
+++ libcfa/src/iostream.cfa	(revision 97fed4458f48d4add95cb1d99ccefd2a77340fde)
@@ -10,6 +10,6 @@
 // Created On       : Wed May 27 17:56:53 2015
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Sun Oct 10 09:28:17 2021
-// Update Count     : 1345
+// Last Modified On : Wed Jan 19 08:15:53 2022
+// Update Count     : 1352
 //
 
@@ -57,5 +57,5 @@
 	ostype & ?|?( ostype & os, signed char sc ) {
 		if ( sepPrt$( os ) ) fmt( os, "%s", sepGetCur$( os ) );
-		fmt( os, "%hhd", sc );
+		fmt( os, "%'hhd", sc );
 		return os;
 	} // ?|?
@@ -66,5 +66,5 @@
 	ostype & ?|?( ostype & os, unsigned char usc ) {
 		if ( sepPrt$( os ) ) fmt( os, "%s", sepGetCur$( os ) );
-		fmt( os, "%hhu", usc );
+		fmt( os, "%'hhu", usc );
 		return os;
 	} // ?|?
@@ -75,5 +75,5 @@
 	ostype & ?|?( ostype & os, short int si ) {
 		if ( sepPrt$( os ) ) fmt( os, "%s", sepGetCur$( os ) );
-		fmt( os, "%hd", si );
+		fmt( os, "%'hd", si );
 		return os;
 	} // ?|?
@@ -84,5 +84,5 @@
 	ostype & ?|?( ostype & os, unsigned short int usi ) {
 		if ( sepPrt$( os ) ) fmt( os, "%s", sepGetCur$( os ) );
-		fmt( os, "%hu", usi );
+		fmt( os, "%'hu", usi );
 		return os;
 	} // ?|?
@@ -93,5 +93,5 @@
 	ostype & ?|?( ostype & os, int i ) {
 		if ( sepPrt$( os ) ) fmt( os, "%s", sepGetCur$( os ) );
-		fmt( os, "%d", i );
+		fmt( os, "%'d", i );
 		return os;
 	} // ?|?
@@ -102,5 +102,5 @@
 	ostype & ?|?( ostype & os, unsigned int ui ) {
 		if ( sepPrt$( os ) ) fmt( os, "%s", sepGetCur$( os ) );
-		fmt( os, "%u", ui );
+		fmt( os, "%'u", ui );
 		return os;
 	} // ?|?
@@ -111,5 +111,5 @@
 	ostype & ?|?( ostype & os, long int li ) {
 		if ( sepPrt$( os ) ) fmt( os, "%s", sepGetCur$( os ) );
-		fmt( os, "%ld", li );
+		fmt( os, "%'ld", li );
 		return os;
 	} // ?|?
@@ -120,5 +120,5 @@
 	ostype & ?|?( ostype & os, unsigned long int uli ) {
 		if ( sepPrt$( os ) ) fmt( os, "%s", sepGetCur$( os ) );
-		fmt( os, "%lu", uli );
+		fmt( os, "%'lu", uli );
 		return os;
 	} // ?|?
@@ -129,5 +129,5 @@
 	ostype & ?|?( ostype & os, long long int lli ) {
 		if ( sepPrt$( os ) ) fmt( os, "%s", sepGetCur$( os ) );
-		fmt( os, "%lld", lli );
+		fmt( os, "%'lld", lli );
 		return os;
 	} // ?|?
@@ -138,5 +138,5 @@
 	ostype & ?|?( ostype & os, unsigned long long int ulli ) {
 		if ( sepPrt$( os ) ) fmt( os, "%s", sepGetCur$( os ) );
-		fmt( os, "%llu", ulli );
+		fmt( os, "%'llu", ulli );
 		return os;
 	} // ?|?
@@ -205,5 +205,5 @@
 	ostype & ?|?( ostype & os, float f ) {
 		if ( sepPrt$( os ) ) fmt( os, "%s", sepGetCur$( os ) );
-		PrintWithDP( os, "%g", f );
+		PrintWithDP( os, "%'g", f );
 		return os;
 	} // ?|?
@@ -214,5 +214,5 @@
 	ostype & ?|?( ostype & os, double d ) {
 		if ( sepPrt$( os ) ) fmt( os, "%s", sepGetCur$( os ) );
-		PrintWithDP( os, "%.*lg", d, DBL_DIG );
+		PrintWithDP( os, "%'.*lg", d, DBL_DIG );
 		return os;
 	} // ?|?
@@ -223,5 +223,5 @@
 	ostype & ?|?( ostype & os, long double ld ) {
 		if ( sepPrt$( os ) ) fmt( os, "%s", sepGetCur$( os ) );
-		PrintWithDP( os, "%.*Lg", ld, LDBL_DIG );
+		PrintWithDP( os, "%'.*Lg", ld, LDBL_DIG );
 		return os;
 	} // ?|?
@@ -233,6 +233,6 @@
 		if ( sepPrt$( os ) ) fmt( os, "%s", sepGetCur$( os ) );
 //		os | crealf( fc ) | nonl;
-		PrintWithDP( os, "%g", crealf( fc ) );
-		PrintWithDP( os, "%+g", cimagf( fc ) );
+		PrintWithDP( os, "%'g", crealf( fc ) );
+		PrintWithDP( os, "%'+g", cimagf( fc ) );
 		fmt( os, "i" );
 		return os;
@@ -245,6 +245,6 @@
 		if ( sepPrt$( os ) ) fmt( os, "%s", sepGetCur$( os ) );
 //		os | creal( dc ) | nonl;
-		PrintWithDP( os, "%.*lg", creal( dc ), DBL_DIG );
-		PrintWithDP( os, "%+.*lg", cimag( dc ), DBL_DIG );
+		PrintWithDP( os, "%'.*lg", creal( dc ), DBL_DIG );
+		PrintWithDP( os, "%'+.*lg", cimag( dc ), DBL_DIG );
 		fmt( os, "i" );
 		return os;
@@ -257,6 +257,6 @@
 		if ( sepPrt$( os ) ) fmt( os, "%s", sepGetCur$( os ) );
 //		os | creall( ldc ) || nonl;
-		PrintWithDP( os, "%.*Lg", creall( ldc ), LDBL_DIG );
-		PrintWithDP( os, "%+.*Lg", cimagl( ldc ), LDBL_DIG );
+		PrintWithDP( os, "%'.*Lg", creall( ldc ), LDBL_DIG );
+		PrintWithDP( os, "%'+.*Lg", cimagl( ldc ), LDBL_DIG );
 		fmt( os, "i" );
 		return os;
@@ -282,4 +282,5 @@
 		}; // mask
 
+	  if ( s == 0p ) { fmt( os, "%s", "0p" ); return os; } // null pointer
 	  if ( s[0] == '\0' ) { sepOff( os ); return os; } // null string => no separator
 
@@ -496,5 +497,5 @@
 		if ( ! f.flags.pc ) memcpy( &fmtstr, IFMTNP, sizeof(IFMTNP) ); \
 		else memcpy( &fmtstr, IFMTP, sizeof(IFMTP) ); \
-		int star = 4;									/* position before first '*' */ \
+		int star = 5;									/* position before first '*' */ \
 \
 		/* Insert flags into spaces before '*', from right to left. */ \
@@ -503,4 +504,5 @@
 		if ( f.flags.sign ) { fmtstr[star] = '+'; star -= 1; } \
 		if ( f.flags.pad0 && ! f.flags.pc ) { fmtstr[star] = '0'; star -= 1; } \
+		fmtstr[star] = '\''; star -= 1;					/* locale */ \
 		fmtstr[star] = '%'; \
 \
@@ -521,14 +523,14 @@
 } // distribution
 
-IntegralFMTImpl( signed char, "     *hh ", "     *.*hh " )
-IntegralFMTImpl( unsigned char, "     *hh ", "     *.*hh " )
-IntegralFMTImpl( signed short int, "     *h ", "     *.*h " )
-IntegralFMTImpl( unsigned short int, "     *h ", "     *.*h " )
-IntegralFMTImpl( signed int, "     * ", "     *.* " )
-IntegralFMTImpl( unsigned int, "     * ", "     *.* " )
-IntegralFMTImpl( signed long int, "     *l ", "     *.*l " )
-IntegralFMTImpl( unsigned long int, "     *l ", "     *.*l " )
-IntegralFMTImpl( signed long long int, "     *ll ", "     *.*ll " )
-IntegralFMTImpl( unsigned long long int, "     *ll ", "     *.*ll " )
+IntegralFMTImpl( signed char,            "      *hh ", "      *.*hh " )
+IntegralFMTImpl( unsigned char,          "      *hh ", "      *.*hh " )
+IntegralFMTImpl( signed short int,       "      *h ",  "      *.*h " )
+IntegralFMTImpl( unsigned short int,     "      *h ",  "      *.*h " )
+IntegralFMTImpl( signed int,             "      * ",   "      *.* " )
+IntegralFMTImpl( unsigned int,           "      * ",   "      *.* " )
+IntegralFMTImpl( signed long int,        "      *l ",  "      *.*l " )
+IntegralFMTImpl( unsigned long int,      "      *l ",  "      *.*l " )
+IntegralFMTImpl( signed long long int,   "      *ll ", "      *.*ll " )
+IntegralFMTImpl( unsigned long long int, "      *ll ", "      *.*ll " )
 
 
@@ -692,5 +694,5 @@
 		if ( ! f.flags.pc ) memcpy( &fmtstr, DFMTNP, sizeof(DFMTNP) ); \
 		else memcpy( &fmtstr, DFMTP, sizeof(DFMTP) ); \
-		int star = 4;									/* position before first '*' */ \
+		int star = 5;									/* position before first '*' */ \
 \
 		/* Insert flags into spaces before '*', from right to left. */ \
@@ -698,4 +700,5 @@
 		if ( f.flags.sign ) { fmtstr[star] = '+'; star -= 1; } \
 		if ( f.flags.pad0 ) { fmtstr[star] = '0'; star -= 1; } \
+		fmtstr[star] = '\''; star -= 1;					/* locale */ \
 		fmtstr[star] = '%'; \
 \
@@ -715,6 +718,6 @@
 } // distribution
 
-FloatingPointFMTImpl( double, "     * ", "     *.* " )
-FloatingPointFMTImpl( long double, "     *L ", "     *.*L " )
+FloatingPointFMTImpl( double,      "      * ",  "      *.* " )
+FloatingPointFMTImpl( long double, "      *L ", "      *.*L " )
 
 // *********************************** character ***********************************
Index: libcfa/src/parseconfig.cfa
===================================================================
--- libcfa/src/parseconfig.cfa	(revision c1d8cde984cd89087477e50f07a84dafebe8d305)
+++ libcfa/src/parseconfig.cfa	(revision 97fed4458f48d4add95cb1d99ccefd2a77340fde)
@@ -1,2 +1,10 @@
+
+
+#pragma GCC diagnostic push
+//#pragma GCC diagnostic ignored "-Wunused-parameter"
+//#pragma GCC diagnostic ignored "-Wunused-function"
+//#pragma GCC diagnostic ignored "-Wuninitialized"
+//#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+
 #include <fstream.hfa>
 #include <parseargs.hfa>
@@ -19,12 +27,5 @@
 // TODO: use string interface when it's ready (and implement exception msg protocol)
 [ void ] msg( * Missing_Config_Entries ex ) {
-	serr | nlOff;
-	serr | "The config file is missing " | ex->num_missing;
-	serr | nlOn;
-	if ( ex->num_missing == 1 ) {
-		serr | " entry.";
-	} else {
-		serr | " entries.";
-	}
+	serr | "The config file is missing " | ex->num_missing | "entr" | sepOff | (ex->num_missing == 1 ? "y." : "ies.");
 } // msg
 
@@ -223,4 +224,5 @@
 	return value < zero_val;
 }
+#pragma GCC diagnostic pop
 
 
Index: libcfa/src/startup.cfa
===================================================================
--- libcfa/src/startup.cfa	(revision c1d8cde984cd89087477e50f07a84dafebe8d305)
+++ libcfa/src/startup.cfa	(revision 97fed4458f48d4add95cb1d99ccefd2a77340fde)
@@ -10,6 +10,6 @@
 // Created On       : Tue Jul 24 16:21:57 2018
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Wed Jan 12 18:51:24 2022
-// Update Count     : 51
+// Last Modified On : Mon Jan 17 16:41:54 2022
+// Update Count     : 55
 //
 
@@ -17,8 +17,9 @@
 #include <locale.h>										// setlocale
 #include <stdlib.h>										// getenv
+#include "bits/defs.hfa"								// rdtscl
 #include "startup.hfa"
-#include "bits/defs.hfa"
 
-extern uint32_t __global_random_seed, __global_random_state;
+extern uint32_t __global_random_seed;					// sequential/concurrent
+extern uint32_t __global_random_state;					// sequential
 
 extern "C" {
@@ -26,5 +27,4 @@
 	void __cfaabi_appready_startup( void ) {
 		tzset();										// initialize time global variables
-		setlocale( LC_NUMERIC, getenv("LANG") );
 		#ifdef __CFA_DEBUG__
 		extern void heapAppStart();
Index: libcfa/src/stdlib.cfa
===================================================================
--- libcfa/src/stdlib.cfa	(revision c1d8cde984cd89087477e50f07a84dafebe8d305)
+++ libcfa/src/stdlib.cfa	(revision 97fed4458f48d4add95cb1d99ccefd2a77340fde)
@@ -10,10 +10,10 @@
 // Created On       : Thu Jan 28 17:10:29 2016
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Wed Jan 12 18:52:41 2022
-// Update Count     : 582
+// Last Modified On : Thu Jan 13 21:38:30 2022
+// Update Count     : 593
 //
 
 #include "stdlib.hfa"
-//#include "concurrency/kernel/fwd.hfa"
+#include "bits/random.hfa"
 #include "concurrency/invoke.h"							// random_state
 
@@ -223,25 +223,8 @@
 //---------------------------------------
 
-// Pipelined to allow OoO overlap with reduced dependencies. Critically, return the current value, and compute and store
-// the next value.
-
 #define GENERATOR LCG
 
-static inline uint32_t MarsagliaXor( uint32_t & state ) {
-	uint32_t ret = state;
-	state ^= state << 6;
-	state ^= state >> 21;
-	state ^= state << 7;
-	return ret;
-} // MarsagliaXor
-
-static inline uint32_t LCG( uint32_t & state ) {		// linear congruential generator
-	uint32_t ret = state;
-	state = 36969 * (state & 65535) + (state >> 16);	// 36969 is NOT prime! No not change it!
-	return ret;
-} // LCG
-
 uint32_t __global_random_seed;							// sequential/concurrent
-uint32_t __global_random_state;							// sequential only
+uint32_t __global_random_state;						// sequential only
 
 void set_seed( PRNG & prng, uint32_t seed_ ) with( prng ) { state = seed = seed_; GENERATOR( state ); } // set seed
Index: libcfa/src/stdlib.hfa
===================================================================
--- libcfa/src/stdlib.hfa	(revision c1d8cde984cd89087477e50f07a84dafebe8d305)
+++ libcfa/src/stdlib.hfa	(revision 97fed4458f48d4add95cb1d99ccefd2a77340fde)
@@ -10,6 +10,6 @@
 // Created On       : Thu Jan 28 17:12:35 2016
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Wed Jan 12 18:56:13 2022
-// Update Count     : 621
+// Last Modified On : Thu Jan 13 21:34:46 2022
+// Update Count     : 636
 //
 
@@ -208,5 +208,4 @@
 
 	forall( TT... | { T * alloc_internal$( void *, T *, size_t, size_t, S_fill(T), TT ); } ) {
-
 		T * alloc_internal$( void *       , T * Realloc, size_t Align, size_t Dim, S_fill(T) Fill, T_resize Resize, TT rest) {
 	        return alloc_internal$( Resize, (T*)0p, Align, Dim, Fill, rest);
@@ -232,5 +231,4 @@
 	    	return alloc_internal$( (void*)0p, (T*)0p, (_Alignof(T) > libAlign() ? _Alignof(T) : libAlign()), dim, (S_fill(T)){'0'}, all);
 	    }
-
 	} // distribution TT
 } // distribution T
