Index: libcfa/src/Makefile.am
===================================================================
--- libcfa/src/Makefile.am	(revision 3c4bf050c37d1391ba25a62e7b3d185e99ef7f44)
+++ libcfa/src/Makefile.am	(revision c42b8a1098b5adb1a163766aabb8c1e1b5f99fc9)
@@ -134,4 +134,5 @@
 	concurrency/io/call.cfa \
 	concurrency/iofwd.hfa \
+	concurrency/kernel/cluster.cfa \
 	concurrency/kernel_private.hfa \
 	concurrency/kernel/startup.cfa \
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 3c4bf050c37d1391ba25a62e7b3d185e99ef7f44)
+++ libcfa/src/concurrency/kernel.hfa	(revision c42b8a1098b5adb1a163766aabb8c1e1b5f99fc9)
@@ -205,7 +205,4 @@
 void  ?{}(__ready_queue_t & this);
 void ^?{}(__ready_queue_t & this);
-#if !defined(__CFA_NO_STATISTICS__)
-	unsigned cnt(const __ready_queue_t & this, unsigned idx);
-#endif
 
 // Idle Sleep
Index: libcfa/src/concurrency/kernel/cluster.cfa
===================================================================
--- libcfa/src/concurrency/kernel/cluster.cfa	(revision c42b8a1098b5adb1a163766aabb8c1e1b5f99fc9)
+++ libcfa/src/concurrency/kernel/cluster.cfa	(revision c42b8a1098b5adb1a163766aabb8c1e1b5f99fc9)
@@ -0,0 +1,494 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2022 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// cluster.cfa -- file that includes helpers for subsystems that need
+//				cluster-wide support
+//
+// Author           : Thierry Delisle
+// Created On       : Fri 03 11 12:39:24 2022
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#define __cforall_thread__
+
+#include "bits/defs.hfa"
+#include "device/cpu.hfa"
+#include "kernel_private.hfa"
+
+#include "stdlib.hfa"
+#include "limits.hfa"
+#include "math.hfa"
+
+#include "ready_subqueue.hfa"
+
+#include <errno.h>
+#include <unistd.h>
+
+extern "C" {
+	#include <sys/syscall.h>  // __NR_xxx
+}
+
+// No overridden function, no environment variable, no define:
+// fall back to a magic number
+#ifndef __CFA_MAX_PROCESSORS__
+	#define __CFA_MAX_PROCESSORS__ 1024
+#endif
+
+#if !defined(__CFA_NO_STATISTICS__) // __STATS(x) keeps its arguments only when statistics are compiled in
+	#define __STATS(...) __VA_ARGS__
+#else // statistics disabled: __STATS(x) expands to nothing
+	#define __STATS(...)
+#endif
+
+// Returns the maximum number of processors the RWLock supports: CFA_MAX_PROCESSORS from the environment if it is a valid decimal in [1, 65535], otherwise __CFA_MAX_PROCESSORS__.
+__attribute__((weak)) unsigned __max_processors() { // weak symbol, so another definition of __max_processors can take its place at link time
+	const char * max_cores_s = getenv("CFA_MAX_PROCESSORS");
+	if(!max_cores_s) { // variable not set: use the compile-time default
+		__cfadbg_print_nolock(ready_queue, "No CFA_MAX_PROCESSORS in ENV\n");
+		return __CFA_MAX_PROCESSORS__;
+	}
+
+	char * endptr = 0p; // set by strtol to the first character it did not consume
+	long int max_cores_l = strtol(max_cores_s, &endptr, 10);
+	if(max_cores_l < 1 || max_cores_l > 65535) { // also rejects an empty or non-numeric value, for which strtol returns 0
+		__cfadbg_print_nolock(ready_queue, "CFA_MAX_PROCESSORS out of range : %ld\n", max_cores_l);
+		return __CFA_MAX_PROCESSORS__;
+	}
+	if('\0' != *endptr) { // trailing characters after the digits: reject the whole value
+		__cfadbg_print_nolock(ready_queue, "CFA_MAX_PROCESSORS not a decimal number : %s\n", max_cores_s);
+		return __CFA_MAX_PROCESSORS__;
+	}
+
+	return max_cores_l; // in [1, 65535] here, so the conversion to unsigned is lossless
+}
+
+#if   defined(CFA_HAVE_LINUX_LIBRSEQ) // pick what __kernel_rseq_register / __kernel_rseq_unregister expand to
+	// No forward declaration needed: the hooks map directly onto rseq_register_current_thread / rseq_unregister_current_thread
+	#define __kernel_rseq_register rseq_register_current_thread
+	#define __kernel_rseq_unregister rseq_unregister_current_thread
+#elif defined(CFA_HAVE_LINUX_RSEQ_H)
+	static void __kernel_raw_rseq_register  (void); // defined later in this file, under the same #elif condition
+	static void __kernel_raw_rseq_unregister(void); // defined later in this file, under the same #elif condition
+
+	#define __kernel_rseq_register __kernel_raw_rseq_register
+	#define __kernel_rseq_unregister __kernel_raw_rseq_unregister
+#else
+	// No forward declaration needed
+	// No initialization needed: neither rseq option is configured, so both hooks do nothing
+	static inline void noop(void) {}
+
+	#define __kernel_rseq_register noop
+	#define __kernel_rseq_unregister noop
+#endif
+
+//=======================================================================
+// Cluster wide reader-writer lock
+//=======================================================================
+void  ?{}(__scheduler_RWLock_t & this) { // constructor: empty lock with room for __max_processors() slots
+	this.max   = __max_processors(); // capacity of the data array, fixed for the lifetime of the lock
+	this.alloc = 0; // no slot reserved yet (register_proc_id reserves with a fetch-add on alloc)
+	this.ready = 0; // no slot published yet (register_proc_id publishes with a CAS on ready)
+	this.data  = alloc(this.max); // one entry per potential slot; entries are only read below index ready
+	this.write_lock  = false; // writer lock starts released
+
+	/*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.alloc), &this.alloc));
+	/*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.ready), &this.ready));
+
+}
+void ^?{}(__scheduler_RWLock_t & this) { // destructor
+	free(this.data); // release the slot array allocated by the constructor
+}
+
+
+//=======================================================================
+// Lock-Free registering/unregistering of threads
+unsigned register_proc_id( void ) with(*__scheduler_lock) { // claims a slot of __scheduler_lock->data for the calling thread's sched_lock and returns its index
+	__kernel_rseq_register();
+
+	bool * handle = (bool *)&kernelTLS().sched_lock; // address stored in the claimed slot
+
+	// Step - 1 : check if there is already space in the data
+	uint_fast32_t s = ready; // snapshot: only slots published before this point are scanned
+
+	// Check among all the ready
+	for(uint_fast32_t i = 0; i < s; i++) {
+		bool * volatile * cell = (bool * volatile *)&data[i]; // Cforall is bugged and the double volatiles causes problems
+		/* paranoid */ verify( handle != *cell );
+
+		bool * null = 0p; // Re-write every loop since compare thrashes it
+		if( __atomic_load_n(cell, (int)__ATOMIC_RELAXED) == null // cheap relaxed read first, CAS only on cells that look free
+			&& __atomic_compare_exchange_n( cell, &null, handle, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+			/* paranoid */ verify(i < ready);
+			/* paranoid */ verify( (kernelTLS().sched_id = i, true) ); // assignment lives inside verify(): presumably sched_id only exists for the paranoid checks
+			return i;
+		}
+	}
+
+	if(max <= alloc) abort("Trying to create more than %u processors", __scheduler_lock->max); // %u, not %ud: the old format printed a stray 'd' after the number
+
+	// Step - 2 : F&A to get a new spot in the array.
+	uint_fast32_t n = __atomic_fetch_add(&alloc, 1, __ATOMIC_SEQ_CST);
+	if(max <= n) abort("Trying to create more than %u processors", __scheduler_lock->max); // re-check: another thread may have taken the last spot since the test above
+
+	// Step - 3 : Mark space as used and then publish it.
+	data[n] = handle;
+	while() { // spin until ready == n, i.e. every lower index has been published, then bump ready to n + 1
+		unsigned copy = n; // re-initialised every iteration because a failed CAS overwrites it
+		if( __atomic_load_n(&ready, __ATOMIC_RELAXED) == n
+			&& __atomic_compare_exchange_n(&ready, &copy, n + 1, true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
+			break;
+		Pause();
+	}
+
+	// Return new spot.
+	/* paranoid */ verify(n < ready);
+	/* paranoid */ verify( (kernelTLS().sched_id = n, true) );
+	return n;
+}
+
+void unregister_proc_id( unsigned id ) with(*__scheduler_lock) { // releases slot id, which must be the one register_proc_id returned to this thread
+	/* paranoid */ verify(id < ready);
+	/* paranoid */ verify(id == kernelTLS().sched_id);
+	/* paranoid */ verify(data[id] == &kernelTLS().sched_lock);
+
+	bool * volatile * cell = (bool * volatile *)&data[id]; // Cforall is bugged and the double volatiles causes problems
+
+	__atomic_store_n(cell, 0p, __ATOMIC_RELEASE); // a null cell is what step 1 of register_proc_id looks for, so the slot becomes reusable
+
+	__kernel_rseq_unregister();
+}
+
+//-----------------------------------------------------------------------
+// Writer side : acquire when changing the ready queue, e.g. adding more
+//  queues or removing them.
+uint_fast32_t ready_mutate_lock( void ) with(*__scheduler_lock) { // returns the number of slots scanned; ready_mutate_unlock takes that count as last_s
+	/* paranoid */ verify( ! __preemption_enabled() );
+
+	// Step 1 : lock global lock
+	// It is needed to avoid processors that register mid Critical-Section
+	//   to simply lock their own lock and enter.
+	__atomic_acquire( &write_lock );
+
+	// Make sure we won't deadlock ourself
+	// Checking before acquiring the writer lock isn't safe
+	// because someone else could have locked us.
+	/* paranoid */ verify( ! kernelTLS().sched_lock );
+
+	// Step 2 : lock per-proc lock
+	// Processors that are currently being registered aren't counted
+	//   but can't be in read_lock or in the critical section.
+	// All other processors are counted
+	uint_fast32_t s = ready; // snapshot taken while holding write_lock
+	for(uint_fast32_t i = 0; i < s; i++) {
+		volatile bool * llock = data[i];
+		if(llock) __atomic_acquire( llock ); // null entries are unregistered slots and are skipped
+	}
+
+	/* paranoid */ verify( ! __preemption_enabled() );
+	return s;
+}
+
+void ready_mutate_unlock( uint_fast32_t last_s ) with(*__scheduler_lock) { // last_s: number of slots to release, i.e. the value ready_mutate_lock returned
+	/* paranoid */ verify( ! __preemption_enabled() );
+
+	// Step 1 : release local locks
+	// This must be done while the global lock is held to avoid
+	//   threads that were created mid critical section
+	//   to race to lock their local locks and have the writer
+	//   immediately unlock them
+	// Alternative solution : return s in write_lock and pass it to write_unlock
+	for(uint_fast32_t i = 0; i < last_s; i++) {
+		volatile bool * llock = data[i];
+		if(llock) __atomic_store_n(llock, (bool)false, __ATOMIC_RELEASE); // null entries are unregistered slots and are skipped
+	}
+
+	// Step 2 : release global lock
+	/*paranoid*/ assert(true == write_lock);
+	__atomic_store_n(&write_lock, (bool)false, __ATOMIC_RELEASE);
+
+	/* paranoid */ verify( ! __preemption_enabled() );
+}
+
+//=======================================================================
+// Cluster growth
+static const unsigned __readyq_single_shard = 2; // lane count used by grow/shrink when the cluster has fewer than 2 processors
+
+//-----------------------------------------------------------------------
+// Check that all the intrusive queues in the data structure are still consistent
+static void check( __ready_queue_t & q ) with (q) { // body is compiled out unless __CFA_WITH_VERIFY__ is defined
+	#if defined(__CFA_WITH_VERIFY__)
+		{
+			for( idx ; lanes.count ) {
+				__intrusive_lane_t & sl = lanes.data[idx];
+				assert(!lanes.data[idx].lock); // no lane may be locked while the queue is being checked
+
+					if(is_empty(sl)) { // empty lane: null next, ts of -1, and prev pointing back at the lane's own mock head
+						assert( sl.anchor.next == 0p );
+						assert( sl.anchor.ts   == -1llu );
+						assert( mock_head(sl)  == sl.prev );
+					} else { // non-empty lane: all three conditions are reversed
+						assert( sl.anchor.next != 0p );
+						assert( sl.anchor.ts   != -1llu );
+						assert( mock_head(sl)  != sl.prev );
+					}
+			}
+		}
+	#endif
+}
+
+// Call this function if the intrusive list was moved using memcpy;
+// it fixes the list so that the pointers back to anchors aren't left dangling
+static inline void fix(__intrusive_lane_t & ll) {
+			if(is_empty(ll)) { // only an empty lane has prev pointing into the lane itself (its mock head)
+				verify(ll.anchor.next == 0p);
+				ll.prev = mock_head(ll); // recompute from the lane's new address
+			}
+}
+
+static void assign_list(unsigned & value, dlist(processor) & list, unsigned count) { // gives the first count processors of list consecutive lane ranges starting at value; value is advanced for the caller
+	processor * it = &list`first;
+	for(unsigned i = 0; i < count; i++) {
+		/* paranoid */ verifyf( it, "Unexpected null iterator, at index %u of %u\n", i, count);
+		it->rdq.id = value; // index of this processor's first lane
+		it->rdq.target = MAX; // presumably MAX means "no target selected" -- TODO confirm against ready_queue.cfa
+		value += __readyq_shard_factor; // each processor owns __readyq_shard_factor lanes
+		it = &(*it)`next;
+	}
+}
+
+static void reassign_cltr_id(struct cluster * cltr) { // renumbers rdq.id for every processor of the cluster: actives first, then idles
+	unsigned preferred = 0; // running lane index, advanced by assign_list through its reference parameter
+	assign_list(preferred, cltr->procs.actives, cltr->procs.total - cltr->procs.idle); // active count = total - idle
+	assign_list(preferred, cltr->procs.idles  , cltr->procs.idle );
+}
+
+static void fix_times( struct cluster * cltr ) with( cltr->ready_queue ) { // resizes lanes.tscs to lanes.count entries and resets every entry
+	lanes.tscs = alloc(lanes.count, lanes.tscs`realloc);
+	for(i; lanes.count) {
+		lanes.tscs[i].tv = rdtscl(); // current timestamp counter reading
+		lanes.tscs[i].ma = 0; // presumably the moving average for the lane -- TODO confirm field meaning
+	}
+}
+
+// Grow the ready queue so it has lanes for cltr->procs.total processors; the caller must hold the ready-mutate lock
+void ready_queue_grow(struct cluster * cltr) {
+	size_t ncount; // new number of lanes
+	int target = cltr->procs.total; // number of processors the queue must serve
+
+	/* paranoid */ verify( ready_mutate_islocked() );
+	__cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue\n");
+
+	// Make sure that everything is consistent
+	/* paranoid */ check( cltr->ready_queue );
+
+	// grow the ready queue
+	with( cltr->ready_queue ) {
+		// Find new count
+		// Make sure we always have at least 1 list
+		if(target >= 2) {
+			ncount = target * __readyq_shard_factor;
+		} else {
+			ncount = __readyq_single_shard;
+		}
+
+		// Allocate new array (uses realloc and memcpies the data)
+		lanes.data = alloc( ncount, lanes.data`realloc );
+
+		// Fix the moved data
+		for( idx; (size_t)lanes.count ) { // lanes.count is still the old count here
+			fix(lanes.data[idx]);
+		}
+
+		// Construct new data
+		for( idx; (size_t)lanes.count ~ ncount) { // only the lanes added beyond the old count
+			(lanes.data[idx]){};
+		}
+
+		// Update original
+		lanes.count = ncount;
+
+		lanes.caches = alloc( target, lanes.caches`realloc ); // one cache entry per processor
+	}
+
+	fix_times(cltr);
+
+	reassign_cltr_id(cltr);
+
+	// Make sure that everything is consistent
+	/* paranoid */ check( cltr->ready_queue );
+
+	__cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue done\n");
+
+	/* paranoid */ verify( ready_mutate_islocked() );
+}
+
+// Shrink the ready queue to match cltr->procs.total, re-pushing threads from the removed lanes; the caller must hold the ready-mutate lock
+void ready_queue_shrink(struct cluster * cltr) {
+	/* paranoid */ verify( ready_mutate_islocked() );
+	__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue\n");
+
+	// Make sure that everything is consistent
+	/* paranoid */ check( cltr->ready_queue );
+
+	int target = cltr->procs.total; // number of processors the queue must serve
+
+	with( cltr->ready_queue ) {
+		// Remember old count
+		size_t ocount = lanes.count;
+
+		// Find new count
+		// Make sure we always have at least 1 list
+		lanes.count = target >= 2 ? target * __readyq_shard_factor: __readyq_single_shard;
+		/* paranoid */ verify( ocount >= lanes.count );
+		/* paranoid */ verify( lanes.count == target * __readyq_shard_factor || target < 2 );
+
+		// for printing count the number of displaced threads
+		#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
+			__attribute__((unused)) size_t displaced = 0;
+		#endif
+
+		// redistribute old data
+		for( idx; (size_t)lanes.count ~ ocount) { // only the lanes being removed; lanes.count is already the new count
+			// Lock is not strictly needed but makes checking invariants much easier
+			__attribute__((unused)) bool locked = __atomic_try_acquire(&lanes.data[idx].lock);
+			verify(locked);
+
+			// As long as we can pop from this lane to push the threads somewhere else in the queue
+			while(!is_empty(lanes.data[idx])) {
+				struct thread$ * thrd;
+				unsigned long long _; // second element of the tuple returned by pop, unused here
+				[thrd, _] = pop(lanes.data[idx]);
+
+				push(cltr, thrd, true); // push runs after lanes.count was lowered above
+
+				// for printing count the number of displaced threads
+				#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
+					displaced++;
+				#endif
+			}
+
+			// Unlock the lane
+			__atomic_unlock(&lanes.data[idx].lock);
+
+			// TODO print the queue statistics here
+
+			^(lanes.data[idx]){}; // destroy the now-empty lane before the array is shrunk
+		}
+
+		__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue displaced %zu threads\n", displaced); // displaced is declared only under the debug-print macros above; presumably this call compiles away otherwise
+
+		// Allocate new array (uses realloc and memcpies the data)
+		lanes.data = alloc( lanes.count, lanes.data`realloc );
+
+		// Fix the moved data
+		for( idx; (size_t)lanes.count ) {
+			fix(lanes.data[idx]);
+		}
+
+		lanes.caches = alloc( target, lanes.caches`realloc ); // one cache entry per processor
+	}
+
+	fix_times(cltr);
+
+
+	reassign_cltr_id(cltr);
+
+	// Make sure that everything is consistent
+	/* paranoid */ check( cltr->ready_queue );
+
+	__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue done\n");
+	/* paranoid */ verify( ready_mutate_islocked() );
+}
+
+// Ctor: builds an empty, unlocked lane (the same state check() expects of an empty lane)
+void ?{}( __intrusive_lane_t & this ) {
+	this.lock = false;
+	this.prev = mock_head(this); // empty lane: prev points back at the lane's own mock head
+	this.anchor.next = 0p;
+	this.anchor.ts   = -1llu; // -1 is the timestamp value used for an empty lane
+	#if !defined(__CFA_NO_STATISTICS__)
+		this.cnt  = 0; // field is only initialised here when statistics are compiled in
+	#endif
+
+	// We add a boat-load of assertions here because the anchor code is very fragile
+	/* paranoid */ _Static_assert( offsetof( thread$, link ) == offsetof(__intrusive_lane_t, anchor) );
+	/* paranoid */ verify( offsetof( thread$, link ) == offsetof(__intrusive_lane_t, anchor) );
+	/* paranoid */ verify( ((uintptr_t)( mock_head(this) ) + offsetof( thread$, link )) == (uintptr_t)(&this.anchor) ); // the mock head's link field must land exactly on this.anchor
+	/* paranoid */ verify( &mock_head(this)->link.next == &this.anchor.next );
+	/* paranoid */ verify( &mock_head(this)->link.ts   == &this.anchor.ts   );
+	/* paranoid */ verify( mock_head(this)->link.next == 0p );
+	/* paranoid */ verify( mock_head(this)->link.ts   == -1llu  );
+	/* paranoid */ verify( mock_head(this) == this.prev );
+	/* paranoid */ verify( __alignof__(__intrusive_lane_t) == 128 );
+	/* paranoid */ verify( __alignof__(this) == 128 );
+	/* paranoid */ verifyf( ((intptr_t)(&this) % 128) == 0, "Expected address to be aligned %p %% 128 == %zd", &this, ((intptr_t)(&this) % 128) );
+}
+
+// Dtor is trivial: it releases nothing and only verifies that the lane is empty
+void ^?{}( __intrusive_lane_t & this ) {
+	// Make sure the list is empty
+	/* paranoid */ verify( this.anchor.next == 0p );
+	/* paranoid */ verify( this.anchor.ts   == -1llu );
+	/* paranoid */ verify( mock_head(this)  == this.prev );
+}
+
+#if   defined(CFA_HAVE_LINUX_LIBRSEQ)
+	// No definition needed
+#elif defined(CFA_HAVE_LINUX_RSEQ_H)
+
+	#if defined( __x86_64 ) || defined( __i386 ) // RSEQ_SIG is the signature passed as the last argument of the rseq syscalls below
+		#define RSEQ_SIG	0x53053053
+	#elif defined( __ARM_ARCH )
+		#ifdef __ARMEB__
+		#define RSEQ_SIG    0xf3def5e7      /* udf    #24035    ; 0x5de3 (ARMv6+) */
+		#else
+		#define RSEQ_SIG    0xe7f5def3      /* udf    #24035    ; 0x5de3 */
+		#endif
+	#endif // NOTE(review): no #else branch, so RSEQ_SIG stays undefined on any other architecture
+
+	extern void __disable_interrupts_hard();
+	extern void __enable_interrupts_hard();
+
+	static void __kernel_raw_rseq_register  (void) { // registers __cfaabi_rseq through the raw rseq syscall; aborts on any failure
+		/* paranoid */ verify( __cfaabi_rseq.cpu_id == RSEQ_CPU_ID_UNINITIALIZED );
+
+		// int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), 0, (sigset_t *)0p, _NSIG / 8);
+		int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), 0, RSEQ_SIG); // flags == 0 requests registration
+		if(ret != 0) {
+			int e = errno; // saved immediately, before any other call can overwrite errno
+			switch(e) { // no break needed: every case aborts
+			case EINVAL: abort("KERNEL ERROR: rseq register invalid argument");
+			case ENOSYS: abort("KERNEL ERROR: rseq register no supported");
+			case EFAULT: abort("KERNEL ERROR: rseq register with invalid argument");
+			case EBUSY : abort("KERNEL ERROR: rseq register already registered");
+			case EPERM : abort("KERNEL ERROR: rseq register sig  argument  on unregistration does not match the signature received on registration");
+			default: abort("KERNEL ERROR: rseq register unexpected return %d", e);
+			}
+		}
+	}
+
+	static void __kernel_raw_rseq_unregister(void) { // unregisters __cfaabi_rseq through the raw rseq syscall; aborts on any failure
+		/* paranoid */ verify( __cfaabi_rseq.cpu_id >= 0 );
+
+		// int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, (sigset_t *)0p, _NSIG / 8);
+		int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, RSEQ_SIG); // same area, length and signature as the registration call
+		if(ret != 0) {
+			int e = errno; // saved immediately, before any other call can overwrite errno
+			switch(e) { // no break needed: every case aborts
+			case EINVAL: abort("KERNEL ERROR: rseq unregister invalid argument");
+			case ENOSYS: abort("KERNEL ERROR: rseq unregister no supported");
+			case EFAULT: abort("KERNEL ERROR: rseq unregister with invalid argument");
+			case EBUSY : abort("KERNEL ERROR: rseq unregister already registered");
+			case EPERM : abort("KERNEL ERROR: rseq unregister sig  argument  on unregistration does not match the signature received on registration");
+			default: abort("KERNEL ERROR: rseq unregister unexpected return %d", e); // message was garbled as "unregisteunexpected"
+			}
+		}
+	}
+#else
+	// No definition needed
+#endif
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 3c4bf050c37d1391ba25a62e7b3d185e99ef7f44)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision c42b8a1098b5adb1a163766aabb8c1e1b5f99fc9)
@@ -365,4 +365,20 @@
 void ready_queue_shrink(struct cluster * cltr);
 
+//-----------------------------------------------------------------------
+// Calc moving average from the current time (currtsc), the insert time (instsc) and the previous average (old_avg): returns (4 * max(currtsc - instsc, 0) + 12 * old_avg) / 16.
+static inline unsigned long long moving_average(unsigned long long currtsc, unsigned long long instsc, unsigned long long old_avg) {
+	/* paranoid */ verifyf( currtsc < 45000000000000000, "Suspiciously large current time: %'llu (%llx)\n", currtsc, currtsc );
+	/* paranoid */ verifyf( instsc  < 45000000000000000, "Suspiciously large insert time: %'llu (%llx)\n", instsc, instsc );
+	/* paranoid */ verifyf( old_avg < 15000000000000, "Suspiciously large previous average: %'llu (%llx)\n", old_avg, old_avg );
+
+	const unsigned long long new_val = currtsc > instsc ? currtsc - instsc : 0; // clamp to 0 instead of wrapping when instsc >= currtsc
+	const unsigned long long total_weight = 16;
+	const unsigned long long new_weight   = 4; // the new sample counts for 4/16
+	const unsigned long long old_weight = total_weight - new_weight; // the previous average counts for 12/16
+	const unsigned long long ret = ((new_weight * new_val) + (old_weight * old_avg)) / total_weight; // integer division, truncates
+	return ret;
+}
+
+static const unsigned __readyq_shard_factor = 2; // ready-queue lanes per processor: lane counts and rdq.id are computed in multiples of this
 
 // Local Variables: //
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision 3c4bf050c37d1391ba25a62e7b3d185e99ef7f44)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision c42b8a1098b5adb1a163766aabb8c1e1b5f99fc9)
@@ -26,14 +26,8 @@
 #include "kernel_private.hfa"
 
-#include "stdlib.hfa"
 #include "limits.hfa"
-#include "math.hfa"
-
-#include <errno.h>
-#include <unistd.h>
-
-extern "C" {
-	#include <sys/syscall.h>  // __NR_xxx
-}
+
+// #include <errno.h>
+// #include <unistd.h>
 
 #include "ready_subqueue.hfa"
@@ -47,203 +41,11 @@
 #endif
 
-// No overriden function, no environment variable, no define
-// fall back to a magic number
-#ifndef __CFA_MAX_PROCESSORS__
-	#define __CFA_MAX_PROCESSORS__ 1024
-#endif
-
-#define READYQ_SHARD_FACTOR 2
-#define SEQUENTIAL_SHARD 2
-
 static inline struct thread$ * try_pop(struct cluster * cltr, unsigned w __STATS(, __stats_readyQ_pop_t & stats));
 static inline struct thread$ * try_pop(struct cluster * cltr, unsigned i, unsigned j __STATS(, __stats_readyQ_pop_t & stats));
 static inline struct thread$ * search(struct cluster * cltr);
 
-
-// returns the maximum number of processors the RWLock support
-__attribute__((weak)) unsigned __max_processors() {
-	const char * max_cores_s = getenv("CFA_MAX_PROCESSORS");
-	if(!max_cores_s) {
-		__cfadbg_print_nolock(ready_queue, "No CFA_MAX_PROCESSORS in ENV\n");
-		return __CFA_MAX_PROCESSORS__;
-	}
-
-	char * endptr = 0p;
-	long int max_cores_l = strtol(max_cores_s, &endptr, 10);
-	if(max_cores_l < 1 || max_cores_l > 65535) {
-		__cfadbg_print_nolock(ready_queue, "CFA_MAX_PROCESSORS out of range : %ld\n", max_cores_l);
-		return __CFA_MAX_PROCESSORS__;
-	}
-	if('\0' != *endptr) {
-		__cfadbg_print_nolock(ready_queue, "CFA_MAX_PROCESSORS not a decimal number : %s\n", max_cores_s);
-		return __CFA_MAX_PROCESSORS__;
-	}
-
-	return max_cores_l;
-}
-
-#if   defined(CFA_HAVE_LINUX_LIBRSEQ)
-	// No forward declaration needed
-	#define __kernel_rseq_register rseq_register_current_thread
-	#define __kernel_rseq_unregister rseq_unregister_current_thread
-#elif defined(CFA_HAVE_LINUX_RSEQ_H)
-	static void __kernel_raw_rseq_register  (void);
-	static void __kernel_raw_rseq_unregister(void);
-
-	#define __kernel_rseq_register __kernel_raw_rseq_register
-	#define __kernel_rseq_unregister __kernel_raw_rseq_unregister
-#else
-	// No forward declaration needed
-	// No initialization needed
-	static inline void noop(void) {}
-
-	#define __kernel_rseq_register noop
-	#define __kernel_rseq_unregister noop
-#endif
-
-//=======================================================================
-// Cluster wide reader-writer lock
-//=======================================================================
-void  ?{}(__scheduler_RWLock_t & this) {
-	this.max   = __max_processors();
-	this.alloc = 0;
-	this.ready = 0;
-	this.data  = alloc(this.max);
-	this.write_lock  = false;
-
-	/*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.alloc), &this.alloc));
-	/*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.ready), &this.ready));
-
-}
-void ^?{}(__scheduler_RWLock_t & this) {
-	free(this.data);
-}
-
-
-//=======================================================================
-// Lock-Free registering/unregistering of threads
-unsigned register_proc_id( void ) with(*__scheduler_lock) {
-	__kernel_rseq_register();
-
-	bool * handle = (bool *)&kernelTLS().sched_lock;
-
-	// Step - 1 : check if there is already space in the data
-	uint_fast32_t s = ready;
-
-	// Check among all the ready
-	for(uint_fast32_t i = 0; i < s; i++) {
-		bool * volatile * cell = (bool * volatile *)&data[i]; // Cforall is bugged and the double volatiles causes problems
-		/* paranoid */ verify( handle != *cell );
-
-		bool * null = 0p; // Re-write every loop since compare thrashes it
-		if( __atomic_load_n(cell, (int)__ATOMIC_RELAXED) == null
-			&& __atomic_compare_exchange_n( cell, &null, handle, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-			/* paranoid */ verify(i < ready);
-			/* paranoid */ verify( (kernelTLS().sched_id = i, true) );
-			return i;
-		}
-	}
-
-	if(max <= alloc) abort("Trying to create more than %ud processors", __scheduler_lock->max);
-
-	// Step - 2 : F&A to get a new spot in the array.
-	uint_fast32_t n = __atomic_fetch_add(&alloc, 1, __ATOMIC_SEQ_CST);
-	if(max <= n) abort("Trying to create more than %ud processors", __scheduler_lock->max);
-
-	// Step - 3 : Mark space as used and then publish it.
-	data[n] = handle;
-	while() {
-		unsigned copy = n;
-		if( __atomic_load_n(&ready, __ATOMIC_RELAXED) == n
-			&& __atomic_compare_exchange_n(&ready, &copy, n + 1, true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
-			break;
-		Pause();
-	}
-
-	// Return new spot.
-	/* paranoid */ verify(n < ready);
-	/* paranoid */ verify( (kernelTLS().sched_id = n, true) );
-	return n;
-}
-
-void unregister_proc_id( unsigned id ) with(*__scheduler_lock) {
-	/* paranoid */ verify(id < ready);
-	/* paranoid */ verify(id == kernelTLS().sched_id);
-	/* paranoid */ verify(data[id] == &kernelTLS().sched_lock);
-
-	bool * volatile * cell = (bool * volatile *)&data[id]; // Cforall is bugged and the double volatiles causes problems
-
-	__atomic_store_n(cell, 0p, __ATOMIC_RELEASE);
-
-	__kernel_rseq_unregister();
-}
-
-//-----------------------------------------------------------------------
-// Writer side : acquire when changing the ready queue, e.g. adding more
-//  queues or removing them.
-uint_fast32_t ready_mutate_lock( void ) with(*__scheduler_lock) {
-	/* paranoid */ verify( ! __preemption_enabled() );
-
-	// Step 1 : lock global lock
-	// It is needed to avoid processors that register mid Critical-Section
-	//   to simply lock their own lock and enter.
-	__atomic_acquire( &write_lock );
-
-	// Make sure we won't deadlock ourself
-	// Checking before acquiring the writer lock isn't safe
-	// because someone else could have locked us.
-	/* paranoid */ verify( ! kernelTLS().sched_lock );
-
-	// Step 2 : lock per-proc lock
-	// Processors that are currently being registered aren't counted
-	//   but can't be in read_lock or in the critical section.
-	// All other processors are counted
-	uint_fast32_t s = ready;
-	for(uint_fast32_t i = 0; i < s; i++) {
-		volatile bool * llock = data[i];
-		if(llock) __atomic_acquire( llock );
-	}
-
-	/* paranoid */ verify( ! __preemption_enabled() );
-	return s;
-}
-
-void ready_mutate_unlock( uint_fast32_t last_s ) with(*__scheduler_lock) {
-	/* paranoid */ verify( ! __preemption_enabled() );
-
-	// Step 1 : release local locks
-	// This must be done while the global lock is held to avoid
-	//   threads that where created mid critical section
-	//   to race to lock their local locks and have the writer
-	//   immidiately unlock them
-	// Alternative solution : return s in write_lock and pass it to write_unlock
-	for(uint_fast32_t i = 0; i < last_s; i++) {
-		volatile bool * llock = data[i];
-		if(llock) __atomic_store_n(llock, (bool)false, __ATOMIC_RELEASE);
-	}
-
-	// Step 2 : release global lock
-	/*paranoid*/ assert(true == write_lock);
-	__atomic_store_n(&write_lock, (bool)false, __ATOMIC_RELEASE);
-
-	/* paranoid */ verify( ! __preemption_enabled() );
-}
-
 //=======================================================================
 // Cforall Ready Queue used for scheduling
 //=======================================================================
-unsigned long long moving_average(unsigned long long currtsc, unsigned long long instsc, unsigned long long old_avg) {
-	/* paranoid */ verifyf( currtsc < 45000000000000000, "Suspiciously large current time: %'llu (%llx)\n", currtsc, currtsc );
-	/* paranoid */ verifyf( instsc  < 45000000000000000, "Suspiciously large insert time: %'llu (%llx)\n", instsc, instsc );
-	/* paranoid */ verifyf( old_avg < 15000000000000, "Suspiciously large previous average: %'llu (%llx)\n", old_avg, old_avg );
-
-	const unsigned long long new_val = currtsc > instsc ? currtsc - instsc : 0;
-	const unsigned long long total_weight = 16;
-	const unsigned long long new_weight   = 4;
-	const unsigned long long old_weight = total_weight - new_weight;
-	const unsigned long long ret = ((new_weight * new_val) + (old_weight * old_avg)) / total_weight;
-	return ret;
-}
-
 void ?{}(__ready_queue_t & this) with (this) {
 	lanes.data   = 0p;
@@ -271,10 +73,10 @@
 		// Figure out where thread was last time and make sure it's valid
 		/* paranoid */ verify(thrd->preferred >= 0);
-		if(thrd->preferred * READYQ_SHARD_FACTOR < lanes.count) {
-			/* paranoid */ verify(thrd->preferred * READYQ_SHARD_FACTOR < lanes.count);
-			unsigned start = thrd->preferred * READYQ_SHARD_FACTOR;
+		if(thrd->preferred * __readyq_shard_factor < lanes.count) {
+			/* paranoid */ verify(thrd->preferred * __readyq_shard_factor < lanes.count);
+			unsigned start = thrd->preferred * __readyq_shard_factor;
 			do {
 				unsigned r = __tls_rand();
-				i = start + (r % READYQ_SHARD_FACTOR);
+				i = start + (r % __readyq_shard_factor);
 				/* paranoid */ verify( i < lanes.count );
 				// If we can't lock it retry
@@ -288,5 +90,5 @@
 		do {
 			unsigned r = proc->rdq.its++;
-			i = proc->rdq.id + (r % READYQ_SHARD_FACTOR);
+			i = proc->rdq.id + (r % __readyq_shard_factor);
 			/* paranoid */ verify( i < lanes.count );
 			// If we can't lock it retry
@@ -309,5 +111,5 @@
 	unsigned start = proc->rdq.id;
 	unsigned long long max = 0;
-	for(i; READYQ_SHARD_FACTOR) {
+	for(i; __readyq_shard_factor) {
 		unsigned long long ptsc = ts(rdq.lanes.data[start + i]);
 		if(ptsc != -1ull) {
@@ -338,6 +140,6 @@
 	// Super important: don't write the same value over and over again
 	// We want to maximise our chances that his particular values stays in cache
-	if(lanes.caches[this / READYQ_SHARD_FACTOR].id != this_cache)
-		__atomic_store_n(&lanes.caches[this / READYQ_SHARD_FACTOR].id, this_cache, __ATOMIC_RELAXED);
+	if(lanes.caches[this / __readyq_shard_factor].id != this_cache)
+		__atomic_store_n(&lanes.caches[this / __readyq_shard_factor].id, this_cache, __ATOMIC_RELAXED);
 
 	const unsigned long long ctsc = rdtscl();
@@ -348,5 +150,5 @@
 		unsigned other  = (chaos >> 8) % (lanes.count);
 
-		if(ext < 3 || __atomic_load_n(&lanes.caches[other / READYQ_SHARD_FACTOR].id, __ATOMIC_RELAXED) == this_cache) {
+		if(ext < 3 || __atomic_load_n(&lanes.caches[other / __readyq_shard_factor].id, __ATOMIC_RELAXED) == this_cache) {
 			proc->rdq.target = other;
 		}
@@ -368,6 +170,6 @@
 	}
 
-	for(READYQ_SHARD_FACTOR) {
-		unsigned i = this + (proc->rdq.itr++ % READYQ_SHARD_FACTOR);
+	for(__readyq_shard_factor) {
+		unsigned i = this + (proc->rdq.itr++ % __readyq_shard_factor);
 		if(thread$ * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t;
 	}
@@ -439,5 +241,5 @@
 	}
 
-	thrd->preferred = w / READYQ_SHARD_FACTOR;
+	thrd->preferred = w / __readyq_shard_factor;
 
 	// return the popped thread
@@ -476,27 +278,4 @@
 
 //-----------------------------------------------------------------------
-// Check that all the intrusive queues in the data structure are still consistent
-static void check( __ready_queue_t & q ) with (q) {
-	#if defined(__CFA_WITH_VERIFY__)
-		{
-			for( idx ; lanes.count ) {
-				__intrusive_lane_t & sl = lanes.data[idx];
-				assert(!lanes.data[idx].lock);
-
-					if(is_empty(sl)) {
-						assert( sl.anchor.next == 0p );
-						assert( sl.anchor.ts   == -1llu );
-						assert( mock_head(sl)  == sl.prev );
-					} else {
-						assert( sl.anchor.next != 0p );
-						assert( sl.anchor.ts   != -1llu );
-						assert( mock_head(sl)  != sl.prev );
-					}
-			}
-		}
-	#endif
-}
-
-//-----------------------------------------------------------------------
 // Given 2 indexes, pick the list with the oldest push an try to pop from it
 static inline struct thread$ * try_pop(struct cluster * cltr, unsigned i, unsigned j __STATS(, __stats_readyQ_pop_t & stats)) with (cltr->ready_queue) {
@@ -509,230 +288,2 @@
 	return try_pop(cltr, w __STATS(, stats));
 }
-
-// Call this function of the intrusive list was moved using memcpy
-// fixes the list so that the pointers back to anchors aren't left dangling
-static inline void fix(__intrusive_lane_t & ll) {
-			if(is_empty(ll)) {
-				verify(ll.anchor.next == 0p);
-				ll.prev = mock_head(ll);
-			}
-}
-
-static void assign_list(unsigned & value, dlist(processor) & list, unsigned count) {
-	processor * it = &list`first;
-	for(unsigned i = 0; i < count; i++) {
-		/* paranoid */ verifyf( it, "Unexpected null iterator, at index %u of %u\n", i, count);
-		it->rdq.id = value;
-		it->rdq.target = MAX;
-		value += READYQ_SHARD_FACTOR;
-		it = &(*it)`next;
-	}
-}
-
-static void reassign_cltr_id(struct cluster * cltr) {
-	unsigned preferred = 0;
-	assign_list(preferred, cltr->procs.actives, cltr->procs.total - cltr->procs.idle);
-	assign_list(preferred, cltr->procs.idles  , cltr->procs.idle );
-}
-
-static void fix_times( struct cluster * cltr ) with( cltr->ready_queue ) {
-	lanes.tscs = alloc(lanes.count, lanes.tscs`realloc);
-	for(i; lanes.count) {
-		lanes.tscs[i].tv = rdtscl();
-		lanes.tscs[i].ma = 0;
-	}
-}
-
-// Grow the ready queue
-void ready_queue_grow(struct cluster * cltr) {
-	size_t ncount;
-	int target = cltr->procs.total;
-
-	/* paranoid */ verify( ready_mutate_islocked() );
-	__cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue\n");
-
-	// Make sure that everything is consistent
-	/* paranoid */ check( cltr->ready_queue );
-
-	// grow the ready queue
-	with( cltr->ready_queue ) {
-		// Find new count
-		// Make sure we always have atleast 1 list
-		if(target >= 2) {
-			ncount = target * READYQ_SHARD_FACTOR;
-		} else {
-			ncount = SEQUENTIAL_SHARD;
-		}
-
-		// Allocate new array (uses realloc and memcpies the data)
-		lanes.data = alloc( ncount, lanes.data`realloc );
-
-		// Fix the moved data
-		for( idx; (size_t)lanes.count ) {
-			fix(lanes.data[idx]);
-		}
-
-		// Construct new data
-		for( idx; (size_t)lanes.count ~ ncount) {
-			(lanes.data[idx]){};
-		}
-
-		// Update original
-		lanes.count = ncount;
-
-		lanes.caches = alloc( target, lanes.caches`realloc );
-	}
-
-	fix_times(cltr);
-
-	reassign_cltr_id(cltr);
-
-	// Make sure that everything is consistent
-	/* paranoid */ check( cltr->ready_queue );
-
-	__cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue done\n");
-
-	/* paranoid */ verify( ready_mutate_islocked() );
-}
-
-// Shrink the ready queue
-void ready_queue_shrink(struct cluster * cltr) {
-	/* paranoid */ verify( ready_mutate_islocked() );
-	__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue\n");
-
-	// Make sure that everything is consistent
-	/* paranoid */ check( cltr->ready_queue );
-
-	int target = cltr->procs.total;
-
-	with( cltr->ready_queue ) {
-		// Remember old count
-		size_t ocount = lanes.count;
-
-		// Find new count
-		// Make sure we always have atleast 1 list
-		lanes.count = target >= 2 ? target * READYQ_SHARD_FACTOR: SEQUENTIAL_SHARD;
-		/* paranoid */ verify( ocount >= lanes.count );
-		/* paranoid */ verify( lanes.count == target * READYQ_SHARD_FACTOR || target < 2 );
-
-		// for printing count the number of displaced threads
-		#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
-			__attribute__((unused)) size_t displaced = 0;
-		#endif
-
-		// redistribute old data
-		for( idx; (size_t)lanes.count ~ ocount) {
-			// Lock is not strictly needed but makes checking invariants much easier
-			__attribute__((unused)) bool locked = __atomic_try_acquire(&lanes.data[idx].lock);
-			verify(locked);
-
-			// As long as we can pop from this lane to push the threads somewhere else in the queue
-			while(!is_empty(lanes.data[idx])) {
-				struct thread$ * thrd;
-				unsigned long long _;
-				[thrd, _] = pop(lanes.data[idx]);
-
-				push(cltr, thrd, true);
-
-				// for printing count the number of displaced threads
-				#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
-					displaced++;
-				#endif
-			}
-
-			// Unlock the lane
-			__atomic_unlock(&lanes.data[idx].lock);
-
-			// TODO print the queue statistics here
-
-			^(lanes.data[idx]){};
-		}
-
-		__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue displaced %zu threads\n", displaced);
-
-		// Allocate new array (uses realloc and memcpies the data)
-		lanes.data = alloc( lanes.count, lanes.data`realloc );
-
-		// Fix the moved data
-		for( idx; (size_t)lanes.count ) {
-			fix(lanes.data[idx]);
-		}
-
-		lanes.caches = alloc( target, lanes.caches`realloc );
-	}
-
-	fix_times(cltr);
-
-
-	reassign_cltr_id(cltr);
-
-	// Make sure that everything is consistent
-	/* paranoid */ check( cltr->ready_queue );
-
-	__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue done\n");
-	/* paranoid */ verify( ready_mutate_islocked() );
-}
-
-#if !defined(__CFA_NO_STATISTICS__)
-	unsigned cnt(const __ready_queue_t & this, unsigned idx) {
-		/* paranoid */ verify(this.lanes.count > idx);
-		return this.lanes.data[idx].cnt;
-	}
-#endif
-
-
-#if   defined(CFA_HAVE_LINUX_LIBRSEQ)
-	// No definition needed
-#elif defined(CFA_HAVE_LINUX_RSEQ_H)
-
-	#if defined( __x86_64 ) || defined( __i386 )
-		#define RSEQ_SIG	0x53053053
-	#elif defined( __ARM_ARCH )
-		#ifdef __ARMEB__
-		#define RSEQ_SIG    0xf3def5e7      /* udf    #24035    ; 0x5de3 (ARMv6+) */
-		#else
-		#define RSEQ_SIG    0xe7f5def3      /* udf    #24035    ; 0x5de3 */
-		#endif
-	#endif
-
-	extern void __disable_interrupts_hard();
-	extern void __enable_interrupts_hard();
-
-	static void __kernel_raw_rseq_register  (void) {
-		/* paranoid */ verify( __cfaabi_rseq.cpu_id == RSEQ_CPU_ID_UNINITIALIZED );
-
-		// int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), 0, (sigset_t *)0p, _NSIG / 8);
-		int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), 0, RSEQ_SIG);
-		if(ret != 0) {
-			int e = errno;
-			switch(e) {
-			case EINVAL: abort("KERNEL ERROR: rseq register invalid argument");
-			case ENOSYS: abort("KERNEL ERROR: rseq register no supported");
-			case EFAULT: abort("KERNEL ERROR: rseq register with invalid argument");
-			case EBUSY : abort("KERNEL ERROR: rseq register already registered");
-			case EPERM : abort("KERNEL ERROR: rseq register sig  argument  on unregistration does not match the signature received on registration");
-			default: abort("KERNEL ERROR: rseq register unexpected return %d", e);
-			}
-		}
-	}
-
-	static void __kernel_raw_rseq_unregister(void) {
-		/* paranoid */ verify( __cfaabi_rseq.cpu_id >= 0 );
-
-		// int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, (sigset_t *)0p, _NSIG / 8);
-		int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
-		if(ret != 0) {
-			int e = errno;
-			switch(e) {
-			case EINVAL: abort("KERNEL ERROR: rseq unregister invalid argument");
-			case ENOSYS: abort("KERNEL ERROR: rseq unregister no supported");
-			case EFAULT: abort("KERNEL ERROR: rseq unregister with invalid argument");
-			case EBUSY : abort("KERNEL ERROR: rseq unregister already registered");
-			case EPERM : abort("KERNEL ERROR: rseq unregister sig  argument  on unregistration does not match the signature received on registration");
-			default: abort("KERNEL ERROR: rseq unregisteunexpected return %d", e);
-			}
-		}
-	}
-#else
-	// No definition needed
-#endif
Index: libcfa/src/concurrency/ready_subqueue.hfa
===================================================================
--- libcfa/src/concurrency/ready_subqueue.hfa	(revision 3c4bf050c37d1391ba25a62e7b3d185e99ef7f44)
+++ libcfa/src/concurrency/ready_subqueue.hfa	(revision c42b8a1098b5adb1a163766aabb8c1e1b5f99fc9)
@@ -25,36 +25,4 @@
 	);
 	return rhead;
-}
-
-// Ctor
-void ?{}( __intrusive_lane_t & this ) {
-	this.lock = false;
-	this.prev = mock_head(this);
-	this.anchor.next = 0p;
-	this.anchor.ts   = -1llu;
-	#if !defined(__CFA_NO_STATISTICS__)
-		this.cnt  = 0;
-	#endif
-
-	// We add a boat-load of assertions here because the anchor code is very fragile
-	/* paranoid */ _Static_assert( offsetof( thread$, link ) == offsetof(__intrusive_lane_t, anchor) );
-	/* paranoid */ verify( offsetof( thread$, link ) == offsetof(__intrusive_lane_t, anchor) );
-	/* paranoid */ verify( ((uintptr_t)( mock_head(this) ) + offsetof( thread$, link )) == (uintptr_t)(&this.anchor) );
-	/* paranoid */ verify( &mock_head(this)->link.next == &this.anchor.next );
-	/* paranoid */ verify( &mock_head(this)->link.ts   == &this.anchor.ts   );
-	/* paranoid */ verify( mock_head(this)->link.next == 0p );
-	/* paranoid */ verify( mock_head(this)->link.ts   == -1llu  );
-	/* paranoid */ verify( mock_head(this) == this.prev );
-	/* paranoid */ verify( __alignof__(__intrusive_lane_t) == 128 );
-	/* paranoid */ verify( __alignof__(this) == 128 );
-	/* paranoid */ verifyf( ((intptr_t)(&this) % 128) == 0, "Expected address to be aligned %p %% 128 == %zd", &this, ((intptr_t)(&this) % 128) );
-}
-
-// Dtor is trivial
-void ^?{}( __intrusive_lane_t & this ) {
-	// Make sure the list is empty
-	/* paranoid */ verify( this.anchor.next == 0p );
-	/* paranoid */ verify( this.anchor.ts   == -1llu );
-	/* paranoid */ verify( mock_head(this)  == this.prev );
 }