Index: libcfa/src/Makefile.am
===================================================================
--- libcfa/src/Makefile.am	(revision 98d6965d977b39416143b988d932952fb731637a)
+++ libcfa/src/Makefile.am	(revision 2a3d44698a8f41614824b5eaaf857999a33d334b)
@@ -48,5 +48,5 @@
 thread_headers_nosrc = concurrency/invoke.h
 thread_headers = concurrency/coroutine.hfa concurrency/thread.hfa concurrency/kernel.hfa concurrency/monitor.hfa concurrency/mutex.hfa
-thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa ${thread_headers:.hfa=.cfa}
+thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa concurrency/ready_queue.cfa ${thread_headers:.hfa=.cfa}
 else
 headers =
Index: libcfa/src/Makefile.in
===================================================================
--- libcfa/src/Makefile.in	(revision 98d6965d977b39416143b988d932952fb731637a)
+++ libcfa/src/Makefile.in	(revision 2a3d44698a8f41614824b5eaaf857999a33d334b)
@@ -165,7 +165,7 @@
 	concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa \
 	concurrency/invoke.c concurrency/preemption.cfa \
-	concurrency/coroutine.cfa concurrency/thread.cfa \
-	concurrency/kernel.cfa concurrency/monitor.cfa \
-	concurrency/mutex.cfa
+	concurrency/ready_queue.cfa concurrency/coroutine.cfa \
+	concurrency/thread.cfa concurrency/kernel.cfa \
+	concurrency/monitor.cfa concurrency/mutex.cfa
 @BUILDLIB_TRUE@am__objects_3 = concurrency/coroutine.lo \
 @BUILDLIB_TRUE@	concurrency/thread.lo concurrency/kernel.lo \
@@ -174,5 +174,6 @@
 @BUILDLIB_TRUE@	concurrency/CtxSwitch-@ARCHITECTURE@.lo \
 @BUILDLIB_TRUE@	concurrency/alarm.lo concurrency/invoke.lo \
-@BUILDLIB_TRUE@	concurrency/preemption.lo $(am__objects_3)
+@BUILDLIB_TRUE@	concurrency/preemption.lo \
+@BUILDLIB_TRUE@	concurrency/ready_queue.lo $(am__objects_3)
 am_libcfathread_la_OBJECTS = $(am__objects_4)
 libcfathread_la_OBJECTS = $(am_libcfathread_la_OBJECTS)
@@ -462,5 +463,5 @@
 @BUILDLIB_FALSE@thread_headers = 
 @BUILDLIB_TRUE@thread_headers = concurrency/coroutine.hfa concurrency/thread.hfa concurrency/kernel.hfa concurrency/monitor.hfa concurrency/mutex.hfa
-@BUILDLIB_TRUE@thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa ${thread_headers:.hfa=.cfa}
+@BUILDLIB_TRUE@thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa concurrency/ready_queue.cfa ${thread_headers:.hfa=.cfa}
 
 #----------------------------------------------------------------------------------------------------------------
@@ -598,4 +599,6 @@
 	concurrency/$(DEPDIR)/$(am__dirstamp)
 concurrency/preemption.lo: concurrency/$(am__dirstamp) \
+	concurrency/$(DEPDIR)/$(am__dirstamp)
+concurrency/ready_queue.lo: concurrency/$(am__dirstamp) \
 	concurrency/$(DEPDIR)/$(am__dirstamp)
 concurrency/coroutine.lo: concurrency/$(am__dirstamp) \
Index: libcfa/src/bits/defs.hfa
===================================================================
--- libcfa/src/bits/defs.hfa	(revision 98d6965d977b39416143b988d932952fb731637a)
+++ libcfa/src/bits/defs.hfa	(revision 2a3d44698a8f41614824b5eaaf857999a33d334b)
@@ -53,2 +53,36 @@
     return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
 }
+
+// #define __CFA_NO_BIT_TEST_AND_SET__
+
+static inline bool bts(volatile unsigned long long int * target, unsigned long long int bit ) {
+	#if defined(__CFA_NO_BIT_TEST_AND_SET__)
+        unsigned long long int mask = 1ull << bit;
+        unsigned long long int ret = __atomic_fetch_or(target, mask, (int)__ATOMIC_RELAXED);
+        return (ret & mask) != 0;
+    #else
+        int result = 0;
+        asm volatile(
+            "LOCK btsq %[bit], %[target]\n\t"
+            : "=@ccc" (result)
+            : [target] "m" (*target), [bit] "r" (bit)
+        );
+        return result != 0;
+    #endif
+}
+
+static inline bool btr(volatile unsigned long long int * target, unsigned long long int bit ) {
+	#if defined(__CFA_NO_BIT_TEST_AND_SET__)
+        unsigned long long int mask = 1ull << bit;
+        unsigned long long int ret = __atomic_fetch_and(target, ~mask, (int)__ATOMIC_RELAXED);
+        return (ret & mask) != 0;
+	#else
+        int result = 0;
+        asm volatile(
+            "LOCK btrq %[bit], %[target]\n\t"
+            :"=@ccc" (result)
+            : [target] "m" (*target), [bit] "r" (bit)
+        );
+        return result != 0;
+    #endif
+}
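+
+// Illustrative usage sketch (comments only, not part of the library):
+//   static volatile unsigned long long ready_mask = 0;
+//   bool was_set = bts(&ready_mask, 5); // atomically set bit 5, return its previous value
+//   was_set      = btr(&ready_mask, 5); // atomically clear bit 5, return its previous value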
Index: libcfa/src/concurrency/invoke.h
===================================================================
--- libcfa/src/concurrency/invoke.h	(revision 98d6965d977b39416143b988d932952fb731637a)
+++ libcfa/src/concurrency/invoke.h	(revision 2a3d44698a8f41614824b5eaaf857999a33d334b)
@@ -158,4 +158,12 @@
 	};
 
+	// Linked-list fields
+	// intrusive link fields for threads
+	struct __thread_desc_link {
+		struct thread_desc * next;
+		struct thread_desc * prev;
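+		// timestamp (rdtscl) recorded when the thread is pushed onto a ready sub-queue;
+		// the sub-queue sentinels reuse it as a "queue is non-empty" marker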
+		unsigned long long ts;
+	};
+
 	struct thread_desc {
 		// Core threading fields
@@ -188,5 +196,5 @@
 		// Link lists fields
 		// instrusive link field for threads
-		struct thread_desc * next;
+		struct __thread_desc_link link;
 
 		struct {
@@ -199,5 +207,5 @@
 	extern "Cforall" {
 		static inline thread_desc *& get_next( thread_desc & this ) {
-			return this.next;
+			return this.link.next;
 		}
 
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 98d6965d977b39416143b988d932952fb731637a)
+++ libcfa/src/concurrency/kernel.cfa	(revision 2a3d44698a8f41614824b5eaaf857999a33d334b)
@@ -187,5 +187,6 @@
 	self_mon.recursion = 1;
 	self_mon_p = &self_mon;
-	next = 0p;
+	link.next = 0p;
+	link.prev = 0p;
 
 	node.next = 0p;
@@ -212,4 +213,5 @@
 	this.name = name;
 	this.cltr = &cltr;
+	id = -1u;
 	terminated{ 0 };
 	do_terminate = false;
@@ -242,7 +244,6 @@
 	this.preemption_rate = preemption_rate;
 	ready_queue{};
-	ready_queue_lock{};
-
-	procs{ __get };
+	ready_lock{};
+
 	idles{ __get };
 	threads{ __get };
@@ -273,5 +274,10 @@
 	__cfaabi_dbg_print_safe("Kernel : core %p starting\n", this);
 
-	doregister(this->cltr, this);
+	// register the processor unless it's the main processor, which is handled in the boot sequence
+	if(this != mainProcessor) {
+		this->id = doregister(this->cltr, this);
+		ready_queue_grow( this->cltr );
+	}
+
 
 	{
@@ -305,9 +311,16 @@
 	}
 
-	unregister(this->cltr, this);
-
 	V( this->terminated );
 
+
+	// unregister the processor unless it's the main processor, which is handled in the boot sequence
+	if(this != mainProcessor) {
+		ready_queue_shrink( this->cltr );
+		unregister(this->cltr, this);
+	}
+
 	__cfaabi_dbg_print_safe("Kernel : core %p terminated\n", this);
+
+	stats_tls_tally(this->cltr);
 }
 
@@ -469,5 +482,5 @@
 	);
 
-	Abort( pthread_attr_setstack( &attr, stack, stacksize ), "pthread_attr_setstack" ); 
+	Abort( pthread_attr_setstack( &attr, stack, stacksize ), "pthread_attr_setstack" );
 
 	Abort( pthread_create( pthread, &attr, start, arg ), "pthread_create" );
@@ -535,12 +548,12 @@
 	verify( ! kernelTLS.preemption_state.enabled );
 
-	verifyf( thrd->next == 0p, "Expected null got %p", thrd->next );
+	verifyf( thrd->link.next == 0p, "Expected null got %p", thrd->link.next );
+
+
+	ready_schedule_lock(thrd->curr_cluster, kernelTLS.this_processor);
+		bool was_empty = push( thrd->curr_cluster, thrd );
+	ready_schedule_unlock(thrd->curr_cluster, kernelTLS.this_processor);
 
 	with( *thrd->curr_cluster ) {
-		lock  ( ready_queue_lock __cfaabi_dbg_ctx2 );
-		bool was_empty = !(ready_queue != 0);
-		append( ready_queue, thrd );
-		unlock( ready_queue_lock );
-
 		if(was_empty) {
 			lock      (proc_list_lock __cfaabi_dbg_ctx2);
@@ -553,5 +566,4 @@
 			wake_fast(idle);
 		}
-
 	}
 
@@ -562,7 +574,9 @@
 thread_desc * nextThread(cluster * this) with( *this ) {
 	verify( ! kernelTLS.preemption_state.enabled );
-	lock( ready_queue_lock __cfaabi_dbg_ctx2 );
-	thread_desc * head = pop_head( ready_queue );
-	unlock( ready_queue_lock );
+
+	ready_schedule_lock(this, kernelTLS.this_processor);
+		thread_desc * head = pop( this );
+	ready_schedule_unlock(this, kernelTLS.this_processor);
+
 	verify( ! kernelTLS.preemption_state.enabled );
 	return head;
@@ -726,4 +740,5 @@
 		pending_preemption = false;
 		kernel_thread = pthread_self();
+		id = -1u;
 
 		runner{ &this };
@@ -735,4 +750,6 @@
 	mainProcessor = (processor *)&storage_mainProcessor;
 	(*mainProcessor){};
+
+	mainProcessor->id = doregister(mainCluster, mainProcessor);
 
 	//initialize the global state variables
@@ -781,12 +798,20 @@
 	kernel_stop_preemption();
 
+	unregister(mainCluster, mainProcessor);
+
 	// Destroy the main processor and its context in reverse order of construction
 	// These were manually constructed so we need manually destroy them
-	^(mainProcessor->runner){};
-	^(mainProcessor){};
+	void ^?{}(processor & this) with( this ) {
+		//don't join the main thread here, that wouldn't make any sense
+		__cfaabi_dbg_print_safe("Kernel : destroyed main processor context %p\n", &runner);
+	}
+
+	^(*mainProcessor){};
 
 	// Final step, destroy the main thread since it is no longer needed
-	// Since we provided a stack to this taxk it will not destroy anything
-	^(mainThread){};
+	// Since we provided a stack to this task it will not destroy anything
+	^(*mainThread){};
+
+	^(*mainCluster){};
 
 	^(__cfa_dbg_global_clusters.list){};
@@ -804,5 +829,4 @@
 	with( *cltr ) {
 		lock      (proc_list_lock __cfaabi_dbg_ctx2);
-		remove    (procs, *this);
 		push_front(idles, *this);
 		unlock    (proc_list_lock);
@@ -818,5 +842,4 @@
 		lock      (proc_list_lock __cfaabi_dbg_ctx2);
 		remove    (idles, *this);
-		push_front(procs, *this);
 		unlock    (proc_list_lock);
 	}
@@ -959,18 +982,4 @@
 }
 
-void doregister( cluster * cltr, processor * proc ) {
-	lock      (cltr->proc_list_lock __cfaabi_dbg_ctx2);
-	cltr->nprocessors += 1;
-	push_front(cltr->procs, *proc);
-	unlock    (cltr->proc_list_lock);
-}
-
-void unregister( cluster * cltr, processor * proc ) {
-	lock  (cltr->proc_list_lock __cfaabi_dbg_ctx2);
-	remove(cltr->procs, *proc );
-	cltr->nprocessors -= 1;
-	unlock(cltr->proc_list_lock);
-}
-
 //-----------------------------------------------------------------------------
 // Debug
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 98d6965d977b39416143b988d932952fb731637a)
+++ libcfa/src/concurrency/kernel.hfa	(revision 2a3d44698a8f41614824b5eaaf857999a33d334b)
@@ -107,4 +107,5 @@
 	// Cluster from which to get threads
 	struct cluster * cltr;
+	unsigned int id;
 
 	// Name of the processor
@@ -161,12 +162,116 @@
 }
 
+
+//-----------------------------------------------------------------------------
+// Cluster Tools
+struct __processor_id;
+
+// Reader-Writer lock protecting the ready-queue
+struct __clusterRWLock_t {
+	// total cachelines allocated
+	unsigned int max;
+
+	// cachelines currently in use
+	volatile unsigned int alloc;
+
+	// cachelines ready to iterate over
+	// (!= alloc when a thread is in the second half of doregister)
+	volatile unsigned int ready;
+
+	// writer lock
+	volatile bool lock;
+
+	// data pointer
+	__processor_id * data;
+};
+
+void  ?{}(__clusterRWLock_t & this);
+void ^?{}(__clusterRWLock_t & this);
+
+// Underlying sub-queues of the ready queue
+struct __attribute__((aligned(128))) __intrusive_ready_queue_t {
+	// spin lock protecting the queue
+	volatile bool lock;
+	unsigned int last_id;
+
+	// anchor for the head and the tail of the queue
+	struct __sentinel_t {
+		// Linked-list fields
+		// intrusive link fields for threads
+		// must be exactly as in thread_desc
+		__thread_desc_link link;
+	} before, after;
+
+	// Optional statistic counters
+	#if !defined(__CFA_NO_SCHED_STATS__)
+		struct __attribute__((aligned(64))) {
+			// difference between number of push and pops
+			ssize_t diff;
+
+			// total number of pushes and pops
+			size_t  push;
+			size_t  pop ;
+		} stat;
+	#endif
+};
+
+void  ?{}(__intrusive_ready_queue_t & this);
+void ^?{}(__intrusive_ready_queue_t & this);
+
+typedef unsigned long long __cfa_readyQ_mask_t;
+
+// enum {
+// 	__cfa_ready_queue_mask_size = (64 - sizeof(size_t)) / sizeof(size_t),
+// 	__cfa_max_ready_queues = __cfa_ready_queue_mask_size * 8 * sizeof(size_t)
+// };
+
+#define __cfa_readyQ_mask_size ((64 - sizeof(size_t)) / sizeof(__cfa_readyQ_mask_t))
+#define __cfa_max_readyQs (__cfa_readyQ_mask_size * 8 * sizeof(__cfa_readyQ_mask_t))
+
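+// Bit i of empty.mask is set while sub-queue i holds at least one thread and
+// empty.count tracks how many sub-queues are currently non-empty, letting
+// pop() bail out quickly when the whole cluster has no ready threads.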
+//TODO adjust cache size to ARCHITECTURE
+struct __attribute__((aligned(128))) __ready_queue_t {
+	struct {
+		volatile size_t count;
+		volatile __cfa_readyQ_mask_t mask[ __cfa_readyQ_mask_size ];
+	} empty;
+
+	struct __attribute__((aligned(64))) {
+		__intrusive_ready_queue_t * volatile data;
+		volatile size_t count;
+	} list;
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		__attribute__((aligned(64))) struct {
+			struct {
+				struct {
+					volatile size_t attempt;
+					volatile size_t success;
+				} push;
+				struct {
+					volatile size_t maskrds;
+					volatile size_t attempt;
+					volatile size_t success;
+				} pop;
+			} pick;
+			struct {
+				volatile size_t value;
+				volatile size_t count;
+			} full;
+		} global_stats;
+
+	#endif
+};
+
+void  ?{}(__ready_queue_t & this);
+void ^?{}(__ready_queue_t & this);
+
 //-----------------------------------------------------------------------------
 // Cluster
 struct cluster {
 	// Ready queue locks
-	__spinlock_t ready_queue_lock;
+	__clusterRWLock_t ready_lock;
 
 	// Ready queue for threads
-	__queue_t(thread_desc) ready_queue;
+	__ready_queue_t ready_queue;
 
 	// Name of the cluster
@@ -178,7 +283,5 @@
 	// List of processors
 	__spinlock_t proc_list_lock;
-	__dllist_t(struct processor) procs;
 	__dllist_t(struct processor) idles;
-	unsigned int nprocessors;
 
 	// List of threads
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 98d6965d977b39416143b988d932952fb731637a)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision 2a3d44698a8f41614824b5eaaf857999a33d334b)
@@ -101,5 +101,5 @@
 //-----------------------------------------------------------------------------
 // Utils
-#define KERNEL_STORAGE(T,X) static char storage_##X[sizeof(T)]
+#define KERNEL_STORAGE(T,X) __attribute((aligned(__alignof__(T)))) static char storage_##X[sizeof(T)]
 
 static inline uint32_t tls_rand() {
@@ -117,6 +117,90 @@
 void unregister( struct cluster * cltr, struct thread_desc & thrd );
 
-void doregister( struct cluster * cltr, struct processor * proc );
-void unregister( struct cluster * cltr, struct processor * proc );
+//=======================================================================
+// Cluster lock API
+//=======================================================================
+struct __attribute__((aligned(64))) __processor_id {
+	processor * volatile handle;
+	volatile bool lock;
+};
+
+// Lock-Free registering/unregistering of processors
+// Register a processor to a given cluster and get its unique id in return
+unsigned doregister( struct cluster * cltr, struct processor * proc );
+
+// Unregister a processor from a given cluster using its id
+void     unregister( struct cluster * cltr, struct processor * proc );
+
+//=======================================================================
+// Reader-writer lock implementation
+// Concurrent with doregister/unregister,
+//    i.e., processors can be registered at any point during or between lock acquire/release
+static inline void __atomic_acquire(volatile bool * ll) {
+	while( __builtin_expect(__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST), false) ) {
+		while(__atomic_load_n(ll, (int)__ATOMIC_RELAXED))
+			asm volatile("pause");
+	}
+	/* paranoid */ verify(*ll);
+}
+
+static inline bool __atomic_try_acquire(volatile bool * ll) {
+	return !__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST);
+}
+
+static inline void __atomic_unlock(volatile bool * ll) {
+	/* paranoid */ verify(*ll);
+	__atomic_store_n(ll, (bool)false, __ATOMIC_RELEASE);
+}
+
+//-----------------------------------------------------------------------
+// Reader side : acquire when using the ready queue to schedule but not
+//  creating/destroying queues
+static inline void ready_schedule_lock( struct cluster * cltr, struct processor * proc) with(cltr->ready_lock) {
+	unsigned iproc = proc->id;
+	/*paranoid*/ verify(data[iproc].handle == proc);
+	/*paranoid*/ verify(iproc < ready);
+
+	// Step 1 : make sure no writer is in the middle of the critical section
+	while(__atomic_load_n(&lock, (int)__ATOMIC_RELAXED))
+		asm volatile("pause");
+
+	// Fence needed because we don't want to start trying to acquire the lock
+	// before we read a false.
+	// Not needed on x86
+	// std::atomic_thread_fence(std::memory_order_seq_cst);
+
+	// Step 2 : acquire our local lock
+	__atomic_acquire( &data[iproc].lock );
+	/*paranoid*/ verify(data[iproc].lock);
+}
+
+static inline void ready_schedule_unlock( struct cluster * cltr, struct processor * proc) with(cltr->ready_lock) {
+	unsigned iproc = proc->id;
+	/*paranoid*/ verify(data[iproc].handle == proc);
+	/*paranoid*/ verify(iproc < ready);
+	/*paranoid*/ verify(data[iproc].lock);
+	__atomic_store_n(&data[iproc].lock, false, __ATOMIC_RELEASE);
+}
+
+//-----------------------------------------------------------------------
+// Writer side : acquire when changing the ready queue, e.g. adding more
+//  queues or removing them.
+uint_fast32_t ready_mutate_lock( struct cluster & cltr );
+
+void ready_mutate_unlock( struct cluster & cltr, uint_fast32_t );
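+
+// The lock is a "one flag per reader" reader-writer lock: a reader only takes
+// its own per-processor flag (after waiting for the writer flag to be clear),
+// while a writer takes the global flag and then every per-processor flag, so
+// the read side scales and the write side pays O(#processors).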
+
+//=======================================================================
+// Ready-Queue API
+
+__attribute__((hot)) bool push(struct cluster * cltr, struct thread_desc * thrd);
+__attribute__((hot)) thread_desc * pop(struct cluster * cltr);
+void ready_queue_grow  (struct cluster * cltr);
+void ready_queue_shrink(struct cluster * cltr);
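+
+// Typical reader-side scheduling path (sketch, mirroring kernel.cfa):
+//   ready_schedule_lock  (cltr, proc);
+//   thread_desc * thrd = pop( cltr );
+//   ready_schedule_unlock(cltr, proc);
+// ready_queue_grow/shrink take the writer side internally via ready_mutate_lock.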
+
+#if !defined(__CFA_NO_STATISTICS__)
+void stats_tls_tally(struct cluster * cltr);
+#else
+static inline void stats_tls_tally(struct cluster * cltr) {}
+#endif
 
 // Local Variables: //
Index: libcfa/src/concurrency/monitor.cfa
===================================================================
--- libcfa/src/concurrency/monitor.cfa	(revision 98d6965d977b39416143b988d932952fb731637a)
+++ libcfa/src/concurrency/monitor.cfa	(revision 2a3d44698a8f41614824b5eaaf857999a33d334b)
@@ -841,5 +841,5 @@
 	for(	thread_desc ** thrd_it = &entry_queue.head;
 		*thrd_it;
-		thrd_it = &(*thrd_it)->next
+		thrd_it = &(*thrd_it)->link.next
 	) {
 		// For each acceptable check if it matches
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision 2a3d44698a8f41614824b5eaaf857999a33d334b)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision 2a3d44698a8f41614824b5eaaf857999a33d334b)
@@ -0,0 +1,804 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2019 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// ready_queue.cfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Mon Nov dd 16:29:18 2019
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#define __cforall_thread__
+
+#include "bits/defs.hfa"
+#include "kernel_private.hfa"
+
+#define _GNU_SOURCE
+#include "stdlib.hfa"
+
+static const size_t cache_line_size = 64;
+
+static inline unsigned __max_processors_fallback() {
+	#ifdef __CFA_MAX_PROCESSORS__
+		return __CFA_MAX_PROCESSORS__;
+	#else
+		// No overridden function, no environment variable, no define
+		// fall back to a magic number
+		return 128;
+	#endif
+}
+
+__attribute__((weak)) unsigned __max_processors() {
+	const char * max_cores_s = getenv("CFA_MAX_PROCESSORS");
+	if(!max_cores_s) {
+		__cfaabi_dbg_print_nolock("No CFA_MAX_PROCESSORS in ENV");
+		return __max_processors_fallback();
+	}
+
+	char * endptr = 0p;
+	long int max_cores_l = strtol(max_cores_s, &endptr, 10);
+	if(max_cores_l < 1 || max_cores_l > 65535) {
+		__cfaabi_dbg_print_nolock("CFA_MAX_PROCESSORS out of range : %ld", max_cores_l);
+		return __max_processors_fallback();
+	}
+	if('\0' != *endptr) {
+		__cfaabi_dbg_print_nolock("CFA_MAX_PROCESSORS not a decimal number : %s", max_cores_s);
+		return __max_processors_fallback();
+	}
+
+	return max_cores_l;
+}
+
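+// Return the position of a pseudo-randomly chosen set bit of 'mask', using
+// 'rnum' as the source of randomness; the chosen rank is rnum % popcount(mask).
+// Without BMI2 this is the branchless "select the bit with rank r" sequence from
+// Bit Twiddling Hacks; with BMI2, PDEP deposits a single bit at that rank.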
+static inline unsigned rand_bit(unsigned rnum, size_t mask) {
+	verify(sizeof(mask) == 8);
+	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
+#if !defined(__BMI2__)
+	uint64_t v = mask;   // Input value to find position with rank r.
+	unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
+	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
+	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
+	unsigned int t;      // Bit count temporary.
+
+	// Do a normal parallel bit count for a 64-bit integer,
+	// but store all intermediate steps.
+	a =  v - ((v >> 1) & ~0UL/3);
+	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
+	c = (b + (b >> 4)) & ~0UL/0x11;
+	d = (c + (c >> 8)) & ~0UL/0x101;
+
+
+	t = (d >> 32) + (d >> 48);
+	// Now do branchless select!
+	s  = 64;
+	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
+	t  = (d >> (s - 16)) & 0xff;
+	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
+	t  = (c >> (s - 8)) & 0xf;
+	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
+	t  = (b >> (s - 4)) & 0x7;
+	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
+	t  = (a >> (s - 2)) & 0x3;
+	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
+	t  = (v >> (s - 1)) & 0x1;
+	s -= ((t - r) & 256) >> 8;
+	return s - 1;
+#else
+	uint64_t picked = _pdep_u64(1ul << bit, mask);
+	return picked ? __builtin_ctzl(picked) : 0;
+#endif
+}
+
+static inline __cfa_readyQ_mask_t readyQ_mask_full        () { return (8 * sizeof(__cfa_readyQ_mask_t)) - 1; }
+static inline __cfa_readyQ_mask_t readyQ_mask_shift_length() { return (8 * sizeof(__cfa_readyQ_mask_t)) - __builtin_clzl(readyQ_mask_full()); }
+
+static inline [__cfa_readyQ_mask_t, __cfa_readyQ_mask_t] extract(__cfa_readyQ_mask_t idx) {
+	__cfa_readyQ_mask_t word = idx >> readyQ_mask_shift_length();
+	__cfa_readyQ_mask_t bit  = idx &  readyQ_mask_full();
+	return [bit, word];
+}
+
+//=======================================================================
+// Cluster wide reader-writer lock
+//=======================================================================
+void  ?{}(__clusterRWLock_t & this) {
+	this.max   = __max_processors();
+	this.alloc = 0;
+	this.ready = 0;
+	this.lock  = false;
+	this.data  = alloc(this.max);
+
+	/*paranoid*/ verify( 0 == (((uintptr_t)(this.data    )) % 64) );
+	/*paranoid*/ verify( 0 == (((uintptr_t)(this.data + 1)) % 64) );
+	/*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.alloc), &this.alloc));
+	/*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.ready), &this.ready));
+
+}
+void ^?{}(__clusterRWLock_t & this) {
+	free(this.data);
+}
+
+void ?{}( __processor_id & this, struct processor * proc ) {
+	this.handle = proc;
+	this.lock   = false;
+}
+
+//=======================================================================
+// Lock-Free registering/unregistering of processors
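+// Registration first scans the published slots for a free entry (handle == 0p)
+// and claims it with a CAS; if none is free it fetch-and-adds 'alloc' to
+// reserve a fresh slot, initializes it, then publishes it by advancing 'ready'
+// in allocation order.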
+unsigned doregister( struct cluster * cltr, struct processor * proc ) with(cltr->ready_lock) {
+	// Step - 1 : check if there is already space in the data
+	uint_fast32_t s = ready;
+
+	// Check among all the ready
+	for(uint_fast32_t i = 0; i < s; i++) {
+		processor * null = 0p; // Re-initialize every iteration since a failed CAS overwrites it
+		if( __atomic_load_n(&data[i].handle, (int)__ATOMIC_RELAXED) == null
+			&& __atomic_compare_exchange_n( &data[i].handle, &null, proc, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+			/*paranoid*/ verify(i < ready);
+			/*paranoid*/ verify(__alignof__(data[i]) == cache_line_size);
+			/*paranoid*/ verify((((uintptr_t)&data[i]) % cache_line_size) == 0);
+			return i;
+		}
+	}
+
+	if(max <= alloc) abort("Trying to create more than %u processors", cltr->ready_lock.max);
+
+	// Step - 2 : F&A to get a new spot in the array.
+	uint_fast32_t n = __atomic_fetch_add(&alloc, 1, __ATOMIC_SEQ_CST);
+	if(max <= n) abort("Trying to create more than %u processors", cltr->ready_lock.max);
+
+	// Step - 3 : Mark space as used and then publish it.
+	__processor_id * storage = (__processor_id *)&data[n];
+	(*storage){ proc };
+	while(true) {
+		unsigned copy = n;
+		if( __atomic_load_n(&ready, __ATOMIC_RELAXED) == n
+			&& __atomic_compare_exchange_n(&ready, &copy, n + 1, true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
+			break;
+		asm volatile("pause");
+	}
+
+	// Return new spot.
+	/*paranoid*/ verify(n < ready);
+	/*paranoid*/ verify(__alignof__(data[n]) == cache_line_size);
+	/*paranoid*/ verify((((uintptr_t)&data[n]) % cache_line_size) == 0);
+	return n;
+}
+
+void unregister( struct cluster * cltr, struct processor * proc ) with(cltr->ready_lock) {
+	unsigned id = proc->id;
+	/*paranoid*/ verify(id < ready);
+	/*paranoid*/ verify(proc == __atomic_load_n(&data[id].handle, __ATOMIC_RELAXED));
+	__atomic_store_n(&data[id].handle, 0p, __ATOMIC_RELEASE);
+}
+
+//-----------------------------------------------------------------------
+// Writer side : acquire when changing the ready queue, e.g. adding more
+//  queues or removing them.
+uint_fast32_t ready_mutate_lock( struct cluster & cltr ) with(cltr.ready_lock) {
+	// Step 1 : lock global lock
+	// It is needed so that processors registering mid critical-section
+	//   cannot simply lock their own lock and enter.
+	__atomic_acquire( &lock );
+
+	// Step 2 : lock per-proc lock
+	// Processors that are currently being registered aren't counted
+	//   but can't be in read_lock or in the critical section.
+	// All other processors are counted
+	uint_fast32_t s = ready;
+	for(uint_fast32_t i = 0; i < s; i++) {
+		__atomic_acquire( &data[i].lock );
+	}
+
+	return s;
+}
+
+void ready_mutate_unlock( struct cluster & cltr, uint_fast32_t last_s ) with(cltr.ready_lock) {
+	// Step 1 : release local locks
+	// This must be done while the global lock is held to avoid
+	//   processors that registered mid critical-section
+	//   racing to lock their local locks and having the writer
+	//   immediately unlock them
+	// Alternative solution : return s in write_lock and pass it to write_unlock
+	for(uint_fast32_t i = 0; i < last_s; i++) {
+		verify(data[i].lock);
+		__atomic_store_n(&data[i].lock, (bool)false, __ATOMIC_RELEASE);
+	}
+
+	// Step 2 : release global lock
+	/*paranoid*/ assert(true == lock);
+	__atomic_store_n(&lock, (bool)false, __ATOMIC_RELEASE);
+}
+
+//=======================================================================
+// Intrusive Queue used by ready queue
+//=======================================================================
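+// The queue uses two sentinel anchors ('before' and 'after') that contain only
+// the link field; subtracting offsetof(thread_desc, link) turns them into fake
+// thread_desc pointers so push/pop can link to the ends of the list as if they
+// were ordinary nodes.
+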
+// Get the head pointer (one before the first element) from the anchor
+static inline thread_desc * head(const __intrusive_ready_queue_t & this) {
+	thread_desc * rhead = (thread_desc *)(
+		(uintptr_t)( &this.before ) - offsetof( thread_desc, link )
+	);
+	/* paranoid */ verify(rhead);
+	return rhead;
+}
+
+// Get the tail pointer (one after the last element) from the anchor
+static inline thread_desc * tail(const __intrusive_ready_queue_t & this) {
+	thread_desc * rtail = (thread_desc *)(
+		(uintptr_t)( &this.after ) - offsetof( thread_desc, link )
+	);
+	/* paranoid */ verify(rtail);
+	return rtail;
+}
+
+// Ctor
+void ?{}( __intrusive_ready_queue_t & this ) {
+	this.lock = false;
+	this.last_id = -1u;
+
+	this.before.link.prev = 0p;
+	this.before.link.next = tail(this);
+	this.before.link.ts   = 0;
+
+	this.after .link.prev = head(this);
+	this.after .link.next = 0p;
+	this.after .link.ts   = 0;
+
+	#if !defined(__CFA_NO_SCHED_STATS__)
+		this.stat.diff = 0;
+		this.stat.push = 0;
+		this.stat.pop  = 0;
+	#endif
+
+	// We add a boat-load of assertions here because the anchor code is very fragile
+	/* paranoid */ verify(((uintptr_t)( head(this) ) + offsetof( thread_desc, link )) == (uintptr_t)(&this.before));
+	/* paranoid */ verify(((uintptr_t)( tail(this) ) + offsetof( thread_desc, link )) == (uintptr_t)(&this.after ));
+	/* paranoid */ verify(head(this)->link.prev == 0p );
+	/* paranoid */ verify(head(this)->link.next == tail(this) );
+	/* paranoid */ verify(tail(this)->link.next == 0p );
+	/* paranoid */ verify(tail(this)->link.prev == head(this) );
+	/* paranoid */ verify(&head(this)->link.prev == &this.before.link.prev );
+	/* paranoid */ verify(&head(this)->link.next == &this.before.link.next );
+	/* paranoid */ verify(&tail(this)->link.prev == &this.after .link.prev );
+	/* paranoid */ verify(&tail(this)->link.next == &this.after .link.next );
+	/* paranoid */ verify(sizeof(__intrusive_ready_queue_t) == 128);
+	/* paranoid */ verify(sizeof(this) == 128);
+	/* paranoid */ verify(__alignof__(__intrusive_ready_queue_t) == 128);
+	/* paranoid */ verify(__alignof__(this) == 128);
+	/* paranoid */ verifyf(((intptr_t)(&this) % 128) == 0, "Expected address to be aligned %p %% 128 == %zd", &this, ((intptr_t)(&this) % 128));
+
+	/* paranoid */ verifyf(readyQ_mask_shift_length() == 6, "%zu", readyQ_mask_shift_length());
+	/* paranoid */ verifyf(readyQ_mask_full()        == 63, "%zu", readyQ_mask_full());
+}
+
+// Dtor is trivial
+void ^?{}( __intrusive_ready_queue_t & this ) {
+	// Make sure the list is empty
+	/* paranoid */ verify(head(this)->link.prev == 0p );
+	/* paranoid */ verify(head(this)->link.next == tail(this) );
+	/* paranoid */ verify(tail(this)->link.next == 0p );
+	/* paranoid */ verify(tail(this)->link.prev == head(this) );
+}
+
+
+
+bool push(__intrusive_ready_queue_t & this, thread_desc * node) {
+	verify(this.lock);
+	verify(node->link.ts != 0);
+	verify(node->link.next == 0p);
+	verify(node->link.prev == 0p);
+
+	if(this.before.link.ts == 0l) {
+		verify(tail(this)->link.next == 0p);
+		verify(tail(this)->link.prev == head(this));
+		verify(head(this)->link.next == tail(this));
+		verify(head(this)->link.prev == 0p);
+	}
+
+	// Get the relevant nodes locally
+	thread_desc * tail = tail(this);
+	thread_desc * prev = tail->link.prev;
+
+	// Do the push
+	node->link.next = tail;
+	node->link.prev = prev;
+	prev->link.next = node;
+	tail->link.prev = node;
+
+	// Update stats
+	#ifndef __CFA_NO_SCHED_STATS__
+		this.stat.diff++;
+		this.stat.push++;
+	#endif
+
+	verify(node->link.next == tail(this));
+
+	// Check if the queue used to be empty
+	if(this.before.link.ts == 0l) {
+		this.before.link.ts = node->link.ts;
+		verify(node->link.prev == head(this));
+		return true;
+	}
+	return false;
+}
+
+[thread_desc *, bool] pop(__intrusive_ready_queue_t & this) {
+	verify(this.lock);
+	verify(this.before.link.ts != 0ul);
+	thread_desc * head = head(this);
+	thread_desc * tail = tail(this);
+
+	thread_desc * node = head->link.next;
+	thread_desc * next = node->link.next;
+	if(node == tail) {
+		verify(false);
+		verify(this.before.link.ts == 0ul);
+		verify(tail(this)->link.next == 0p);
+		verify(tail(this)->link.prev == head(this));
+		verify(head(this)->link.next == tail(this));
+		verify(head(this)->link.prev == 0p);
+		return [0p, false];
+	}
+
+	/* paranoid */ verify(node);
+
+	head->link.next = next;
+	next->link.prev = head;
+
+	#ifndef __CFA_NO_SCHED_STATS__
+		this.stat.diff--;
+		this.stat.pop ++;
+	#endif
+
+	if(next == tail) {
+		this.before.link.ts = 0ul;
+		verify(tail(this)->link.next == 0p);
+		verify(tail(this)->link.prev == head(this));
+		verify(head(this)->link.next == tail(this));
+		verify(head(this)->link.prev == 0p);
+		node->link.[next, prev] = 0p;
+		return [node, true];
+	}
+	else {
+		verify(next->link.ts != 0);
+		this.before.link.ts = next->link.ts;
+		verify(this.before.link.ts != 0);
+		node->link.[next, prev] = 0p;
+		return [node, false];
+	}
+}
+
+static inline unsigned long long ts(__intrusive_ready_queue_t & this) {
+	return this.before.link.ts;
+}
+
+//=======================================================================
+// Cforall ready queue, built from the intrusive sub-queues above
+//=======================================================================
+
+static __attribute__((aligned(128))) thread_local struct {
+	struct {
+		struct {
+			size_t attempt;
+			size_t success;
+		} push;
+		struct {
+			size_t maskrds;
+			size_t attempt;
+			size_t success;
+		} pop;
+	} pick;
+	struct {
+		size_t value;
+		size_t count;
+	} full;
+} tls = {
+	/* pick */{
+		/* push */{ 0, 0 },
+		/* pop  */{ 0, 0, 0 },
+	},
+	/* full */{ 0, 0 }
+};
+
+//-----------------------------------------------------------------------
+
+void ?{}(__ready_queue_t & this) with (this) {
+	empty.count = 0;
+	for( i ; __cfa_readyQ_mask_size ) {
+		empty.mask[i] = 0;
+	}
+
+	list.data = alloc(4);
+	for( i; 4 ) {
+		(list.data[i]){};
+	}
+	list.count = 4;
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		global_stats.pick.push.attempt = 0;
+		global_stats.pick.push.success = 0;
+		global_stats.pick.pop .maskrds = 0;
+		global_stats.pick.pop .attempt = 0;
+		global_stats.pick.pop .success = 0;
+
+		global_stats.full.value = 0;
+		global_stats.full.count = 0;
+	#endif
+}
+
+void ^?{}(__ready_queue_t & this) with (this) {
+	verify( 4  == list .count );
+	verify( 0  == empty.count );
+
+	for( i; 4 ) {
+		^(list.data[i]){};
+	}
+	free(list.data);
+
+
+	#if defined(__CFA_WITH_VERIFY__)
+		for( i ; __cfa_readyQ_mask_size ) {
+			assert( 0 == empty.mask[i] );
+		}
+	#endif
+}
+
+//-----------------------------------------------------------------------
+
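+// Push a thread onto a randomly chosen sub-queue of the cluster. Returns true
+// when the whole ready queue went from empty to non-empty, so the caller knows
+// to wake an idle processor.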
+__attribute__((hot)) bool push(struct cluster * cltr, struct thread_desc * thrd) with (cltr->ready_queue) {
+	thrd->link.ts = rdtscl();
+
+	while(true) {
+		// Pick a random list
+		unsigned i = tls_rand() % list.count;
+
+		#if !defined(__CFA_NO_STATISTICS__)
+			tls.pick.push.attempt++;
+		#endif
+
+		// If we can't lock it retry
+		if( !__atomic_try_acquire( &list.data[i].lock ) ) continue;
+		verify(list.data[i].last_id == -1u);
+		list.data[i].last_id = kernelTLS.this_processor->id;
+
+		__attribute__((unused)) size_t num = __atomic_load_n( &empty.count, __ATOMIC_RELAXED );
+		bool first = false;
+
+		verify( list.data[i].last_id == kernelTLS.this_processor->id );
+		verify( list.data[i].lock );
+		// Actually push it
+		if(push(list.data[i], thrd)) {
+			size_t ret = __atomic_fetch_add( &empty.count, 1z, __ATOMIC_SEQ_CST);
+			first = (ret == 0);
+
+			__cfa_readyQ_mask_t word;
+			__cfa_readyQ_mask_t bit;
+			[bit, word] = extract(i);
+			verifyf((empty.mask[word] & (1ull << bit)) == 0, "Before set %llu:%llu (%u), %llx & %llx", word, bit, i, empty.mask[word], (1ull << bit));
+			__attribute__((unused)) bool ret = bts(&empty.mask[word], bit);
+			verify(!(bool)ret);
+			verifyf((empty.mask[word] & (1ull << bit)) != 0, "After set %llu:%llu (%u), %llx & %llx", word, bit, i, empty.mask[word], (1ull << bit));
+		}
+		verify(empty.count <= (int)list.count);
+		verify( list.data[i].last_id == kernelTLS.this_processor->id );
+		verify( list.data[i].lock );
+
+		// Unlock and return
+		list.data[i].last_id = -1u;
+		__atomic_unlock( &list.data[i].lock );
+
+		#if !defined(__CFA_NO_STATISTICS__)
+			tls.pick.push.success++;
+			tls.full.value += num;
+			tls.full.count += 1;
+		#endif
+		return first;
+	}
+}
+
+//-----------------------------------------------------------------------
+
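+// Try to pop from the better of two randomly chosen sub-queues, the classic
+// "power of two choices" heuristic: the sub-queue whose head has the older
+// timestamp is preferred, approximating global FIFO ordering.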
+static struct thread_desc * try_pop(struct cluster * cltr, unsigned i, unsigned j) with (cltr->ready_queue) {
+	#if !defined(__CFA_NO_STATISTICS__)
+		tls.pick.pop.attempt++;
+	#endif
+
+	// Pick the best list
+	int w = i;
+	if( __builtin_expect(ts(list.data[j]) != 0, true) ) {
+		w = (ts(list.data[i]) < ts(list.data[j])) ? i : j;
+	}
+
+	__intrusive_ready_queue_t & list = list.data[w];
+	// If list looks empty retry
+	if( ts(list) == 0 ) return 0p;
+
+	// If we can't get the lock retry
+	if( !__atomic_try_acquire(&list.lock) ) return 0p;
+	verify(list.last_id == -1u);
+	list.last_id = kernelTLS.this_processor->id;
+
+	verify(list.last_id == kernelTLS.this_processor->id);
+
+	__attribute__((unused)) int num = __atomic_load_n( &empty.count, __ATOMIC_RELAXED );
+
+
+	// If list is empty, unlock and retry
+	if( ts(list) == 0 ) {
+		list.last_id = -1u;
+		__atomic_unlock(&list.lock);
+		return 0p;
+	}
+	{
+		__cfa_readyQ_mask_t word;
+		__cfa_readyQ_mask_t bit;
+		[bit, word] = extract(w);
+		verify((empty.mask[word] & (1ull << bit)) != 0);
+	}
+
+	verify(list.last_id == kernelTLS.this_processor->id);
+	verify(list.lock);
+
+	// Actually pop the list
+	struct thread_desc * thrd;
+	bool emptied;
+	[thrd, emptied] = pop(list);
+	verify(thrd);
+
+	verify(list.last_id == kernelTLS.this_processor->id);
+	verify(list.lock);
+
+	if(emptied) {
+		__atomic_fetch_sub( &empty.count, 1z, __ATOMIC_SEQ_CST);
+
+		__cfa_readyQ_mask_t word;
+		__cfa_readyQ_mask_t bit;
+		[bit, word] = extract(w);
+		verify((empty.mask[word] & (1ull << bit)) != 0);
+		__attribute__((unused)) bool ret = btr(&empty.mask[word], bit);
+		verify(ret);
+		verify((empty.mask[word] & (1ull << bit)) == 0);
+	}
+
+	verify(list.lock);
+
+	// Unlock and return
+	list.last_id = -1u;
+	__atomic_unlock(&list.lock);
+	verify(empty.count >= 0);
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		tls.pick.pop.success++;
+		tls.full.value += num;
+		tls.full.count += 1;
+	#endif
+
+	return thrd;
+}
+
+__attribute__((hot)) thread_desc * pop(struct cluster * cltr) with (cltr->ready_queue) {
+	verify( list.count > 0 );
+	while( __atomic_load_n( &empty.count, __ATOMIC_RELAXED ) != 0) {
+		#if !defined(__CFA_READQ_NO_BITMASK__)
+			tls.pick.pop.maskrds++;
+			unsigned i, j;
+			{
+				#if !defined(__CFA_NO_SCHED_STATS__)
+					tls.pick.pop.maskrds++;
+				#endif
+
+				// Pick two lists at random
+				unsigned num = ((__atomic_load_n( &list.count, __ATOMIC_RELAXED ) - 1) >> 6) + 1;
+
+				unsigned ri = tls_rand();
+				unsigned rj = tls_rand();
+
+				unsigned wdxi = (ri >> 6u) % num;
+				unsigned wdxj = (rj >> 6u) % num;
+
+				size_t maski = __atomic_load_n( &empty.mask[wdxi], __ATOMIC_RELAXED );
+				size_t maskj = __atomic_load_n( &empty.mask[wdxj], __ATOMIC_RELAXED );
+
+				if(maski == 0 && maskj == 0) continue;
+
+				unsigned bi = rand_bit(ri, maski);
+				unsigned bj = rand_bit(rj, maskj);
+
+				verifyf(bi < 64, "%zu %u", maski, bi);
+				verifyf(bj < 64, "%zu %u", maskj, bj);
+
+				i = bi | (wdxi << 6);
+				j = bj | (wdxj << 6);
+
+				verifyf(i < list.count, "%u", wdxi << 6);
+				verifyf(j < list.count, "%u", wdxj << 6);
+			}
+
+			struct thread_desc * thrd = try_pop(cltr, i, j);
+			if(thrd) return thrd;
+		#else
+			// Pick two lists at random
+			int i = tls_rand() % __atomic_load_n( &list.count, __ATOMIC_RELAXED );
+			int j = tls_rand() % __atomic_load_n( &list.count, __ATOMIC_RELAXED );
+
+			struct thread_desc * thrd = try_pop(cltr, i, j);
+			if(thrd) return thrd;
+		#endif
+	}
+
+	return 0p;
+}
+
+//-----------------------------------------------------------------------
+
+static void check( __ready_queue_t & q ) with (q) {
+	#if defined(__CFA_WITH_VERIFY__)
+		{
+			int idx = 0;
+			for( w ; __cfa_readyQ_mask_size ) {
+				for( b ; 8 * sizeof(__cfa_readyQ_mask_t) ) {
+					bool is_empty = idx < list.count ? (ts(list.data[idx]) == 0) : true;
+					bool should_be_empty = 0 == (empty.mask[w] & (1z << b));
+					assertf(should_be_empty == is_empty, "Inconsistent list %d, mask says %d, list is %d", idx, should_be_empty, (bool)is_empty);
+					assert(__cfa_max_readyQs > idx);
+					idx++;
+				}
+			}
+		}
+
+		{
+			for( idx ; list.count ) {
+				__intrusive_ready_queue_t & sl = list.data[idx];
+				assert(!list.data[idx].lock);
+
+				assert(head(sl)->link.prev == 0p );
+				assert(head(sl)->link.next->link.prev == head(sl) );
+				assert(tail(sl)->link.next == 0p );
+				assert(tail(sl)->link.prev->link.next == tail(sl) );
+
+				if(sl.before.link.ts == 0l) {
+					assert(tail(sl)->link.next == 0p);
+					assert(tail(sl)->link.prev == head(sl));
+					assert(head(sl)->link.next == tail(sl));
+					assert(head(sl)->link.prev == 0p);
+				}
+			}
+		}
+	#endif
+}
+
+// Call this function if the intrusive list was moved using memcpy;
+// it fixes the list so that the pointers back to the anchors aren't left
+// dangling
+static inline void fix(__intrusive_ready_queue_t & ll) {
+	// if the list is not empty then follow the pointers
+	// and fix their reverse links
+	if(ll.before.link.ts != 0l) {
+		head(ll)->link.next->link.prev = head(ll);
+		tail(ll)->link.prev->link.next = tail(ll);
+	}
+	// Otherwise just reset the list
+	else {
+		tail(ll)->link.next = 0p;
+		tail(ll)->link.prev = head(ll);
+		head(ll)->link.next = tail(ll);
+		head(ll)->link.prev = 0p;
+	}
+}
+
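+// Grow the array of sub-queues by 4, holding the writer side of the cluster
+// lock for the duration; called when a processor registers with the cluster
+// (ready_queue_shrink below is the mirror operation used on unregister).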
+void ready_queue_grow  (struct cluster * cltr) {
+	uint_fast32_t last_size = ready_mutate_lock( *cltr );
+	check( cltr->ready_queue );
+
+	with( cltr->ready_queue ) {
+		size_t ncount = list.count;
+
+		// Check that we have some space left
+		if(ncount + 4 >= __cfa_max_readyQs) abort("Program attempted to create more than the maximum number of Ready Queues (%zu)", __cfa_max_readyQs);
+
+		ncount += 4;
+
+		// Allocate new array
+		list.data = alloc(list.data, ncount);
+
+		// Fix the moved data
+		for( idx; (size_t)list.count ) {
+			fix(list.data[idx]);
+		}
+
+		// Construct new data
+		for( idx; (size_t)list.count ~ ncount) {
+			(list.data[idx]){};
+		}
+
+		// Update original
+		list.count = ncount;
+		// fields in empty don't need to change
+	}
+
+	// Make sure that everything is consistent
+	check( cltr->ready_queue );
+	ready_mutate_unlock( *cltr, last_size );
+}
+
+void ready_queue_shrink(struct cluster * cltr) {
+	uint_fast32_t last_size = ready_mutate_lock( *cltr );
+	with( cltr->ready_queue ) {
+		size_t ocount = list.count;
+		// Check that we have some space left
+		if(ocount < 8) abort("Program attempted to destroy more Ready Queues than were created");
+
+		list.count -= 4;
+
+		// redistribute old data
+		verify(ocount > list.count);
+		for( idx; (size_t)list.count ~ ocount) {
+			// This is not strictly needed but makes checking invariants much easier
+			bool locked = __atomic_try_acquire(&list.data[idx].lock);
+			verify(locked);
+			while(0 != ts(list.data[idx])) {
+				struct thread_desc * thrd;
+				__attribute__((unused)) bool _;
+				[thrd, _] = pop(list.data[idx]);
+				verify(thrd);
+				push(cltr, thrd);
+			}
+
+			__atomic_unlock(&list.data[idx].lock);
+
+			// TODO print the queue statistics here
+
+			^(list.data[idx]){};
+		}
+
+		// clear the now unused masks
+		{
+			__cfa_readyQ_mask_t fword, fbit, lword, lbit;
+			[fbit, fword] = extract(ocount);
+			[lbit, lword] = extract(list.count);
+
+			// For now assume that all queues were covered by the same bitmask
+			// This is highly probable as long as grow and shrink use groups of 4
+			// exclusively
+			verify(fword == lword);
+			__cfa_readyQ_mask_t clears = ~0;
+
+			for( b ; lbit ~ fbit ) {
+				clears ^= 1ull << b;
+			}
+
+			empty.mask[fword] &= clears;
+		}
+
+		// Allocate new array
+		list.data = alloc(list.data, list.count);
+
+		// Fix the moved data
+		for( idx; (size_t)list.count ) {
+			fix(list.data[idx]);
+		}
+	}
+
+	// Make sure that everything is consistent
+	check( cltr->ready_queue );
+	ready_mutate_unlock( *cltr, last_size );
+}
+
+//-----------------------------------------------------------------------
+
+#if !defined(__CFA_NO_STATISTICS__)
+void stats_tls_tally(struct cluster * cltr) with (cltr->ready_queue) {
+	__atomic_fetch_add( &global_stats.pick.push.attempt, tls.pick.push.attempt, __ATOMIC_SEQ_CST );
+	__atomic_fetch_add( &global_stats.pick.push.success, tls.pick.push.success, __ATOMIC_SEQ_CST );
+	__atomic_fetch_add( &global_stats.pick.pop .maskrds, tls.pick.pop .maskrds, __ATOMIC_SEQ_CST );
+	__atomic_fetch_add( &global_stats.pick.pop .attempt, tls.pick.pop .attempt, __ATOMIC_SEQ_CST );
+	__atomic_fetch_add( &global_stats.pick.pop .success, tls.pick.pop .success, __ATOMIC_SEQ_CST );
+
+	__atomic_fetch_add( &global_stats.full.value, tls.full.value, __ATOMIC_SEQ_CST );
+	__atomic_fetch_add( &global_stats.full.count, tls.full.count, __ATOMIC_SEQ_CST );
+}
+#endif
Index: libcfa/src/concurrency/thread.cfa
===================================================================
--- libcfa/src/concurrency/thread.cfa	(revision 98d6965d977b39416143b988d932952fb731637a)
+++ libcfa/src/concurrency/thread.cfa	(revision 2a3d44698a8f41614824b5eaaf857999a33d334b)
@@ -41,5 +41,6 @@
 	self_mon_p = &self_mon;
 	curr_cluster = &cl;
-	next = 0p;
+	link.next = 0p;
+	link.prev = 0p;
 
 	node.next = 0p;
