Index: libcfa/src/Makefile.am
===================================================================
--- libcfa/src/Makefile.am	(revision 2cbfe9291f0e8e1365d80ff4c80cf2d8025148dd)
+++ libcfa/src/Makefile.am	(revision b7d6a36126af369a09bd3958b2fe17c537999f77)
@@ -48,5 +48,5 @@
 thread_headers_nosrc = concurrency/invoke.h
 thread_headers = concurrency/coroutine.hfa concurrency/thread.hfa concurrency/kernel.hfa concurrency/monitor.hfa concurrency/mutex.hfa
-thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa ${thread_headers:.hfa=.cfa}
+thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa concurrency/ready_queue.cfa ${thread_headers:.hfa=.cfa}
 else
 headers =
Index: libcfa/src/Makefile.in
===================================================================
--- libcfa/src/Makefile.in	(revision 2cbfe9291f0e8e1365d80ff4c80cf2d8025148dd)
+++ libcfa/src/Makefile.in	(revision b7d6a36126af369a09bd3958b2fe17c537999f77)
@@ -165,7 +165,7 @@
 	concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa \
 	concurrency/invoke.c concurrency/preemption.cfa \
-	concurrency/coroutine.cfa concurrency/thread.cfa \
-	concurrency/kernel.cfa concurrency/monitor.cfa \
-	concurrency/mutex.cfa
+	concurrency/ready_queue.cfa concurrency/coroutine.cfa \
+	concurrency/thread.cfa concurrency/kernel.cfa \
+	concurrency/monitor.cfa concurrency/mutex.cfa
 @BUILDLIB_TRUE@am__objects_3 = concurrency/coroutine.lo \
 @BUILDLIB_TRUE@	concurrency/thread.lo concurrency/kernel.lo \
@@ -174,5 +174,6 @@
 @BUILDLIB_TRUE@	concurrency/CtxSwitch-@ARCHITECTURE@.lo \
 @BUILDLIB_TRUE@	concurrency/alarm.lo concurrency/invoke.lo \
-@BUILDLIB_TRUE@	concurrency/preemption.lo $(am__objects_3)
+@BUILDLIB_TRUE@	concurrency/preemption.lo \
+@BUILDLIB_TRUE@	concurrency/ready_queue.lo $(am__objects_3)
 am_libcfathread_la_OBJECTS = $(am__objects_4)
 libcfathread_la_OBJECTS = $(am_libcfathread_la_OBJECTS)
@@ -470,5 +471,5 @@
 @BUILDLIB_FALSE@thread_headers = 
 @BUILDLIB_TRUE@thread_headers = concurrency/coroutine.hfa concurrency/thread.hfa concurrency/kernel.hfa concurrency/monitor.hfa concurrency/mutex.hfa
-@BUILDLIB_TRUE@thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa ${thread_headers:.hfa=.cfa}
+@BUILDLIB_TRUE@thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa concurrency/ready_queue.cfa ${thread_headers:.hfa=.cfa}
 
 #----------------------------------------------------------------------------------------------------------------
@@ -606,4 +607,6 @@
 	concurrency/$(DEPDIR)/$(am__dirstamp)
 concurrency/preemption.lo: concurrency/$(am__dirstamp) \
+	concurrency/$(DEPDIR)/$(am__dirstamp)
+concurrency/ready_queue.lo: concurrency/$(am__dirstamp) \
 	concurrency/$(DEPDIR)/$(am__dirstamp)
 concurrency/coroutine.lo: concurrency/$(am__dirstamp) \
Index: libcfa/src/bits/debug.hfa
===================================================================
--- libcfa/src/bits/debug.hfa	(revision 2cbfe9291f0e8e1365d80ff4c80cf2d8025148dd)
+++ libcfa/src/bits/debug.hfa	(revision b7d6a36126af369a09bd3958b2fe17c537999f77)
@@ -37,4 +37,5 @@
 	#include <stdarg.h>
 	#include <stdio.h>
+	#include <unistd.h>
 
 	extern void __cfaabi_bits_write( int fd, const char buffer[], int len );
@@ -49,13 +50,15 @@
 #endif
 
+// #define __CFA_DEBUG_PRINT__
+
 #ifdef __CFA_DEBUG_PRINT__
 	#define __cfaabi_dbg_write( buffer, len )         __cfaabi_bits_write( STDERR_FILENO, buffer, len )
 	#define __cfaabi_dbg_acquire()                    __cfaabi_bits_acquire()
 	#define __cfaabi_dbg_release()                    __cfaabi_bits_release()
-	#define __cfaabi_dbg_print_safe(...)              __cfaabi_bits_print_safe   (__VA_ARGS__)
-	#define __cfaabi_dbg_print_nolock(...)            __cfaabi_bits_print_nolock (__VA_ARGS__)
-	#define __cfaabi_dbg_print_buffer(...)            __cfaabi_bits_print_buffer (__VA_ARGS__)
-	#define __cfaabi_dbg_print_buffer_decl(...)       char __dbg_text[256]; int __dbg_len = snprintf( __dbg_text, 256, __VA_ARGS__ ); __cfaabi_bits_write( __dbg_text, __dbg_len );
-	#define __cfaabi_dbg_print_buffer_local(...)      __dbg_len = snprintf( __dbg_text, 256, __VA_ARGS__ ); __cfaabi_dbg_write( __dbg_text, __dbg_len );
+	#define __cfaabi_dbg_print_safe(...)              __cfaabi_bits_print_safe  ( STDERR_FILENO, __VA_ARGS__ )
+	#define __cfaabi_dbg_print_nolock(...)            __cfaabi_bits_print_nolock( STDERR_FILENO, __VA_ARGS__ )
+	#define __cfaabi_dbg_print_buffer(...)            __cfaabi_bits_print_buffer( STDERR_FILENO, __VA_ARGS__ )
+	#define __cfaabi_dbg_print_buffer_decl(...)       char __dbg_text[256]; int __dbg_len = snprintf( __dbg_text, 256, __VA_ARGS__ ); __cfaabi_bits_write( STDERR_FILENO, __dbg_text, __dbg_len );
+	#define __cfaabi_dbg_print_buffer_local(...)      __dbg_len = snprintf( __dbg_text, 256, __VA_ARGS__ ); __cfaabi_bits_write( STDERR_FILENO, __dbg_text, __dbg_len );
 #else
 	#define __cfaabi_dbg_write(...)               ((void)0)
Index: libcfa/src/bits/defs.hfa
===================================================================
--- libcfa/src/bits/defs.hfa	(revision 2cbfe9291f0e8e1365d80ff4c80cf2d8025148dd)
+++ libcfa/src/bits/defs.hfa	(revision b7d6a36126af369a09bd3958b2fe17c537999f77)
@@ -54,2 +54,74 @@
     return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
 }
+
+// #define __CFA_NO_BIT_TEST_AND_SET__
+
+#if defined( __i386 )
+static inline bool __atomic_bts(volatile unsigned long int * target, unsigned long int bit ) {
+	#if defined(__CFA_NO_BIT_TEST_AND_SET__)
+        unsigned long int mask = 1ul << bit;
+        unsigned long int ret = __atomic_fetch_or(target, mask, (int)__ATOMIC_RELAXED);
+        return (ret & mask) != 0;
+    #else
+        int result = 0;
+        asm volatile(
+            "LOCK btsl %[bit], %[target]\n\t"
+            : "=@ccc" (result)
+            : [target] "m" (*target), [bit] "r" (bit)
+        );
+        return result != 0;
+    #endif
+}
+
+static inline bool __atomic_btr(volatile unsigned long int * target, unsigned long int bit ) {
+	#if defined(__CFA_NO_BIT_TEST_AND_SET__)
+        unsigned long int mask = 1ul << bit;
+        unsigned long int ret = __atomic_fetch_and(target, ~mask, (int)__ATOMIC_RELAXED);
+        return (ret & mask) != 0;
+	#else
+        int result = 0;
+        asm volatile(
+            "LOCK btrl %[bit], %[target]\n\t"
+            :"=@ccc" (result)
+            : [target] "m" (*target), [bit] "r" (bit)
+        );
+        return result != 0;
+    #endif
+}
+#elif defined( __x86_64 )
+static inline bool __atomic_bts(volatile unsigned long long int * target, unsigned long long int bit ) {
+	#if defined(__CFA_NO_BIT_TEST_AND_SET__)
+        unsigned long long int mask = 1ul << bit;
+        unsigned long long int ret = __atomic_fetch_or(target, mask, (int)__ATOMIC_RELAXED);
+        return (ret & mask) != 0;
+    #else
+        int result = 0;
+        asm volatile(
+            "LOCK btsq %[bit], %[target]\n\t"
+            : "=@ccc" (result)
+            : [target] "m" (*target), [bit] "r" (bit)
+        );
+        return result != 0;
+    #endif
+}
+
+static inline bool __atomic_btr(volatile unsigned long long int * target, unsigned long long int bit ) {
+	#if defined(__CFA_NO_BIT_TEST_AND_SET__)
+        unsigned long long int mask = 1ul << bit;
+        unsigned long long int ret = __atomic_fetch_and(target, ~mask, (int)__ATOMIC_RELAXED);
+        return (ret & mask) != 0;
+	#else
+        int result = 0;
+        asm volatile(
+            "LOCK btrq %[bit], %[target]\n\t"
+            :"=@ccc" (result)
+            : [target] "m" (*target), [bit] "r" (bit)
+        );
+        return result != 0;
+    #endif
+}
+#elif defined( __ARM_ARCH )
+    #error __atomic_bts and __atomic_btr not implemented for arm
+#else
+	#error uknown hardware architecture
+#endif
Index: libcfa/src/concurrency/invoke.h
===================================================================
--- libcfa/src/concurrency/invoke.h	(revision 2cbfe9291f0e8e1365d80ff4c80cf2d8025148dd)
+++ libcfa/src/concurrency/invoke.h	(revision b7d6a36126af369a09bd3958b2fe17c537999f77)
@@ -158,4 +158,12 @@
 	};
 
+	// Link lists fields
+	// instrusive link field for threads
+	struct __thread_desc_link {
+		struct thread_desc * next;
+		struct thread_desc * prev;
+		unsigned long long ts;
+	};
+
 	struct thread_desc {
 		// Core threading fields
@@ -188,5 +196,5 @@
 		// Link lists fields
 		// instrusive link field for threads
-		struct thread_desc * next;
+		struct __thread_desc_link link;
 
 		struct {
@@ -199,5 +207,5 @@
 	extern "Cforall" {
 		static inline thread_desc *& get_next( thread_desc & this ) {
-			return this.next;
+			return this.link.next;
 		}
 
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 2cbfe9291f0e8e1365d80ff4c80cf2d8025148dd)
+++ libcfa/src/concurrency/kernel.cfa	(revision b7d6a36126af369a09bd3958b2fe17c537999f77)
@@ -187,5 +187,6 @@
 	self_mon.recursion = 1;
 	self_mon_p = &self_mon;
-	next = 0p;
+	link.next = 0p;
+	link.prev = 0p;
 
 	node.next = 0p;
@@ -212,4 +213,5 @@
 	this.name = name;
 	this.cltr = &cltr;
+	id = -1u;
 	terminated{ 0 };
 	do_terminate = false;
@@ -242,7 +244,6 @@
 	this.preemption_rate = preemption_rate;
 	ready_queue{};
-	ready_queue_lock{};
-
-	procs{ __get };
+	ready_lock{};
+
 	idles{ __get };
 	threads{ __get };
@@ -273,5 +274,10 @@
 	__cfaabi_dbg_print_safe("Kernel : core %p starting\n", this);
 
-	doregister(this->cltr, this);
+	// register the processor unless it's the main thread which is handled in the boot sequence
+	if(this != mainProcessor) {
+		this->id = doregister(this->cltr, this);
+		ready_queue_grow( this->cltr );
+	}
+
 
 	{
@@ -305,9 +311,16 @@
 	}
 
-	unregister(this->cltr, this);
-
 	V( this->terminated );
 
+
+	// unregister the processor unless it's the main thread which is handled in the boot sequence
+	if(this != mainProcessor) {
+		ready_queue_shrink( this->cltr );
+		unregister(this->cltr, this);
+	}
+
 	__cfaabi_dbg_print_safe("Kernel : core %p terminated\n", this);
+
+	stats_tls_tally(this->cltr);
 }
 
@@ -538,23 +551,22 @@
 	verify( ! kernelTLS.preemption_state.enabled );
 
-	verifyf( thrd->next == 0p, "Expected null got %p", thrd->next );
+	verifyf( thrd->link.next == 0p, "Expected null got %p", thrd->link.next );
+
+
+	ready_schedule_lock(thrd->curr_cluster, kernelTLS.this_processor);
+		bool was_empty = push( thrd->curr_cluster, thrd );
+	ready_schedule_unlock(thrd->curr_cluster, kernelTLS.this_processor);
 
 	with( *thrd->curr_cluster ) {
-		lock  ( ready_queue_lock __cfaabi_dbg_ctx2 );
-		bool was_empty = !(ready_queue != 0);
-		append( ready_queue, thrd );
-		unlock( ready_queue_lock );
-
-		if(was_empty) {
-			lock      (proc_list_lock __cfaabi_dbg_ctx2);
-			if(idles) {
-				wake_fast(idles.head);
-			}
-			unlock    (proc_list_lock);
-		}
-		else if( struct processor * idle = idles.head ) {
-			wake_fast(idle);
-		}
-
+		// if(was_empty) {
+		// 	lock      (proc_list_lock __cfaabi_dbg_ctx2);
+		// 	if(idles) {
+		// 		wake_fast(idles.head);
+		// 	}
+		// 	unlock    (proc_list_lock);
+		// }
+		// else if( struct processor * idle = idles.head ) {
+		// 	wake_fast(idle);
+		// }
 	}
 
@@ -565,7 +577,9 @@
 thread_desc * nextThread(cluster * this) with( *this ) {
 	verify( ! kernelTLS.preemption_state.enabled );
-	lock( ready_queue_lock __cfaabi_dbg_ctx2 );
-	thread_desc * head = pop_head( ready_queue );
-	unlock( ready_queue_lock );
+
+	ready_schedule_lock(this, kernelTLS.this_processor);
+		thread_desc * head = pop( this );
+	ready_schedule_unlock(this, kernelTLS.this_processor);
+
 	verify( ! kernelTLS.preemption_state.enabled );
 	return head;
@@ -729,4 +743,5 @@
 		pending_preemption = false;
 		kernel_thread = pthread_self();
+		id = -1u;
 
 		runner{ &this };
@@ -738,4 +753,6 @@
 	mainProcessor = (processor *)&storage_mainProcessor;
 	(*mainProcessor){};
+
+	mainProcessor->id = doregister(mainCluster, mainProcessor);
 
 	//initialize the global state variables
@@ -784,12 +801,20 @@
 	kernel_stop_preemption();
 
+	unregister(mainCluster, mainProcessor);
+
 	// Destroy the main processor and its context in reverse order of construction
 	// These were manually constructed so we need manually destroy them
-	^(mainProcessor->runner){};
-	^(mainProcessor){};
+	void ^?{}(processor & this) with( this ) {
+		//don't join the main thread here, that wouldn't make any sense
+		__cfaabi_dbg_print_safe("Kernel : destroyed main processor context %p\n", &runner);
+	}
+
+	^(*mainProcessor){};
 
 	// Final step, destroy the main thread since it is no longer needed
-	// Since we provided a stack to this taxk it will not destroy anything
-	^(mainThread){};
+	// Since we provided a stack to this task it will not destroy anything
+	^(*mainThread){};
+
+	^(*mainCluster){};
 
 	^(__cfa_dbg_global_clusters.list){};
@@ -803,25 +828,23 @@
 //=============================================================================================
 static void halt(processor * this) with( *this ) {
-	// verify( ! __atomic_load_n(&do_terminate, __ATOMIC_SEQ_CST) );
-
-	with( *cltr ) {
-		lock      (proc_list_lock __cfaabi_dbg_ctx2);
-		remove    (procs, *this);
-		push_front(idles, *this);
-		unlock    (proc_list_lock);
-	}
-
-	__cfaabi_dbg_print_safe("Kernel : Processor %p ready to sleep\n", this);
-
-	wait( idleLock );
-
-	__cfaabi_dbg_print_safe("Kernel : Processor %p woke up and ready to run\n", this);
-
-	with( *cltr ) {
-		lock      (proc_list_lock __cfaabi_dbg_ctx2);
-		remove    (idles, *this);
-		push_front(procs, *this);
-		unlock    (proc_list_lock);
-	}
+	// // verify( ! __atomic_load_n(&do_terminate, __ATOMIC_SEQ_CST) );
+
+	// with( *cltr ) {
+	// 	lock      (proc_list_lock __cfaabi_dbg_ctx2);
+	// 	push_front(idles, *this);
+	// 	unlock    (proc_list_lock);
+	// }
+
+	// __cfaabi_dbg_print_safe("Kernel : Processor %p ready to sleep\n", this);
+
+	// wait( idleLock );
+
+	// __cfaabi_dbg_print_safe("Kernel : Processor %p woke up and ready to run\n", this);
+
+	// with( *cltr ) {
+	// 	lock      (proc_list_lock __cfaabi_dbg_ctx2);
+	// 	remove    (idles, *this);
+	// 	unlock    (proc_list_lock);
+	// }
 }
 
@@ -963,18 +986,4 @@
 }
 
-void doregister( cluster * cltr, processor * proc ) {
-	lock      (cltr->proc_list_lock __cfaabi_dbg_ctx2);
-	cltr->nprocessors += 1;
-	push_front(cltr->procs, *proc);
-	unlock    (cltr->proc_list_lock);
-}
-
-void unregister( cluster * cltr, processor * proc ) {
-	lock  (cltr->proc_list_lock __cfaabi_dbg_ctx2);
-	remove(cltr->procs, *proc );
-	cltr->nprocessors -= 1;
-	unlock(cltr->proc_list_lock);
-}
-
 //-----------------------------------------------------------------------------
 // Debug
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 2cbfe9291f0e8e1365d80ff4c80cf2d8025148dd)
+++ libcfa/src/concurrency/kernel.hfa	(revision b7d6a36126af369a09bd3958b2fe17c537999f77)
@@ -107,4 +107,5 @@
 	// Cluster from which to get threads
 	struct cluster * cltr;
+	unsigned int id;
 
 	// Name of the processor
@@ -161,12 +162,162 @@
 }
 
+
+//-----------------------------------------------------------------------------
+// Cluster Tools
+
+// Cells use by the reader writer lock
+// while not generic it only relies on a opaque pointer
+struct __processor_id;
+
+// Reader-Writer lock protecting the ready-queue
+// while this lock is mostly generic some aspects
+// have been hard-coded to for the ready-queue for
+// simplicity and performance
+struct __clusterRWLock_t {
+	// total cachelines allocated
+	unsigned int max;
+
+	// cachelines currently in use
+	volatile unsigned int alloc;
+
+	// cachelines ready to itereate over
+	// (!= to alloc when thread is in second half of doregister)
+	volatile unsigned int ready;
+
+	// writer lock
+	volatile bool lock;
+
+	// data pointer
+	__processor_id * data;
+};
+
+void  ?{}(__clusterRWLock_t & this);
+void ^?{}(__clusterRWLock_t & this);
+
+// Intrusives lanes which are used by the relaxed ready queue
+struct __attribute__((aligned(128))) __intrusive_lane_t {
+	// spin lock protecting the queue
+	volatile bool lock;
+
+	// anchor for the head and the tail of the queue
+	struct __sentinel_t {
+		// Link lists fields
+		// instrusive link field for threads
+		// must be exactly as in thread_desc
+		__thread_desc_link link;
+	} before, after;
+
+#if defined(__CFA_WITH_VERIFY__)
+	// id of last processor to acquire the lock
+	// needed only to check for mutual exclusion violations
+	unsigned int last_id;
+
+	// number of items on this list
+	// needed only to check for deadlocks
+	unsigned int count;
+#endif
+
+	// Optional statistic counters
+	#if !defined(__CFA_NO_SCHED_STATS__)
+		struct __attribute__((aligned(64))) {
+			// difference between number of push and pops
+			ssize_t diff;
+
+			// total number of pushes and pops
+			size_t  push;
+			size_t  pop ;
+		} stat;
+	#endif
+};
+
+void  ?{}(__intrusive_lane_t & this);
+void ^?{}(__intrusive_lane_t & this);
+
+typedef unsigned long long __cfa_readyQ_mask_t;
+
+// enum {
+// 	__cfa_ready_queue_mask_size = (64 - sizeof(size_t)) / sizeof(size_t),
+// 	__cfa_max_ready_queues = __cfa_ready_queue_mask_size * 8 * sizeof(size_t)
+// };
+
+#define __cfa_lane_mask_size ((64 - sizeof(size_t)) / sizeof(__cfa_readyQ_mask_t))
+#define __cfa_max_lanes (__cfa_lane_mask_size * 8 * sizeof(__cfa_readyQ_mask_t))
+
+//TODO adjust cache size to ARCHITECTURE
+// Structure holding the relaxed ready queue
+struct __attribute__((aligned(128))) __ready_queue_t {
+	// Data tracking how many/which lanes are used
+	// Aligned to 128 for cache locality
+	struct {
+		// number of non-empty lanes
+		volatile size_t count;
+
+		// bit mask, set bits indentify which lanes are non-empty
+		volatile __cfa_readyQ_mask_t mask[ __cfa_lane_mask_size ];
+	} used;
+
+	// Data tracking the actual lanes
+	// On a seperate cacheline from the used struct since
+	// used can change on each push/pop but this data
+	// only changes on shrink/grow
+	struct __attribute__((aligned(64))) {
+		// Arary of lanes
+		__intrusive_lane_t * volatile data;
+
+		// Number of lanes (empty or not)
+		volatile size_t count;
+	} lanes;
+
+	// Statistics
+	#if !defined(__CFA_NO_STATISTICS__)
+		__attribute__((aligned(64))) struct {
+			struct {
+				// Push statistic
+				struct {
+					// number of attemps at pushing something
+					volatile size_t attempt;
+
+					// number of successes at pushing
+					volatile size_t success;
+				} push;
+
+				// Pop statistic
+				struct {
+					// number of reads of the mask
+					// picking an empty __cfa_readyQ_mask_t counts here
+					// but not as an attempt
+					volatile size_t maskrds;
+
+					// number of attemps at poping something
+					volatile size_t attempt;
+
+					// number of successes at poping
+					volatile size_t success;
+				} pop;
+			} pick;
+
+			// stats on the "used" struct of the queue
+			// tracks average number of queues that are not empty
+			// when pushing / poping
+			struct {
+				volatile size_t value;
+				volatile size_t count;
+			} used;
+		} global_stats;
+
+	#endif
+};
+
+void  ?{}(__ready_queue_t & this);
+void ^?{}(__ready_queue_t & this);
+
 //-----------------------------------------------------------------------------
 // Cluster
 struct cluster {
 	// Ready queue locks
-	__spinlock_t ready_queue_lock;
+	__clusterRWLock_t ready_lock;
 
 	// Ready queue for threads
-	__queue_t(thread_desc) ready_queue;
+	__ready_queue_t ready_queue;
 
 	// Name of the cluster
@@ -178,7 +329,5 @@
 	// List of processors
 	__spinlock_t proc_list_lock;
-	__dllist_t(struct processor) procs;
 	__dllist_t(struct processor) idles;
-	unsigned int nprocessors;
 
 	// List of threads
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 2cbfe9291f0e8e1365d80ff4c80cf2d8025148dd)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision b7d6a36126af369a09bd3958b2fe17c537999f77)
@@ -100,5 +100,5 @@
 //-----------------------------------------------------------------------------
 // Utils
-#define KERNEL_STORAGE(T,X) static char storage_##X[sizeof(T)]
+#define KERNEL_STORAGE(T,X) __attribute((aligned(__alignof__(T)))) static char storage_##X[sizeof(T)]
 
 static inline uint32_t tls_rand() {
@@ -116,6 +116,110 @@
 void unregister( struct cluster * cltr, struct thread_desc & thrd );
 
-void doregister( struct cluster * cltr, struct processor * proc );
-void unregister( struct cluster * cltr, struct processor * proc );
+//=======================================================================
+// Cluster lock API
+//=======================================================================
+struct __attribute__((aligned(64))) __processor_id {
+	processor * volatile handle;
+	volatile bool lock;
+};
+
+// Lock-Free registering/unregistering of threads
+// Register a processor to a given cluster and get its unique id in return
+unsigned doregister( struct cluster * cltr, struct processor * proc );
+
+// Unregister a processor from a given cluster using its id, getting back the original pointer
+void     unregister( struct cluster * cltr, struct processor * proc );
+
+//=======================================================================
+// Reader-writer lock implementation
+// Concurrent with doregister/unregister,
+//    i.e., threads can be added at any point during or between the entry/exit
+
+//-----------------------------------------------------------------------
+// simple spinlock underlying the RWLock
+// Blocking acquire
+static inline void __atomic_acquire(volatile bool * ll) {
+	while( __builtin_expect(__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST), false) ) {
+		while(__atomic_load_n(ll, (int)__ATOMIC_RELAXED))
+			asm volatile("pause");
+	}
+	/* paranoid */ verify(*ll);
+}
+
+// Non-Blocking acquire
+static inline bool __atomic_try_acquire(volatile bool * ll) {
+	return !__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST);
+}
+
+// Release
+static inline void __atomic_unlock(volatile bool * ll) {
+	/* paranoid */ verify(*ll);
+	__atomic_store_n(ll, (bool)false, __ATOMIC_RELEASE);
+}
+
+//-----------------------------------------------------------------------
+// Reader side : acquire when using the ready queue to schedule but not
+//  creating/destroying queues
+static inline void ready_schedule_lock( struct cluster * cltr, struct processor * proc) with(cltr->ready_lock) {
+	unsigned iproc = proc->id;
+	/*paranoid*/ verify(data[iproc].handle == proc);
+	/*paranoid*/ verify(iproc < ready);
+
+	// Step 1 : make sure no writer are in the middle of the critical section
+	while(__atomic_load_n(&lock, (int)__ATOMIC_RELAXED))
+		asm volatile("pause");
+
+	// Fence needed because we don't want to start trying to acquire the lock
+	// before we read a false.
+	// Not needed on x86
+	// std::atomic_thread_fence(std::memory_order_seq_cst);
+
+	// Step 2 : acquire our local lock
+	__atomic_acquire( &data[iproc].lock );
+	/*paranoid*/ verify(data[iproc].lock);
+}
+
+static inline void ready_schedule_unlock( struct cluster * cltr, struct processor * proc) with(cltr->ready_lock) {
+	unsigned iproc = proc->id;
+	/*paranoid*/ verify(data[iproc].handle == proc);
+	/*paranoid*/ verify(iproc < ready);
+	/*paranoid*/ verify(data[iproc].lock);
+	__atomic_unlock(&data[iproc].lock);
+}
+
+//-----------------------------------------------------------------------
+// Writer side : acquire when changing the ready queue, e.g. adding more
+//  queues or removing them.
+uint_fast32_t ready_mutate_lock( struct cluster & cltr );
+
+void ready_mutate_unlock( struct cluster & cltr, uint_fast32_t /* value returned by lock */ );
+
+//=======================================================================
+// Ready-Queue API
+//-----------------------------------------------------------------------
+// push thread onto a ready queue for a cluster
+// returns true if the list was previously empty, false otherwise
+__attribute__((hot)) bool push(struct cluster * cltr, struct thread_desc * thrd);
+
+//-----------------------------------------------------------------------
+// pop thread from the ready queue of a cluster
+// returns 0p if empty
+__attribute__((hot)) thread_desc * pop(struct cluster * cltr);
+
+//-----------------------------------------------------------------------
+// Increase the width of the ready queue (number of lanes) by 4
+void ready_queue_grow  (struct cluster * cltr);
+
+//-----------------------------------------------------------------------
+// Decrease the width of the ready queue (number of lanes) by 4
+void ready_queue_shrink(struct cluster * cltr);
+
+//-----------------------------------------------------------------------
+// Statics call at the end of each thread to register statistics
+#if !defined(__CFA_NO_STATISTICS__)
+void stats_tls_tally(struct cluster * cltr);
+#else
+static inline void stats_tls_tally(struct cluster * cltr) {}
+#endif
 
 // Local Variables: //
Index: libcfa/src/concurrency/monitor.cfa
===================================================================
--- libcfa/src/concurrency/monitor.cfa	(revision 2cbfe9291f0e8e1365d80ff4c80cf2d8025148dd)
+++ libcfa/src/concurrency/monitor.cfa	(revision b7d6a36126af369a09bd3958b2fe17c537999f77)
@@ -817,5 +817,5 @@
 	}
 
-	__cfaabi_dbg_print_safe( "Kernel :  Runing %i (%p)\n", ready2run, ready2run ? node->waiting_thread : 0p );
+	__cfaabi_dbg_print_safe( "Kernel :  Runing %i (%p)\n", ready2run, ready2run ? (thread_desc*)node->waiting_thread : (thread_desc*)0p );
 	return ready2run ? node->waiting_thread : 0p;
 }
@@ -842,5 +842,5 @@
 	for(	thread_desc ** thrd_it = &entry_queue.head;
 		*thrd_it;
-		thrd_it = &(*thrd_it)->next
+		thrd_it = &(*thrd_it)->link.next
 	) {
 		// For each acceptable check if it matches
Index: libcfa/src/concurrency/preemption.cfa
===================================================================
--- libcfa/src/concurrency/preemption.cfa	(revision 2cbfe9291f0e8e1365d80ff4c80cf2d8025148dd)
+++ libcfa/src/concurrency/preemption.cfa	(revision b7d6a36126af369a09bd3958b2fe17c537999f77)
@@ -120,5 +120,5 @@
 	// If there are still alarms pending, reset the timer
 	if( alarms->head ) {
-		__cfaabi_dbg_print_buffer_decl( " KERNEL: @%ju(%ju) resetting alarm to %ju.\n", currtime.tv, __kernel_get_time().tv, (alarms->head->alarm - currtime).tv);
+		// __cfaabi_dbg_print_buffer_decl( " KERNEL: @%ju(%ju) resetting alarm to %ju.\n", currtime.tv, __kernel_get_time().tv, (alarms->head->alarm - currtime).tv);
 		Duration delta = alarms->head->alarm - currtime;
 		Duration caped = max(delta, 50`us);
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision b7d6a36126af369a09bd3958b2fe17c537999f77)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision b7d6a36126af369a09bd3958b2fe17c537999f77)
@@ -0,0 +1,979 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2019 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// ready_queue.cfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Mon Nov dd 16:29:18 2019
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#define __cforall_thread__
+
+#include "bits/defs.hfa"
+#include "kernel_private.hfa"
+
+#define _GNU_SOURCE
+#include "stdlib.hfa"
+
+static const size_t cache_line_size = 64;
+
+// No overriden function, no environment variable, no define
+// fall back to a magic number
+#ifndef __CFA_MAX_PROCESSORS__
+	#define __CFA_MAX_PROCESSORS__ 128
+#endif
+
+// returns the maximum number of processors the RWLock support
+__attribute__((weak)) unsigned __max_processors() {
+	const char * max_cores_s = getenv("CFA_MAX_PROCESSORS");
+	if(!max_cores_s) {
+		__cfaabi_dbg_print_nolock("No CFA_MAX_PROCESSORS in ENV");
+		return __CFA_MAX_PROCESSORS__;
+	}
+
+	char * endptr = 0p;
+	long int max_cores_l = strtol(max_cores_s, &endptr, 10);
+	if(max_cores_l < 1 || max_cores_l > 65535) {
+		__cfaabi_dbg_print_nolock("CFA_MAX_PROCESSORS out of range : %ld", max_cores_l);
+		return __CFA_MAX_PROCESSORS__;
+	}
+	if('\0' != *endptr) {
+		__cfaabi_dbg_print_nolock("CFA_MAX_PROCESSORS not a decimal number : %s", max_cores_s);
+		return __CFA_MAX_PROCESSORS__;
+	}
+
+	return max_cores_l;
+}
+
+// Picks a random 1 bit in 'mask' according to random number 'rnum'.
+static inline unsigned rand_bit(unsigned rnum, __cfa_readyQ_mask_t mask) {
+#if defined( __i386 )
+	static_assert(sizeof(mask) == 4);
+	unsigned bit = mask ? rnum % __builtin_popcount(mask) : 0;
+	#if !defined(__BMI2__)
+		#error rand_bit not implemented for non __BMI2__ i386
+	#else
+		uint32_t picked = _pdep_u32(1ul << bit, mask);
+		return picked ? __builtin_ctz(picked) : 0;
+	#endif
+#elif defined( __x86_64 )
+	static_assert(sizeof(mask) == 8);
+	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
+	#if !defined(__BMI2__)
+		uint64_t v = mask;   // Input value to find position with rank r.
+		unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
+		unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
+		uint64_t a, b, c, d; // Intermediate temporaries for bit count.
+		unsigned int t;      // Bit count temporary.
+
+		// Do a normal parallel bit count for a 64-bit integer,
+		// but store all intermediate steps.
+		a =  v - ((v >> 1) & ~0UL/3);
+		b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
+		c = (b + (b >> 4)) & ~0UL/0x11;
+		d = (c + (c >> 8)) & ~0UL/0x101;
+
+
+		t = (d >> 32) + (d >> 48);
+		// Now do branchless select!
+		s  = 64;
+		s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
+		t  = (d >> (s - 16)) & 0xff;
+		s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
+		t  = (c >> (s - 8)) & 0xf;
+		s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
+		t  = (b >> (s - 4)) & 0x7;
+		s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
+		t  = (a >> (s - 2)) & 0x3;
+		s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
+		t  = (v >> (s - 1)) & 0x1;
+		s -= ((t - r) & 256) >> 8;
+		return s - 1;
+	#else
+		uint64_t picked = _pdep_u64(1ul << bit, mask);
+		return picked ? __builtin_ctzl(picked) : 0;
+	#endif
+#elif defined( __ARM_ARCH )
+	#error rand_bit not implemented for arm
+#else
+	#error uknown hardware architecture
+#endif
+}
+
+
+//-----------------------------------------------------------------------------
+// Helpers used by extract
+// (_mask_bitsidx() & X) returns a bit index valid for a __cfa_readyQ_mask_t, where X is any integer
+static inline __cfa_readyQ_mask_t _mask_bitsidx () { return (8 * sizeof(__cfa_readyQ_mask_t)) - 1; }
+
+// (X >> _mask_shiftidx()) retuns an index into an array of __cfa_readyQ_mask_t
+static inline __cfa_readyQ_mask_t _mask_shiftidx() { return (8 * sizeof(__cfa_readyQ_mask_t)) - __builtin_clzl(_mask_bitsidx()); }
+
+
+// Assuming a large bit mask represented as an array of __cfa_readyQ_mask_t
+// Given an index into the large mask, returns the bit index and which __cfa_readyQ_mask_t index in the array
+static inline [__cfa_readyQ_mask_t, __cfa_readyQ_mask_t] extract(__cfa_readyQ_mask_t idx) {
+	__cfa_readyQ_mask_t word = idx >> _mask_bitsidx();
+	__cfa_readyQ_mask_t bit  = idx &  _mask_shiftidx();
+	return [bit, word];
+}
+
+//=======================================================================
+// Cluster wide reader-writer lock
+//=======================================================================
+void  ?{}(__clusterRWLock_t & this) {
+	this.max   = __max_processors();
+	this.alloc = 0;
+	this.ready = 0;
+	this.lock  = false;
+	this.data  = alloc(this.max);
+
+	/*paranoid*/ verify( 0 == (((uintptr_t)(this.data    )) % 64) );
+	/*paranoid*/ verify( 0 == (((uintptr_t)(this.data + 1)) % 64) );
+	/*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.alloc), &this.alloc));
+	/*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.ready), &this.ready));
+
+}
+void ^?{}(__clusterRWLock_t & this) {
+	free(this.data);
+}
+
+void ?{}( __processor_id & this, struct processor * proc ) {
+	this.handle = proc;
+	this.lock   = false;
+}
+
+//=======================================================================
+// Lock-Free registering/unregistering of threads
+unsigned doregister( struct cluster * cltr, struct processor * proc ) with(cltr->ready_lock) {
+	// Step - 1 : check if there is already space in the data
+	uint_fast32_t s = ready;
+
+	// Check among all the ready
+	for(uint_fast32_t i = 0; i < s; i++) {
+		processor * null = 0p; // Re-write every loop since compare thrashes it
+		if( __atomic_load_n(&data[i].handle, (int)__ATOMIC_RELAXED) == null
+			&& __atomic_compare_exchange_n( &data[i].handle, &null, proc, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+			/*paranoid*/ verify(i < ready);
+			/*paranoid*/ verify(__alignof__(data[i]) == cache_line_size);
+			/*paranoid*/ verify((((uintptr_t)&data[i]) % cache_line_size) == 0);
+			return i;
+		}
+	}
+
+	if(max <= alloc) abort("Trying to create more than %ud processors", cltr->ready_lock.max);
+
+	// Step - 2 : F&A to get a new spot in the array.
+	uint_fast32_t n = __atomic_fetch_add(&alloc, 1, __ATOMIC_SEQ_CST);
+	if(max <= n) abort("Trying to create more than %ud processors", cltr->ready_lock.max);
+
+	// Step - 3 : Mark space as used and then publish it.
+	__processor_id * storage = (__processor_id *)&data[n];
+	(*storage){ proc };
+	while(true) {
+		unsigned copy = n;
+		if( __atomic_load_n(&ready, __ATOMIC_RELAXED) == n
+			&& __atomic_compare_exchange_n(&ready, &copy, n + 1, true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
+			break;
+		asm volatile("pause");
+	}
+
+	// Return new spot.
+	/*paranoid*/ verify(n < ready);
+	/*paranoid*/ verify(__alignof__(data[n]) == cache_line_size);
+	/*paranoid*/ verify((((uintptr_t)&data[n]) % cache_line_size) == 0);
+	return n;
+}
+
+void unregister( struct cluster * cltr, struct processor * proc ) with(cltr->ready_lock) {
+	unsigned id = proc->id;
+	/*paranoid*/ verify(id < ready);
+	/*paranoid*/ verify(proc == __atomic_load_n(&data[id].handle, __ATOMIC_RELAXED));
+	__atomic_store_n(&data[id].handle, 0p, __ATOMIC_RELEASE);
+}
+
+//-----------------------------------------------------------------------
+// Writer side : acquire when changing the ready queue, e.g. adding more
+//  queues or removing them.
+uint_fast32_t ready_mutate_lock( struct cluster & cltr ) with(cltr.ready_lock) {
+	// Step 1 : lock global lock
+	// It is needed to avoid processors that register mid Critical-Section
+	//   to simply lock their own lock and enter.
+	__atomic_acquire( &lock );
+
+	// Step 2 : lock per-proc lock
+	// Processors that are currently being registered aren't counted
+	//   but can't be in read_lock or in the critical section.
+	// All other processors are counted
+	uint_fast32_t s = ready;
+	for(uint_fast32_t i = 0; i < s; i++) {
+		__atomic_acquire( &data[i].lock );
+	}
+
+	return s;
+}
+
+void ready_mutate_unlock( struct cluster & cltr, uint_fast32_t last_s ) with(cltr.ready_lock) {
+	// Step 1 : release local locks
+	// This must be done while the global lock is held to avoid
+	//   threads that where created mid critical section
+	//   to race to lock their local locks and have the writer
+	//   immidiately unlock them
+	// Alternative solution : return s in write_lock and pass it to write_unlock
+	for(uint_fast32_t i = 0; i < last_s; i++) {
+		verify(data[i].lock);
+		__atomic_store_n(&data[i].lock, (bool)false, __ATOMIC_RELEASE);
+	}
+
+	// Step 2 : release global lock
+	/*paranoid*/ assert(true == lock);
+	__atomic_store_n(&lock, (bool)false, __ATOMIC_RELEASE);
+}
+
+//=======================================================================
+// Intrusive Queue used by ready queue
+//=======================================================================
+// Get the head pointer (one before the first element) from the anchor
+static inline thread_desc * head(const __intrusive_lane_t & this) {
+	thread_desc * rhead = (thread_desc *)(
+		(uintptr_t)( &this.before ) - offsetof( thread_desc, link )
+	);
+	/* paranoid */ verify(rhead);
+	return rhead;
+}
+
+// Get the tail pointer (one after the last element) from the anchor
+static inline thread_desc * tail(const __intrusive_lane_t & this) {
+	thread_desc * rtail = (thread_desc *)(
+		(uintptr_t)( &this.after ) - offsetof( thread_desc, link )
+	);
+	/* paranoid */ verify(rtail);
+	return rtail;
+}
+
+// Ctor
+void ?{}( __intrusive_lane_t & this ) {
+	this.lock = false;
+	this.last_id = -1u;
+	this.count = 0u;
+
+	this.before.link.prev = 0p;
+	this.before.link.next = tail(this);
+	this.before.link.ts   = 0;
+
+	this.after .link.prev = head(this);
+	this.after .link.next = 0p;
+	this.after .link.ts   = 0;
+
+	#if !defined(__CFA_NO_SCHED_STATS__)
+		this.stat.diff = 0;
+		this.stat.push = 0;
+		this.stat.pop  = 0;
+	#endif
+
+	// We add a boat-load of assertions here because the anchor code is very fragile
+	/* paranoid */ verify(((uintptr_t)( head(this) ) + offsetof( thread_desc, link )) == (uintptr_t)(&this.before));
+	/* paranoid */ verify(((uintptr_t)( tail(this) ) + offsetof( thread_desc, link )) == (uintptr_t)(&this.after ));
+	/* paranoid */ verify(head(this)->link.prev == 0p );
+	/* paranoid */ verify(head(this)->link.next == tail(this) );
+	/* paranoid */ verify(tail(this)->link.next == 0p );
+	/* paranoid */ verify(tail(this)->link.prev == head(this) );
+	/* paranoid */ verify(&head(this)->link.prev == &this.before.link.prev );
+	/* paranoid */ verify(&head(this)->link.next == &this.before.link.next );
+	/* paranoid */ verify(&tail(this)->link.prev == &this.after .link.prev );
+	/* paranoid */ verify(&tail(this)->link.next == &this.after .link.next );
+	/* paranoid */ verify(sizeof(__intrusive_lane_t) == 128);
+	/* paranoid */ verify(sizeof(this) == 128);
+	/* paranoid */ verify(__alignof__(__intrusive_lane_t) == 128);
+	/* paranoid */ verify(__alignof__(this) == 128);
+	/* paranoid */ verifyf(((intptr_t)(&this) % 128) == 0, "Expected address to be aligned %p %% 128 == %zd", &this, ((intptr_t)(&this) % 128));
+
+	/* paranoid */ verifyf(_mask_shiftidx() == 6 , "%zu", _mask_shiftidx());
+	/* paranoid */ verifyf(_mask_bitsidx () == 63, "%zu", _mask_bitsidx());
+}
+
+// Dtor is trivial
+void ^?{}( __intrusive_lane_t & this ) {
+	// Make sure the list is empty
+	/* paranoid */ verify(head(this)->link.prev == 0p );
+	/* paranoid */ verify(head(this)->link.next == tail(this) );
+	/* paranoid */ verify(tail(this)->link.next == 0p );
+	/* paranoid */ verify(tail(this)->link.prev == head(this) );
+	/* paranoid */ verify(this.count == 0u );
+}
+
+// Push a thread onto this lane
+// returns true of lane was empty before push, false otherwise
+bool push(__intrusive_lane_t & this, thread_desc * node) {
+	#if defined(__CFA_WITH_VERIFY__)
+		/* paranoid */ verify(this.lock);
+		/* paranoid */ verify(node->link.ts != 0);
+		/* paranoid */ verify(node->link.next == 0p);
+		/* paranoid */ verify(node->link.prev == 0p);
+
+		this.count++;
+
+		if(this.before.link.ts == 0l) {
+			/* paranoid */ verify(tail(this)->link.next == 0p);
+			/* paranoid */ verify(tail(this)->link.prev == head(this));
+			/* paranoid */ verify(head(this)->link.next == tail(this));
+			/* paranoid */ verify(head(this)->link.prev == 0p);
+		}
+	#endif
+
+	// Get the relevant nodes locally
+	thread_desc * tail = tail(this);
+	thread_desc * prev = tail->link.prev;
+
+	// Do the push
+	node->link.next = tail;
+	node->link.prev = prev;
+	prev->link.next = node;
+	tail->link.prev = node;
+
+	// Update stats
+	#if !defined(__CFA_NO_SCHED_STATS__)
+		this.stat.diff++;
+		this.stat.push++;
+	#endif
+
+	verify(node->link.next == tail(this));
+
+	// Check if the queue used to be empty
+	if(this.before.link.ts == 0l) {
+		this.before.link.ts = node->link.ts;
+		/* paranoid */ verify(node->link.prev == head(this));
+		return true;
+	}
+	return false;
+}
+
+// Pop a thread from this lane (must be non-empty)
+// returns popped
+// returns true of lane was empty before push, false otherwise
+[thread_desc *, bool] pop(__intrusive_lane_t & this) {
+	/* paranoid */ verify(this.lock);
+	/* paranoid */ verify(this.before.link.ts != 0ul);
+
+	// Get anchors locally
+	thread_desc * head = head(this);
+	thread_desc * tail = tail(this);
+
+	// Get the relevant nodes locally
+	thread_desc * node = head->link.next;
+	thread_desc * next = node->link.next;
+
+	#if defined(__CFA_WITH_VERIFY__)
+		this.count--;
+		/* paranoid */ verify(node != tail);
+		/* paranoid */ verify(node);
+	#endif
+
+	// Do the pop
+	head->link.next = next;
+	next->link.prev = head;
+	node->link.[next, prev] = 0p;
+
+	// Update head time stamp
+	this.before.link.ts = next->link.ts;
+
+	// Update stats
+	#ifndef __CFA_NO_SCHED_STATS__
+		this.stat.diff--;
+		this.stat.pop ++;
+	#endif
+
+	// Check if we emptied list and return accordingly
+	if(next == tail) {
+		/* paranoid */ verify(this.before.link.ts == 0);
+		/* paranoid */ verify(tail(this)->link.next == 0p);
+		/* paranoid */ verify(tail(this)->link.prev == head(this));
+		/* paranoid */ verify(head(this)->link.next == tail(this));
+		/* paranoid */ verify(head(this)->link.prev == 0p);
+		return [node, true];
+	}
+	else {
+		/* paranoid */ verify(next->link.ts != 0);
+		/* paranoid */ verify(this.before.link.ts != 0);
+		return [node, false];
+	}
+}
+
+// Check whether or not list is empty
+static inline bool is_empty(__intrusive_lane_t & this) {
+	verify( (this.before.link.ts == 0) == (this.count == 0) );
+	return this.before.link.ts == 0;
+}
+
+// Return the timestamp
+static inline unsigned long long ts(__intrusive_lane_t & this) {
+	verify( this.before.link.ts == this.before.link.next->link.ts );
+	return this.before.link.ts;
+}
+
+//=======================================================================
+// Cforall Reqdy Queue used by ready queue
+//=======================================================================
+
+// Thread local mirror of ready queue statistics
+#if !defined(__CFA_NO_STATISTICS__)
+static __attribute__((aligned(128))) thread_local struct {
+	struct {
+		struct {
+			size_t attempt;
+			size_t success;
+		} push;
+		struct {
+			size_t maskrds;
+			size_t attempt;
+			size_t success;
+		} pop;
+	} pick;
+	struct {
+		size_t value;
+		size_t count;
+	} used;
+} tls = {
+	/* pick */{
+		/* push */{ 0, 0 },
+		/* pop  */{ 0, 0, 0 },
+	},
+	/* used */{ 0, 0 }
+};
+#endif
+
+//-----------------------------------------------------------------------
+
+void ?{}(__ready_queue_t & this) with (this) {
+	used.count = 0;
+	for( i ; __cfa_lane_mask_size ) {
+		used.mask[i] = 0;
+	}
+
+	lanes.data = alloc(4);
+	for( i; 4 ) {
+		(lanes.data[i]){};
+	}
+	lanes.count = 4;
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		global_stats.pick.push.attempt = 0;
+		global_stats.pick.push.success = 0;
+		global_stats.pick.pop .maskrds = 0;
+		global_stats.pick.pop .attempt = 0;
+		global_stats.pick.pop .success = 0;
+
+		global_stats.used.value = 0;
+		global_stats.used.count = 0;
+	#endif
+}
+
+void ^?{}(__ready_queue_t & this) with (this) {
+	verify( 4  == lanes.count );
+	verify( 0  == used .count );
+
+	for( i; 4 ) {
+		^(lanes.data[i]){};
+	}
+	free(lanes.data);
+
+
+	#if defined(__CFA_WITH_VERIFY__)
+		for( i ; __cfa_lane_mask_size ) {
+			assert( 0 == used.mask[i] );
+		}
+	#endif
+}
+
+//-----------------------------------------------------------------------
+enum mask_strictness {
+	STRICT,
+	NOCHECK
+};
+
+// Set a given bit in the bit mask array
+// strictness determines of the bit had to be cleared before
+static inline void mask_set(__cfa_readyQ_mask_t * mask, unsigned index, mask_strictness strict) {
+	// Extract the array and bit indexes
+	__cfa_readyQ_mask_t word;
+	__cfa_readyQ_mask_t bit;
+	[bit, word] = extract(index);
+
+	// Conditional check
+	verifyf(
+		strict == STRICT && // Conditional check if it was expected to be cleared
+		((mask[word] & (1ull << bit)) == 0),
+		"Before set %llu:%llu (%u), %llx & %llx", word, bit, index, mask[word], (1ull << bit)
+	);
+
+	// Atomically set the bit
+	__attribute__((unused)) bool ret = __atomic_bts(&mask[word], bit);
+
+	// Conditional check
+	verifyf(
+		strict == STRICT && // Conditional check if it was expected to be cleared
+		!ret,
+		"Bit was not set but bts returned true"
+	);
+
+	// Unconditional check
+	verifyf(
+		(mask[word] & (1ull << bit)) != 0,
+		"After set %llu:%llu (%u), %llx & %llx", word, bit, index, mask[word], (1ull << bit)
+	);
+}
+
+static inline void mask_clear(__cfa_readyQ_mask_t * mask, unsigned index, mask_strictness strict) {
+	// Extract the array and bit indexes
+	__cfa_readyQ_mask_t word;
+	__cfa_readyQ_mask_t bit;
+	[bit, word] = extract(index);
+
+	// Conditional check
+	verifyf(
+		strict == STRICT && // Conditional check if it was expected to be set
+		((mask[word] & (1ull << bit)) != 0),
+		"Before clear %llu:%llu (%u), %llx & %llx", word, bit, index, mask[word], (1ull << bit)
+	);
+
+	// Atomically clear the bit
+	__attribute__((unused)) bool ret = __atomic_btr(&mask[word], bit);
+
+	// Conditional check
+	verifyf(
+		strict == STRICT && // Conditional check if it was expected to be cleared
+		ret,
+		"Bit was set but btr returned false"
+	);
+
+	// Unconditional check
+	verifyf(
+		(mask[word] & (1ull << bit)) == 0,
+		"After clear %llu:%llu (%u), %llx & %llx", word, bit, index, mask[word], (1ull << bit)
+	);
+}
+
+//-----------------------------------------------------------------------
+__attribute__((hot)) bool push(struct cluster * cltr, struct thread_desc * thrd) with (cltr->ready_queue) {
+	// write timestamp
+	thrd->link.ts = rdtscl();
+
+	// Try to pick a lane and lock it
+	unsigned i;
+	do {
+		// Pick the index of a lane
+		unsigned i = tls_rand() % lanes.count;
+
+		#if !defined(__CFA_NO_STATISTICS__)
+			tls.pick.push.attempt++;
+		#endif
+
+		// If we can't lock it retry
+	} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+
+	#if defined(__CFA_WITH_VERIFY__)
+		/* paranoid */ verify(lanes.data[i].last_id == -1u);
+		/* paranoid */ lanes.data[i].last_id = kernelTLS.this_processor->id;
+	#endif
+
+	__attribute__((unused)) size_t num = __atomic_load_n( &used.count, __ATOMIC_RELAXED );
+	bool first = false;
+
+	// Actually push it
+	bool lane_first = push(lanes.data[i], thrd);
+
+	// If this lane used to be empty we need to do more
+	if(lane_first) {
+		// Update the global count
+		size_t ret = __atomic_fetch_add( &used.count, 1z, __ATOMIC_SEQ_CST);
+
+		// Check if the entire quue used to be empty
+		first = (ret == 0);
+
+		// Update the bit mask
+		mask_set((__cfa_readyQ_mask_t *)used.mask, i, STRICT);
+	}
+
+	#if defined(__CFA_WITH_VERIFY__)
+		/* paranoid */ verifyf( used.count <= lanes.count, "Non-empty count (%zu) exceeds actual count (%zu)\n", used.count, lanes.count );
+		/* paranoid */ verifyf( lanes.data[i].last_id == kernelTLS.this_processor->id, "Expected last processor to lock queue %u to be %u, was %u\n", i, lanes.data[i].last_id, kernelTLS.this_processor->id );
+		/* paranoid */ verifyf( lanes.data[i].lock, "List %u is not locked\n", i );
+		/* paranoid */ lanes.data[i].last_id = -1u;
+	#endif
+
+	// Unlock and return
+	__atomic_unlock( &lanes.data[i].lock );
+
+	// Update statistics
+	#if !defined(__CFA_NO_STATISTICS__)
+		tls.pick.push.success++;
+		tls.used.value += num;
+		tls.used.count += 1;
+	#endif
+
+	// return whether or not the list was empty before this push
+	return first;
+}
+
+//-----------------------------------------------------------------------
+// Given 2 indexes, pick the list with the oldest push an try to pop from it
+static struct thread_desc * try_pop(struct cluster * cltr, unsigned i, unsigned j) with (cltr->ready_queue) {
+	#if !defined(__CFA_NO_STATISTICS__)
+		tls.pick.pop.attempt++;
+	#endif
+
+	// Pick the bet list
+	int w = i;
+	if( __builtin_expect(!is_empty(lanes.data[j]), true) ) {
+		w = (ts(lanes.data[i]) < ts(lanes.data[j])) ? i : j;
+	}
+
+	// Get relevant elements locally
+	__intrusive_lane_t & lane = lanes.data[w];
+
+	// If list looks empty retry
+	if( is_empty(lane) ) return 0p;
+
+	// If we can't get the lock retry
+	if( !__atomic_try_acquire(&lane.lock) ) return 0p;
+
+	#if defined(__CFA_WITH_VERIFY__)
+		/* paranoid */ verify(lane.last_id == -1u);
+		/* paranoid */ lane.last_id = kernelTLS.this_processor->id;
+	#endif
+
+
+	// If list is empty, unlock and retry
+	if( is_empty(lane) ) {
+		#if defined(__CFA_WITH_VERIFY__)
+			/* paranoid */ verify(lane.last_id == kernelTLS.this_processor->id);
+			/* paranoid */ lane.last_id = -1u;
+		#endif
+
+		__atomic_unlock(&lane.lock);
+		return 0p;
+	}
+
+	// Actually pop the list
+	struct thread_desc * thrd;
+	bool emptied;
+	[thrd, emptied] = pop(lane);
+
+	/* paranoid */ verify(thrd);
+	/* paranoid */ verify(lane.last_id == kernelTLS.this_processor->id);
+	/* paranoid */ verify(lane.lock);
+
+	// If this was the last element in the lane
+	if(emptied) {
+		// Update the global count
+		__atomic_fetch_sub( &used.count, 1z, __ATOMIC_SEQ_CST);
+
+		// Update the bit mask
+		mask_clear((__cfa_readyQ_mask_t *)used.mask, w, STRICT);
+	}
+
+	#if defined(__CFA_WITH_VERIFY__)
+		/* paranoid */ verify(lane.last_id == kernelTLS.this_processor->id);
+		/* paranoid */ lane.last_id = -1u;
+	#endif
+
+	// For statistics, check the count before we release the lock
+	#if !defined(__CFA_NO_STATISTICS__)
+		int num = __atomic_load_n( &used.count, __ATOMIC_RELAXED );
+	#endif
+
+	// Unlock and return
+	__atomic_unlock(&lane.lock);
+
+	// Update statistics
+	#if !defined(__CFA_NO_STATISTICS__)
+		tls.pick.pop.success++;
+		tls.used.value += num;
+		tls.used.count += 1;
+	#endif
+
+	// return the popped thread
+	return thrd;
+}
+
+// Pop from the ready queue from a given cluster
+__attribute__((hot)) thread_desc * pop(struct cluster * cltr) with (cltr->ready_queue) {
+	/* paranoid */ verify( lanes.count > 0 );
+
+	// As long as the list is not empty, try finding a lane that isn't empty and pop from it
+	while( __atomic_load_n( &used.count, __ATOMIC_RELAXED ) != 0) {
+		#if !defined(__CFA_READQ_NO_BITMASK__)
+			// If using bit masks
+			#if !defined(__CFA_NO_SCHED_STATS__)
+				tls.pick.pop.maskrds++;
+			#endif
+
+			// Pick two lists at random
+			unsigned ri = tls_rand();
+			unsigned rj = tls_rand();
+
+			// Find which __cfa_readyQ_mask_t the two lists belong
+			unsigned num = ((__atomic_load_n( &lanes.count, __ATOMIC_RELAXED ) - 1) >> 6) + 1;
+			unsigned wdxi = (ri >> 6u) % num;
+			unsigned wdxj = (rj >> 6u) % num;
+
+			// Get the actual __cfa_readyQ_mask_t
+			size_t maski = __atomic_load_n( &used.mask[wdxi], __ATOMIC_RELAXED );
+			size_t maskj = __atomic_load_n( &used.mask[wdxj], __ATOMIC_RELAXED );
+
+			// If both of these masks are empty, retry
+			if(maski == 0 && maskj == 0) continue;
+
+			// Pick one of the non-zero bits in the masks and get the bit indexes
+			unsigned bi = rand_bit(ri, maski);
+			unsigned bj = rand_bit(rj, maskj);
+
+			// some checks
+			/* paranoid */ verifyf(bi < 64, "%zu %u", maski, bi);
+			/* paranoid */ verifyf(bj < 64, "%zu %u", maskj, bj);
+
+			// get the general list index
+			unsigned i = bi | (wdxi << 6);
+			unsigned j = bj | (wdxj << 6);
+
+			// some more checks
+			/* paranoid */ verifyf(i < lanes.count, "%u", wdxi << 6);
+			/* paranoid */ verifyf(j < lanes.count, "%u", wdxj << 6);
+
+			// try popping from the 2 picked lists
+			struct thread_desc * thrd = try_pop(cltr, i, j);
+			if(thrd) return thrd;
+		#else
+			// Pick two lists at random
+			int i = tls_rand() % __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+			int j = tls_rand() % __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+
+			// try popping from the 2 picked lists
+			struct thread_desc * thrd = try_pop(cltr, i, j);
+			if(thrd) return thrd;
+		#endif
+	}
+
+	// All lanes where empty return 0p
+	return 0p;
+}
+
+//-----------------------------------------------------------------------
+
+static void check( __ready_queue_t & q ) with (q) {
+	#if defined(__CFA_WITH_VERIFY__)
+		{
+			int idx = 0;
+			for( w ; __cfa_lane_mask_size ) {
+				for( b ; 8 * sizeof(__cfa_readyQ_mask_t) ) {
+					bool is_empty = idx < lanes.count ? (ts(lanes.data[idx]) == 0) : true;
+					bool should_be_empty = 0 == (used.mask[w] & (1z << b));
+					assertf(should_be_empty == is_empty, "Inconsistent list %d, mask expect : %d, actual is got %d", idx, should_be_empty, (bool)is_empty);
+					assert(__cfa_max_lanes > idx);
+					idx++;
+				}
+			}
+		}
+
+		{
+			for( idx ; lanes.count ) {
+				__intrusive_lane_t & sl = lanes.data[idx];
+				assert(!lanes.data[idx].lock);
+
+				assert(head(sl)->link.prev == 0p );
+				assert(head(sl)->link.next->link.prev == head(sl) );
+				assert(tail(sl)->link.next == 0p );
+				assert(tail(sl)->link.prev->link.next == tail(sl) );
+
+				if(sl.before.link.ts == 0l) {
+					assert(tail(sl)->link.next == 0p);
+					assert(tail(sl)->link.prev == head(sl));
+					assert(head(sl)->link.next == tail(sl));
+					assert(head(sl)->link.prev == 0p);
+				}
+			}
+		}
+	#endif
+}
+
+// Call this function of the intrusive list was moved using memcpy
+// fixes the list so that the pointers back to anchors aren't left dangling
+static inline void fix(__intrusive_lane_t & ll) {
+	// if the list is not empty then follow he pointer and fix its reverse
+	if(!is_empty(ll)) {
+		head(ll)->link.next->link.prev = head(ll);
+		tail(ll)->link.prev->link.next = tail(ll);
+	}
+	// Otherwise just reset the list
+	else {
+		verify(tail(ll)->link.next == 0p);
+		tail(ll)->link.prev = head(ll);
+		head(ll)->link.next = tail(ll);
+		verify(head(ll)->link.prev == 0p);
+	}
+}
+
+// Grow the ready queue
+void ready_queue_grow  (struct cluster * cltr) {
+	// Lock the RWlock so no-one pushes/pops while we are changing the queue
+	uint_fast32_t last_size = ready_mutate_lock( *cltr );
+
+	__cfaabi_dbg_print_safe("Kernel : Growing ready queue\n");
+
+	// Make sure that everything is consistent
+	/* paranoid */ check( cltr->ready_queue );
+
+	// grow the ready queue
+	with( cltr->ready_queue ) {
+		size_t ncount = lanes.count;
+
+		// Check that we have some space left
+		if(ncount + 4 >= __cfa_max_lanes) abort("Program attempted to create more than maximum number of Ready Queues (%zu)", __cfa_max_lanes);
+
+		// increase count
+		ncount += 4;
+
+		// Allocate new array (uses realloc and memcpies the data)
+		lanes.data = alloc(lanes.data, ncount);
+
+		// Fix the moved data
+		for( idx; (size_t)lanes.count ) {
+			fix(lanes.data[idx]);
+		}
+
+		// Construct new data
+		for( idx; (size_t)lanes.count ~ ncount) {
+			(lanes.data[idx]){};
+		}
+
+		// Update original
+		lanes.count = ncount;
+
+		// fields in 'used' don't need to change when growing
+	}
+
+	// Make sure that everything is consistent
+	/* paranoid */ check( cltr->ready_queue );
+
+	__cfaabi_dbg_print_safe("Kernel : Growing ready queue done\n");
+
+	// Unlock the RWlock
+	ready_mutate_unlock( *cltr, last_size );
+}
+
+// Shrink the ready queue
+void ready_queue_shrink(struct cluster * cltr) {
+	// Lock the RWlock so no-one pushes/pops while we are changing the queue
+	uint_fast32_t last_size = ready_mutate_lock( *cltr );
+
+	__cfaabi_dbg_print_safe("Kernel : Shrinking ready queue\n");
+
+	// Make sure that everything is consistent
+	/* paranoid */ check( cltr->ready_queue );
+
+	with( cltr->ready_queue ) {
+		// Make sure that the total thread count stays the same
+		#if defined(__CFA_WITH_VERIFY__)
+			size_t nthreads = 0;
+			for( idx; (size_t)lanes.count ) {
+				nthreads += lanes.data[idx].count;
+			}
+		#endif
+
+		size_t ocount = lanes.count;
+		// Check that we have some space left
+		if(ocount < 8) abort("Program attempted to destroy more Ready Queues than were created");
+
+		// reduce the actual count so push doesn't use the old queues
+		lanes.count -= 4;
+		verify(ocount > lanes.count);
+
+		// for printing count the number of displaced threads
+		#if defined(__CFA_DEBUG_PRINT__)
+			__attribute__((unused)) size_t displaced = 0;
+		#endif
+
+		// redistribute old data
+		for( idx; (size_t)lanes.count ~ ocount) {
+			// Lock is not strictly needed but makes checking invariants much easier
+			bool locked = __atomic_try_acquire(&lanes.data[idx].lock);
+			verify(locked);
+
+			// As long as we can pop from this lane to push the threads somewhere else in the queue
+			while(!is_empty(lanes.data[idx])) {
+				struct thread_desc * thrd;
+				__attribute__((unused)) bool _;
+				[thrd, _] = pop(lanes.data[idx]);
+
+				push(cltr, thrd);
+
+				// for printing count the number of displaced threads
+				#if defined(__CFA_DEBUG_PRINT__)
+					displaced++;
+				#endif
+			}
+
+			mask_clear((__cfa_readyQ_mask_t *)used.mask, idx, NOCHECK);
+
+			// Unlock the lane
+			__atomic_unlock(&lanes.data[idx].lock);
+
+			// TODO print the queue statistics here
+
+			^(lanes.data[idx]){};
+		}
+
+		__cfaabi_dbg_print_safe("Kernel : Shrinking ready queue displaced %zu threads\n", displaced);
+
+		// recompute the used.count instead of maintaining it
+		used.count = 0;
+		for( i ; __cfa_lane_mask_size ) {
+			used.count += __builtin_popcountl(used.mask[i]);
+		}
+
+		// Allocate new array (uses realloc and memcpies the data)
+		lanes.data = alloc(lanes.data, lanes.count);
+
+		// Fix the moved data
+		for( idx; (size_t)lanes.count ) {
+			fix(lanes.data[idx]);
+		}
+
+		// Make sure that the total thread count stayed the same
+		#if defined(__CFA_WITH_VERIFY__)
+			for( idx; (size_t)lanes.count ) {
+				nthreads -= lanes.data[idx].count;
+			}
+			verifyf(nthreads == 0, "Shrinking changed number of threads");
+		#endif
+	}
+
+	// Make sure that everything is consistent
+	/* paranoid */ check( cltr->ready_queue );
+
+	__cfaabi_dbg_print_safe("Kernel : Shrinking ready queue done\n");
+
+	// Unlock the RWlock
+	ready_mutate_unlock( *cltr, last_size );
+}
+
+//-----------------------------------------------------------------------
+
+#if !defined(__CFA_NO_STATISTICS__)
+void stats_tls_tally(struct cluster * cltr) with (cltr->ready_queue) {
+	__atomic_fetch_add( &global_stats.pick.push.attempt, tls.pick.push.attempt, __ATOMIC_SEQ_CST );
+	__atomic_fetch_add( &global_stats.pick.push.success, tls.pick.push.success, __ATOMIC_SEQ_CST );
+	__atomic_fetch_add( &global_stats.pick.pop .maskrds, tls.pick.pop .maskrds, __ATOMIC_SEQ_CST );
+	__atomic_fetch_add( &global_stats.pick.pop .attempt, tls.pick.pop .attempt, __ATOMIC_SEQ_CST );
+	__atomic_fetch_add( &global_stats.pick.pop .success, tls.pick.pop .success, __ATOMIC_SEQ_CST );
+
+	__atomic_fetch_add( &global_stats.used.value, tls.used.value, __ATOMIC_SEQ_CST );
+	__atomic_fetch_add( &global_stats.used.count, tls.used.count, __ATOMIC_SEQ_CST );
+}
+#endif
Index: libcfa/src/concurrency/thread.cfa
===================================================================
--- libcfa/src/concurrency/thread.cfa	(revision 2cbfe9291f0e8e1365d80ff4c80cf2d8025148dd)
+++ libcfa/src/concurrency/thread.cfa	(revision b7d6a36126af369a09bd3958b2fe17c537999f77)
@@ -41,5 +41,6 @@
 	self_mon_p = &self_mon;
 	curr_cluster = &cl;
-	next = 0p;
+	link.next = 0p;
+	link.prev = 0p;
 
 	node.next = 0p;
Index: libcfa/src/stdhdr/assert.h
===================================================================
--- libcfa/src/stdhdr/assert.h	(revision 2cbfe9291f0e8e1365d80ff4c80cf2d8025148dd)
+++ libcfa/src/stdhdr/assert.h	(revision b7d6a36126af369a09bd3958b2fe17c537999f77)
@@ -33,8 +33,10 @@
 	#define verify(x) assert(x)
 	#define verifyf(x, ...) assertf(x, __VA_ARGS__)
+	#define verifyfail(...)
 	#define __CFA_WITH_VERIFY__
 #else
 	#define verify(x)
 	#define verifyf(x, ...)
+	#define verifyfail(...)
 #endif