Index: libcfa/src/Makefile.am
===================================================================
--- libcfa/src/Makefile.am	(revision 78cdb060aebec67f15c7fd9fe672626963efe0b5)
+++ libcfa/src/Makefile.am	(revision 0f9ceacb0d69af179f9dba9f14b304e5fa6560d7)
@@ -48,5 +48,5 @@
 thread_headers_nosrc = concurrency/invoke.h
 thread_headers = concurrency/coroutine.hfa concurrency/thread.hfa concurrency/kernel.hfa concurrency/monitor.hfa concurrency/mutex.hfa
-thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa ${thread_headers:.hfa=.cfa}
+thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa concurrency/ready_queue.cfa ${thread_headers:.hfa=.cfa}
 else
 headers =
Index: libcfa/src/Makefile.in
===================================================================
--- libcfa/src/Makefile.in	(revision 78cdb060aebec67f15c7fd9fe672626963efe0b5)
+++ libcfa/src/Makefile.in	(revision 0f9ceacb0d69af179f9dba9f14b304e5fa6560d7)
@@ -165,7 +165,7 @@
 	concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa \
 	concurrency/invoke.c concurrency/preemption.cfa \
-	concurrency/coroutine.cfa concurrency/thread.cfa \
-	concurrency/kernel.cfa concurrency/monitor.cfa \
-	concurrency/mutex.cfa
+	concurrency/ready_queue.cfa concurrency/coroutine.cfa \
+	concurrency/thread.cfa concurrency/kernel.cfa \
+	concurrency/monitor.cfa concurrency/mutex.cfa
 @BUILDLIB_TRUE@am__objects_3 = concurrency/coroutine.lo \
 @BUILDLIB_TRUE@	concurrency/thread.lo concurrency/kernel.lo \
@@ -174,5 +174,6 @@
 @BUILDLIB_TRUE@	concurrency/CtxSwitch-@ARCHITECTURE@.lo \
 @BUILDLIB_TRUE@	concurrency/alarm.lo concurrency/invoke.lo \
-@BUILDLIB_TRUE@	concurrency/preemption.lo $(am__objects_3)
+@BUILDLIB_TRUE@	concurrency/preemption.lo \
+@BUILDLIB_TRUE@	concurrency/ready_queue.lo $(am__objects_3)
 am_libcfathread_la_OBJECTS = $(am__objects_4)
 libcfathread_la_OBJECTS = $(am_libcfathread_la_OBJECTS)
@@ -463,5 +464,5 @@
 @BUILDLIB_FALSE@thread_headers = 
 @BUILDLIB_TRUE@thread_headers = concurrency/coroutine.hfa concurrency/thread.hfa concurrency/kernel.hfa concurrency/monitor.hfa concurrency/mutex.hfa
-@BUILDLIB_TRUE@thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa ${thread_headers:.hfa=.cfa}
+@BUILDLIB_TRUE@thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa concurrency/ready_queue.cfa ${thread_headers:.hfa=.cfa}
 
 #----------------------------------------------------------------------------------------------------------------
@@ -599,4 +600,6 @@
 	concurrency/$(DEPDIR)/$(am__dirstamp)
 concurrency/preemption.lo: concurrency/$(am__dirstamp) \
+	concurrency/$(DEPDIR)/$(am__dirstamp)
+concurrency/ready_queue.lo: concurrency/$(am__dirstamp) \
 	concurrency/$(DEPDIR)/$(am__dirstamp)
 concurrency/coroutine.lo: concurrency/$(am__dirstamp) \
Index: libcfa/src/bits/defs.hfa
===================================================================
--- libcfa/src/bits/defs.hfa	(revision 78cdb060aebec67f15c7fd9fe672626963efe0b5)
+++ libcfa/src/bits/defs.hfa	(revision 0f9ceacb0d69af179f9dba9f14b304e5fa6560d7)
@@ -53,2 +53,36 @@
     return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
 }
+
+#define __CFA_NO_BIT_TEST_AND_SET__
+
+static inline bool bts(volatile unsigned long long int * target, unsigned long long int bit ) {
+	#if defined(__CFA_NO_BIT_TEST_AND_SET__)
+        unsigned long long int mask = 1ull << bit; // 1ull: shifting 1ul by bit >= 32 is UB where long is 32-bit
+        unsigned long long int ret = __atomic_fetch_or(target, mask, (int)__ATOMIC_RELAXED);
+        return (ret & mask) != 0;
+    #else
+        int result = 0;
+        asm volatile(
+            "LOCK btsq %[bit], %[target]\n\t"
+            : "=@ccc" (result), [target] "+m" (*target) // "+m": btsq reads AND writes *target
+            : [bit] "r" (bit)
+        );
+        return result != 0;
+    #endif
+}
+
+static inline bool btr(volatile unsigned long long int * target, unsigned long long int bit ) {
+	#if defined(__CFA_NO_BIT_TEST_AND_SET__)
+        unsigned long long int mask = 1ull << bit; // 1ull: shifting 1ul by bit >= 32 is UB where long is 32-bit
+        unsigned long long int ret = __atomic_fetch_and(target, ~mask, (int)__ATOMIC_RELAXED);
+        return (ret & mask) != 0;
+	#else
+        int result = 0;
+        asm volatile(
+            "LOCK btrq %[bit], %[target]\n\t"
+            : "=@ccc" (result), [target] "+m" (*target) // "+m": btrq reads AND writes *target
+            : [bit] "r" (bit)
+        );
+        return result != 0;
+    #endif
+}
Index: libcfa/src/concurrency/invoke.h
===================================================================
--- libcfa/src/concurrency/invoke.h	(revision 78cdb060aebec67f15c7fd9fe672626963efe0b5)
+++ libcfa/src/concurrency/invoke.h	(revision 0f9ceacb0d69af179f9dba9f14b304e5fa6560d7)
@@ -189,4 +189,6 @@
 		// instrusive link field for threads
 		struct thread_desc * next;
+		struct thread_desc * prev;
+		unsigned long long ts;
 
 		struct {
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 78cdb060aebec67f15c7fd9fe672626963efe0b5)
+++ libcfa/src/concurrency/kernel.cfa	(revision 0f9ceacb0d69af179f9dba9f14b304e5fa6560d7)
@@ -210,4 +210,5 @@
 	this.name = name;
 	this.cltr = &cltr;
+	id = -1u;
 	terminated{ 0 };
 	do_terminate = false;
@@ -239,7 +240,6 @@
 	this.preemption_rate = preemption_rate;
 	ready_queue{};
-	ready_queue_lock{};
-
-	procs{ __get };
+	ready_lock{};
+
 	idles{ __get };
 	threads{ __get };
@@ -270,5 +270,7 @@
 	__cfaabi_dbg_print_safe("Kernel : core %p starting\n", this);
 
-	doregister(this->cltr, this);
+	// register the processor unless it's the main thread which is handled in the boot sequence
+	if(this != mainProcessor)
+		this->id = doregister(this->cltr, this);
 
 	{
@@ -306,7 +308,9 @@
 	}
 
-	unregister(this->cltr, this);
-
 	V( this->terminated );
+
+	// unregister the processor unless it's the main thread which is handled in the boot sequence
+	if(this != mainProcessor)
+		unregister(this->cltr, this);
 
 	__cfaabi_dbg_print_safe("Kernel : core %p terminated\n", this);
@@ -505,8 +509,10 @@
 
 	with( *thrd->curr_cluster ) {
-		lock  ( ready_queue_lock __cfaabi_dbg_ctx2 );
-		bool was_empty = !(ready_queue != 0);
-		append( ready_queue, thrd );
-		unlock( ready_queue_lock );
+		ready_schedule_lock(*thrd->curr_cluster, kernelTLS.this_processor);
+		__atomic_acquire(&ready_queue.lock);
+		thrd->ts = rdtscl();
+		bool was_empty = push( ready_queue, thrd );
+		__atomic_unlock(&ready_queue.lock);
+		ready_schedule_unlock(*thrd->curr_cluster, kernelTLS.this_processor);
 
 		if(was_empty) {
@@ -529,7 +535,13 @@
 thread_desc * nextThread(cluster * this) with( *this ) {
 	verify( ! kernelTLS.preemption_state.enabled );
-	lock( ready_queue_lock __cfaabi_dbg_ctx2 );
-	thread_desc * head = pop_head( ready_queue );
-	unlock( ready_queue_lock );
+
+	ready_schedule_lock(*this, kernelTLS.this_processor);
+		__atomic_acquire(&ready_queue.lock);
+			thread_desc * head;
+			__attribute__((unused)) bool _;
+			[head, _] = pop( ready_queue );
+		__atomic_unlock(&ready_queue.lock);
+	ready_schedule_unlock(*this, kernelTLS.this_processor);
+
 	verify( ! kernelTLS.preemption_state.enabled );
 	return head;
@@ -693,4 +705,5 @@
 		pending_preemption = false;
 		kernel_thread = pthread_self();
+		id = -1u;
 
 		runner{ &this };
@@ -702,4 +715,6 @@
 	mainProcessor = (processor *)&storage_mainProcessor;
 	(*mainProcessor){};
+
+	mainProcessor->id = doregister(mainCluster, mainProcessor);
 
 	//initialize the global state variables
@@ -748,12 +763,16 @@
 	kernel_stop_preemption();
 
+	unregister(mainCluster, mainProcessor);
+
 	// Destroy the main processor and its context in reverse order of construction
 	// These were manually constructed so we need manually destroy them
 	^(mainProcessor->runner){};
-	^(mainProcessor){};
+	^(*mainProcessor){};
 
 	// Final step, destroy the main thread since it is no longer needed
-	// Since we provided a stack to this taxk it will not destroy anything
-	^(mainThread){};
+	// Since we provided a stack to this task it will not destroy anything
+	^(*mainThread){};
+
+	^(*mainCluster){};
 
 	^(__cfa_dbg_global_clusters.list){};
@@ -771,5 +790,4 @@
 	with( *cltr ) {
 		lock      (proc_list_lock __cfaabi_dbg_ctx2);
-		remove    (procs, *this);
 		push_front(idles, *this);
 		unlock    (proc_list_lock);
@@ -785,5 +803,4 @@
 		lock      (proc_list_lock __cfaabi_dbg_ctx2);
 		remove    (idles, *this);
-		push_front(procs, *this);
 		unlock    (proc_list_lock);
 	}
@@ -926,18 +943,4 @@
 }
 
-void doregister( cluster * cltr, processor * proc ) {
-	lock      (cltr->proc_list_lock __cfaabi_dbg_ctx2);
-	cltr->nprocessors += 1;
-	push_front(cltr->procs, *proc);
-	unlock    (cltr->proc_list_lock);
-}
-
-void unregister( cluster * cltr, processor * proc ) {
-	lock  (cltr->proc_list_lock __cfaabi_dbg_ctx2);
-	remove(cltr->procs, *proc );
-	cltr->nprocessors -= 1;
-	unlock(cltr->proc_list_lock);
-}
-
 //-----------------------------------------------------------------------------
 // Debug
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 78cdb060aebec67f15c7fd9fe672626963efe0b5)
+++ libcfa/src/concurrency/kernel.hfa	(revision 0f9ceacb0d69af179f9dba9f14b304e5fa6560d7)
@@ -107,4 +107,5 @@
 	// Cluster from which to get threads
 	struct cluster * cltr;
+	unsigned int id;
 
 	// Name of the processor
@@ -158,12 +159,67 @@
 }
 
+
+//-----------------------------------------------------------------------------
+// Cluster Tools
+struct __processor_id;
+
+// Reader-Writer lock protecting the ready-queue
+struct __clusterRWLock_t {
+	// total cachelines allocated
+	unsigned int max;
+
+	// cachelines currently in use
+	volatile unsigned int alloc;
+
+	// cachelines ready to iterate over
+	// (!= to alloc when thread is in second half of doregister)
+	volatile unsigned int ready;
+
+	// writer lock
+	volatile bool lock;
+
+	// data pointer
+	__processor_id * data;
+};
+
+void  ?{}(__clusterRWLock_t & this);
+void ^?{}(__clusterRWLock_t & this);
+
+// Underlying sub queues of the ready queue
+struct __attribute__((aligned(128))) __intrusive_ready_queue_t {
+	// spin lock protecting the queue
+	volatile bool lock;
+
+	// anchor for the head and the tail of the queue
+	struct __sentinel_t {
+		struct thread_desc * next;
+		struct thread_desc * prev;
+		unsigned long long ts;
+	} before, after;
+
+	// Optional statistic counters
+	#ifndef __CFA_NO_SCHED_STATS__
+		struct __attribute__((aligned(64))) {
+			// difference between number of push and pops
+			ssize_t diff;
+
+			// total number of pushes and pops
+			size_t  push;
+			size_t  pop ;
+		} stat;
+	#endif
+};
+
+void  ?{}(__intrusive_ready_queue_t & this);
+void ^?{}(__intrusive_ready_queue_t & this);
+
 //-----------------------------------------------------------------------------
 // Cluster
 struct cluster {
 	// Ready queue locks
-	__spinlock_t ready_queue_lock;
+	__clusterRWLock_t ready_lock;
 
 	// Ready queue for threads
-	__queue_t(thread_desc) ready_queue;
+	__intrusive_ready_queue_t ready_queue;
 
 	// Name of the cluster
@@ -175,7 +231,5 @@
 	// List of processors
 	__spinlock_t proc_list_lock;
-	__dllist_t(struct processor) procs;
 	__dllist_t(struct processor) idles;
-	unsigned int nprocessors;
 
 	// List of threads
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 78cdb060aebec67f15c7fd9fe672626963efe0b5)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision 0f9ceacb0d69af179f9dba9f14b304e5fa6560d7)
@@ -99,5 +99,5 @@
 //-----------------------------------------------------------------------------
 // Utils
-#define KERNEL_STORAGE(T,X) static char storage_##X[sizeof(T)]
+#define KERNEL_STORAGE(T,X) __attribute((aligned(__alignof__(T)))) static char storage_##X[sizeof(T)]
 
 static inline uint32_t tls_rand() {
@@ -115,6 +115,79 @@
 void unregister( struct cluster * cltr, struct thread_desc & thrd );
 
-void doregister( struct cluster * cltr, struct processor * proc );
-void unregister( struct cluster * cltr, struct processor * proc );
+//=======================================================================
+// Cluster lock API
+//=======================================================================
+struct __attribute__((aligned(64))) __processor_id {
+	processor * volatile handle;
+	volatile bool lock;
+};
+
+// Lock-Free registering/unregistering of threads
+// Register a processor to a given cluster and get its unique id in return
+unsigned doregister( struct cluster * cltr, struct processor * proc );
+
+// Unregister a processor from a given cluster using its id, getting back the original pointer
+void     unregister( struct cluster * cltr, struct processor * proc );
+
+//=======================================================================
+// Reader-writer lock implementation
+// Concurrent with doregister/unregister,
+//    i.e., threads can be added at any point during or between the entry/exit
+static inline void __atomic_acquire(volatile bool * ll) {
+	while( __builtin_expect(__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST), false) ) {
+		while(__atomic_load_n(ll, (int)__ATOMIC_RELAXED))
+			asm volatile("pause");
+	}
+	/* paranoid */ verify(*ll);
+}
+
+static inline bool __atomic_try_acquire(volatile bool * ll) {
+	return __atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST); // NOTE(review): returns true when the lock was ALREADY held (acquire FAILED) — inverse of conventional try-lock polarity; confirm at call sites
+}
+
+static inline void __atomic_unlock(volatile bool * ll) {
+	/* paranoid */ verify(*ll);
+	__atomic_store_n(ll, (bool)false, __ATOMIC_RELEASE);
+}
+
+//-----------------------------------------------------------------------
+// Reader side : acquire when using the ready queue to schedule but not
+//  creating/destroying queues
+static inline void ready_schedule_lock( struct cluster & cltr, struct processor * proc) with(cltr.ready_lock) {
+	unsigned iproc = proc->id;
+	/*paranoid*/ verify(data[iproc].handle == proc);
+	/*paranoid*/ verify(iproc < ready);
+
+	// Step 1 : make sure no writer are in the middle of the critical section
+	while(__atomic_load_n(&lock, (int)__ATOMIC_RELAXED))
+		asm volatile("pause");
+
+	// Fence needed because we don't want to start trying to acquire the lock
+	// before we read a false.
+	// Not needed on x86
+	// std::atomic_thread_fence(std::memory_order_seq_cst);
+
+	// Step 2 : acquire our local lock
+	__atomic_acquire( &data[iproc].lock );
+	/*paranoid*/ verify(data[iproc].lock);
+}
+
+static inline void ready_schedule_unlock( struct cluster & cltr, struct processor * proc) with(cltr.ready_lock) {
+	unsigned iproc = proc->id;
+	/*paranoid*/ verify(data[iproc].handle == proc);
+	/*paranoid*/ verify(iproc < ready);
+	/*paranoid*/ verify(data[iproc].lock);
+	__atomic_store_n(&data[iproc].lock, false, __ATOMIC_RELEASE);
+}
+
+//-----------------------------------------------------------------------
+// Writer side : acquire when changing the ready queue, e.g. adding more
+//  queues or removing them.
+uint_fast32_t ready_mutate_lock( struct cluster & cltr );
+
+void ready_mutate_unlock( struct cluster & cltr, uint_fast32_t );
+
+bool push(__intrusive_ready_queue_t & this, thread_desc * node);
+[thread_desc *, bool] pop(__intrusive_ready_queue_t & this);
 
 // Local Variables: //
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision 0f9ceacb0d69af179f9dba9f14b304e5fa6560d7)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision 0f9ceacb0d69af179f9dba9f14b304e5fa6560d7)
@@ -0,0 +1,296 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2019 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// ready_queue.cfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Mon Nov dd 16:29:18 2019
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#define __cforall_thread__
+
+#include "bits/defs.hfa"
+#include "kernel_private.hfa"
+
+#define _GNU_SOURCE
+#include "stdlib.hfa"
+
+static const size_t cache_line_size = 64;
+
+static inline unsigned __max_processors_fallback() {
+	#ifdef __CFA_MAX_PROCESSORS__
+		return __CFA_MAX_PROCESSORS__;
+	#else
+		// No overridden function, no environment variable, no define
+		// fall back to a magic number
+		return 128;
+	#endif
+}
+
+__attribute__((weak)) unsigned __max_processors() {
+	const char * max_cores_s = getenv("CFA_MAX_PROCESSORS");
+	if(!max_cores_s) {
+		__cfaabi_dbg_print_nolock("No CFA_MAX_PROCESSORS in ENV");
+		return __max_processors_fallback();
+	}
+
+	char * endptr = 0p;
+	long int max_cores_l = strtol(max_cores_s, &endptr, 10);
+	if(max_cores_l < 1 || max_cores_l > 65535) {
+		__cfaabi_dbg_print_nolock("CFA_MAX_PROCESSORS out of range : %ld", max_cores_l);
+		return __max_processors_fallback();
+	}
+	if('\0' != *endptr) {
+		__cfaabi_dbg_print_nolock("CFA_MAX_PROCESSORS not a decimal number : %s", max_cores_s);
+		return __max_processors_fallback();
+	}
+
+	return max_cores_l;
+}
+
+//=======================================================================
+// Cluster wide reader-writer lock
+//=======================================================================
+void  ?{}(__clusterRWLock_t & this) {
+	this.max   = __max_processors();
+	this.alloc = 0;
+	this.ready = 0;
+	this.lock  = false;
+	this.data  = alloc(this.max);
+
+	/*paranoid*/ verify( 0 == (((uintptr_t)(this.data    )) % 64) );
+	/*paranoid*/ verify( 0 == (((uintptr_t)(this.data + 1)) % 64) );
+	/*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.alloc), &this.alloc));
+	/*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.ready), &this.ready));
+
+}
+void ^?{}(__clusterRWLock_t & this) {
+	free(this.data);
+}
+
+void ?{}( __processor_id & this, struct processor * proc ) {
+	this.handle = proc;
+	this.lock   = false;
+}
+
+//=======================================================================
+// Lock-Free registering/unregistering of threads
+unsigned doregister( struct cluster * cltr, struct processor * proc ) with(cltr->ready_lock) {
+	// Step - 1 : check if there is already space in the data
+	uint_fast32_t s = ready;
+
+	// Check among all the ready
+	for(uint_fast32_t i = 0; i < s; i++) {
+		processor * null = 0p; // Re-write every loop since compare thrashes it
+		if( __atomic_load_n(&data[i].handle, (int)__ATOMIC_RELAXED) == null
+			&& __atomic_compare_exchange_n( &data[i].handle, &null, proc, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+			/*paranoid*/ verify(i < ready);
+			/*paranoid*/ verify(__alignof__(data[i]) == cache_line_size);
+			/*paranoid*/ verify((((uintptr_t)&data[i]) % cache_line_size) == 0);
+			return i;
+		}
+	}
+
+	if(max <= alloc) abort("Trying to create more than %u processors", cltr->ready_lock.max);
+
+	// Step - 2 : F&A to get a new spot in the array.
+	uint_fast32_t n = __atomic_fetch_add(&alloc, 1, __ATOMIC_SEQ_CST);
+	if(max <= n) abort("Trying to create more than %u processors", cltr->ready_lock.max);
+
+	// Step - 3 : Mark space as used and then publish it.
+	__processor_id * storage = (__processor_id *)&data[n];
+	(*storage){ proc };
+	while(true) {
+		unsigned copy = n;
+		if( __atomic_load_n(&ready, __ATOMIC_RELAXED) == n
+			&& __atomic_compare_exchange_n(&ready, &copy, n + 1, true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
+			break;
+		asm volatile("pause");
+	}
+
+	// Return new spot.
+	/*paranoid*/ verify(n < ready);
+	/*paranoid*/ verify(__alignof__(data[n]) == cache_line_size);
+	/*paranoid*/ verify((((uintptr_t)&data[n]) % cache_line_size) == 0);
+	return n;
+}
+
+void unregister( struct cluster * cltr, struct processor * proc ) with(cltr->ready_lock) {
+	unsigned id = proc->id;
+	/*paranoid*/ verify(id < ready);
+	/*paranoid*/ verify(proc == __atomic_load_n(&data[id].handle, __ATOMIC_RELAXED));
+	__atomic_store_n(&data[id].handle, 0p, __ATOMIC_RELEASE);
+}
+
+//-----------------------------------------------------------------------
+// Writer side : acquire when changing the ready queue, e.g. adding more
+//  queues or removing them.
+uint_fast32_t ready_mutate_lock( struct cluster & cltr ) with(cltr.ready_lock) {
+	// Step 1 : lock global lock
+	// It is needed to avoid processors that register mid Critical-Section
+	//   to simply lock their own lock and enter.
+	__atomic_acquire( &lock );
+
+	// Step 2 : lock per-proc lock
+	// Processors that are currently being registered aren't counted
+	//   but can't be in read_lock or in the critical section.
+	// All other processors are counted
+	uint_fast32_t s = ready;
+	for(uint_fast32_t i = 0; i < s; i++) {
+		__atomic_acquire( &data[i].lock );
+	}
+
+	return s;
+}
+
+void ready_mutate_unlock( struct cluster & cltr, uint_fast32_t last_s ) with(cltr.ready_lock) {
+	// Step 1 : release local locks
+	// This must be done while the global lock is held to avoid
+	//   threads that where created mid critical section
+	//   to race to lock their local locks and have the writer
+	//   immediately unlock them
+	// Alternative solution : return s in write_lock and pass it to write_unlock
+	for(uint_fast32_t i = 0; i < last_s; i++) {
+		verify(data[i].lock);
+		__atomic_store_n(&data[i].lock, (bool)false, __ATOMIC_RELEASE);
+	}
+
+	// Step 2 : release global lock
+	/*paranoid*/ assert(true == lock);
+	__atomic_store_n(&lock, (bool)false, __ATOMIC_RELEASE);
+}
+
+//=======================================================================
+// Intrusive Queue used by ready queue
+//=======================================================================
+static const size_t fields_offset = offsetof( thread_desc, next );
+
+// Get the head pointer (one before the first element) from the anchor
+static inline thread_desc * head(const __intrusive_ready_queue_t & this) {
+	thread_desc * rhead = (thread_desc *)(
+		(uintptr_t)( &this.before ) - fields_offset
+	);
+	/* paranoid */ verify(rhead);
+	return rhead;
+}
+
+// Get the tail pointer (one after the last element) from the anchor
+static inline thread_desc * tail(const __intrusive_ready_queue_t & this) {
+	thread_desc * rtail = (thread_desc *)(
+		(uintptr_t)( &this.after ) - fields_offset
+	);
+	/* paranoid */ verify(rtail);
+	return rtail;
+}
+
+// Ctor
+void ?{}( __intrusive_ready_queue_t & this ) {
+	this.before.prev = 0p;
+	this.before.next = tail(this);
+
+	this.after .prev = head(this);
+	this.after .next = 0p;
+
+	// We add a boat-load of assertions here because the anchor code is very fragile
+	/* paranoid */ verify(((uintptr_t)( head(this) ) + fields_offset) == (uintptr_t)(&this.before));
+	/* paranoid */ verify(((uintptr_t)( tail(this) ) + fields_offset) == (uintptr_t)(&this.after ));
+	/* paranoid */ verify(head(this)->prev == 0p );
+	/* paranoid */ verify(head(this)->next == tail(this) );
+	/* paranoid */ verify(tail(this)->next == 0p );
+	/* paranoid */ verify(tail(this)->prev == head(this) );
+	/* paranoid */ verify(&head(this)->prev == &this.before.prev );
+	/* paranoid */ verify(&head(this)->next == &this.before.next );
+	/* paranoid */ verify(&tail(this)->prev == &this.after .prev );
+	/* paranoid */ verify(&tail(this)->next == &this.after .next );
+	/* paranoid */ verify(sizeof(__intrusive_ready_queue_t) == 128);
+	/* paranoid */ verify(sizeof(this) == 128);
+	/* paranoid */ verify(__alignof__(__intrusive_ready_queue_t) == 128);
+	/* paranoid */ verify(__alignof__(this) == 128);
+	/* paranoid */ verifyf(((intptr_t)(&this) % 128) == 0, "Expected address to be aligned %p %% 128 == %zd", &this, ((intptr_t)(&this) % 128));
+}
+
+// Dtor is trivial
+void ^?{}( __intrusive_ready_queue_t & this ) {
+	// Make sure the list is empty
+	/* paranoid */ verify(head(this)->prev == 0p );
+	/* paranoid */ verify(head(this)->next == tail(this) );
+	/* paranoid */ verify(tail(this)->next == 0p );
+	/* paranoid */ verify(tail(this)->prev == head(this) );
+}
+
+
+
+bool push(__intrusive_ready_queue_t & this, thread_desc * node) {
+	verify(this.lock);
+	verify(node->ts != 0);
+	verify(node->next == 0p);
+	verify(node->prev == 0p);
+
+
+	// Get the relevant nodes locally
+	thread_desc * tail = tail(this);
+	thread_desc * prev = tail->prev;
+
+	// Do the push
+	node->next = tail;
+	node->prev = prev;
+	prev->next = node;
+	tail->prev = node;
+
+	// Update stats
+	#ifndef __CFA_NO_SCHED_STATS__
+		this.stat.diff++;
+		this.stat.push++;
+	#endif
+
+	// Check if the queue used to be empty
+	if(this.before.ts == 0l) {
+		this.before.ts = node->ts;
+		verify(node->prev == head(this));
+		return true;
+	}
+	return false;
+}
+
+[thread_desc *, bool] pop(__intrusive_ready_queue_t & this) {
+	verify(this.lock);
+	thread_desc * head = head(this);
+	thread_desc * tail = tail(this);
+
+	thread_desc * node = head->next;
+	thread_desc * next = node->next;
+	if(node == tail) return [0p, false];
+
+	/* paranoid */ verify(node);
+
+	head->next = next;
+	next->prev = head;
+
+	#ifndef __CFA_NO_SCHED_STATS__
+		this.stat.diff--;
+		this.stat.pop ++;
+	#endif
+
+	if(next == tail) {
+		this.before.ts = 0ul;
+		node->[next, prev] = 0p;
+		return [node, true];
+	}
+	else {
+		verify(next->ts != 0);
+		this.before.ts = next->ts;
+		verify(this.before.ts != 0);
+		node->[next, prev] = 0p;
+		return [node, false];
+	}
+}
+
+static inline unsigned long long ts(__intrusive_ready_queue_t & this) {
+	return this.before.ts;
+}
Index: libcfa/src/concurrency/thread.cfa
===================================================================
--- libcfa/src/concurrency/thread.cfa	(revision 78cdb060aebec67f15c7fd9fe672626963efe0b5)
+++ libcfa/src/concurrency/thread.cfa	(revision 0f9ceacb0d69af179f9dba9f14b304e5fa6560d7)
@@ -41,5 +41,6 @@
 	self_mon_p = &self_mon;
 	curr_cluster = &cl;
-	next = NULL;
+	next = 0p;
+	prev = 0p;
 
 	node.next = NULL;
