Index: libcfa/src/bits/defs.hfa
===================================================================
--- libcfa/src/bits/defs.hfa	(revision d4f1521a0b4c3222ff20073dd7fde49ad6004db2)
+++ libcfa/src/bits/defs.hfa	(revision b798713f34d313e4f132850f1f6d89bdc4947d8b)
@@ -54,5 +54,5 @@
 }
 
-#define __CFA_NO_BIT_TEST_AND_SET__
+// #define __CFA_NO_BIT_TEST_AND_SET__
 
 static inline bool bts(volatile unsigned long long int * target, unsigned long long int bit ) {
@@ -65,5 +65,5 @@
         asm volatile(
             "LOCK btsq %[bit], %[target]\n\t"
-            :"=@ccc" (result)
+            : "=@ccc" (result)
             : [target] "m" (*target), [bit] "r" (bit)
         );
Index: libcfa/src/concurrency/invoke.h
===================================================================
--- libcfa/src/concurrency/invoke.h	(revision d4f1521a0b4c3222ff20073dd7fde49ad6004db2)
+++ libcfa/src/concurrency/invoke.h	(revision b798713f34d313e4f132850f1f6d89bdc4947d8b)
@@ -158,4 +158,12 @@
 	};
 
+	// Link lists fields
+	// instrusive link field for threads
+	struct __thread_desc_link {
+		struct thread_desc * next;
+		struct thread_desc * prev;
+		unsigned long long ts;
+	};
+
 	struct thread_desc {
 		// Core threading fields
@@ -188,7 +196,5 @@
 		// Link lists fields
 		// instrusive link field for threads
-		struct thread_desc * next;
-		struct thread_desc * prev;
-		unsigned long long ts;
+		struct __thread_desc_link link;
 
 		struct {
@@ -201,5 +207,5 @@
 	extern "Cforall" {
 		static inline thread_desc *& get_next( thread_desc & this ) {
-			return this.next;
+			return this.link.next;
 		}
 
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision d4f1521a0b4c3222ff20073dd7fde49ad6004db2)
+++ libcfa/src/concurrency/kernel.cfa	(revision b798713f34d313e4f132850f1f6d89bdc4947d8b)
@@ -185,5 +185,6 @@
 	self_mon.recursion = 1;
 	self_mon_p = &self_mon;
-	next = NULL;
+	link.next = 0p;
+	link.prev = 0p;
 
 	node.next = NULL;
@@ -271,6 +272,9 @@
 
 	// register the processor unless it's the main thread which is handled in the boot sequence
-	if(this != mainProcessor)
+	if(this != mainProcessor) {
 		this->id = doregister(this->cltr, this);
+		ready_queue_grow( this->cltr );
+	}
+
 
 	{
@@ -310,9 +314,14 @@
 	V( this->terminated );
 
+
 	// unregister the processor unless it's the main thread which is handled in the boot sequence
-	if(this != mainProcessor)
+	if(this != mainProcessor) {
+		ready_queue_shrink( this->cltr );
 		unregister(this->cltr, this);
+	}
 
 	__cfaabi_dbg_print_safe("Kernel : core %p terminated\n", this);
+
+	stats_tls_tally(this->cltr);
 }
 
@@ -506,14 +515,12 @@
 	verify( ! kernelTLS.preemption_state.enabled );
 
-	verifyf( thrd->next == NULL, "Expected null got %p", thrd->next );
+	verifyf( thrd->link.next == NULL, "Expected null got %p", thrd->link.next );
+
+
+	ready_schedule_lock(thrd->curr_cluster, kernelTLS.this_processor);
+		bool was_empty = push( thrd->curr_cluster, thrd );
+	ready_schedule_unlock(thrd->curr_cluster, kernelTLS.this_processor);
 
 	with( *thrd->curr_cluster ) {
-		ready_schedule_lock(*thrd->curr_cluster, kernelTLS.this_processor);
-		__atomic_acquire(&ready_queue.lock);
-		thrd->ts = rdtscl();
-		bool was_empty = push( ready_queue, thrd );
-		__atomic_unlock(&ready_queue.lock);
-		ready_schedule_unlock(*thrd->curr_cluster, kernelTLS.this_processor);
-
 		if(was_empty) {
 			lock      (proc_list_lock __cfaabi_dbg_ctx2);
@@ -526,5 +533,4 @@
 			wake_fast(idle);
 		}
-
 	}
 
@@ -536,11 +542,7 @@
 	verify( ! kernelTLS.preemption_state.enabled );
 
-	ready_schedule_lock(*this, kernelTLS.this_processor);
-		__atomic_acquire(&ready_queue.lock);
-			thread_desc * head;
-			__attribute__((unused)) bool _;
-			[head, _] = pop( ready_queue );
-		__atomic_unlock(&ready_queue.lock);
-	ready_schedule_unlock(*this, kernelTLS.this_processor);
+	ready_schedule_lock(this, kernelTLS.this_processor);
+		thread_desc * head = pop( this );
+	ready_schedule_unlock(this, kernelTLS.this_processor);
 
 	verify( ! kernelTLS.preemption_state.enabled );
@@ -767,5 +769,9 @@
 	// Destroy the main processor and its context in reverse order of construction
 	// These were manually constructed so we need manually destroy them
-	^(mainProcessor->runner){};
+	void ^?{}(processor & this) with( this ) {
+		//don't join the main thread here, that wouldn't make any sense
+		__cfaabi_dbg_print_safe("Kernel : destroyed main processor context %p\n", &runner);
+	}
+
 	^(*mainProcessor){};
 
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision d4f1521a0b4c3222ff20073dd7fde49ad6004db2)
+++ libcfa/src/concurrency/kernel.hfa	(revision b798713f34d313e4f132850f1f6d89bdc4947d8b)
@@ -190,14 +190,16 @@
 	// spin lock protecting the queue
 	volatile bool lock;
+	unsigned int last_id;
 
 	// anchor for the head and the tail of the queue
 	struct __sentinel_t {
-		struct thread_desc * next;
-		struct thread_desc * prev;
-		unsigned long long ts;
+		// Link lists fields
+		// instrusive link field for threads
+		// must be exactly as in thread_desc
+		__thread_desc_link link;
 	} before, after;
 
 	// Optional statistic counters
-	#ifndef __CFA_NO_SCHED_STATS__
+	#if !defined(__CFA_NO_SCHED_STATS__)
 		struct __attribute__((aligned(64))) {
 			// difference between number of push and pops
@@ -214,4 +216,51 @@
 void ^?{}(__intrusive_ready_queue_t & this);
 
+typedef unsigned long long __cfa_readyQ_mask_t;
+
+// enum {
+// 	__cfa_ready_queue_mask_size = (64 - sizeof(size_t)) / sizeof(size_t),
+// 	__cfa_max_ready_queues = __cfa_ready_queue_mask_size * 8 * sizeof(size_t)
+// };
+
+#define __cfa_readyQ_mask_size ((64 - sizeof(size_t)) / sizeof(__cfa_readyQ_mask_t))
+#define __cfa_max_readyQs (__cfa_readyQ_mask_size * 8 * sizeof(__cfa_readyQ_mask_t))
+
+//TODO adjust cache size to ARCHITECTURE
+struct __attribute__((aligned(128))) __ready_queue_t {
+	struct {
+		volatile size_t count;
+		volatile __cfa_readyQ_mask_t mask[ __cfa_readyQ_mask_size ];
+	} empty;
+
+	struct __attribute__((aligned(64))) {
+		__intrusive_ready_queue_t * volatile data;
+		volatile size_t count;
+	} list;
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		__attribute__((aligned(64))) struct {
+			struct {
+				struct {
+					volatile size_t attempt;
+					volatile size_t success;
+				} push;
+				struct {
+					volatile size_t maskrds;
+					volatile size_t attempt;
+					volatile size_t success;
+				} pop;
+			} pick;
+			struct {
+				volatile size_t value;
+				volatile size_t count;
+			} full;
+		} global_stats;
+
+	#endif
+};
+
+void  ?{}(__ready_queue_t & this);
+void ^?{}(__ready_queue_t & this);
+
 //-----------------------------------------------------------------------------
 // Cluster
@@ -221,5 +270,5 @@
 
 	// Ready queue for threads
-	__intrusive_ready_queue_t ready_queue;
+	__ready_queue_t ready_queue;
 
 	// Name of the cluster
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision d4f1521a0b4c3222ff20073dd7fde49ad6004db2)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision b798713f34d313e4f132850f1f6d89bdc4947d8b)
@@ -143,5 +143,5 @@
 
 static inline bool __atomic_try_acquire(volatile bool * ll) {
-	return __atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST);
+	return !__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST);
 }
 
@@ -154,5 +154,5 @@
 // Reader side : acquire when using the ready queue to schedule but not
 //  creating/destroying queues
-static inline void ready_schedule_lock( struct cluster & cltr, struct processor * proc) with(cltr.ready_lock) {
+static inline void ready_schedule_lock( struct cluster * cltr, struct processor * proc) with(cltr->ready_lock) {
 	unsigned iproc = proc->id;
 	/*paranoid*/ verify(data[iproc].handle == proc);
@@ -173,5 +173,5 @@
 }
 
-static inline void ready_schedule_unlock( struct cluster & cltr, struct processor * proc) with(cltr.ready_lock) {
+static inline void ready_schedule_unlock( struct cluster * cltr, struct processor * proc) with(cltr->ready_lock) {
 	unsigned iproc = proc->id;
 	/*paranoid*/ verify(data[iproc].handle == proc);
@@ -188,6 +188,17 @@
 void ready_mutate_unlock( struct cluster & cltr, uint_fast32_t );
 
-bool push(__intrusive_ready_queue_t & this, thread_desc * node);
-[thread_desc *, bool] pop(__intrusive_ready_queue_t & this);
+//=======================================================================
+// Ready-Queue API
+
+__attribute__((hot)) bool push(struct cluster * cltr, struct thread_desc * thrd);
+__attribute__((hot)) thread_desc * pop(struct cluster * cltr);
+void ready_queue_grow  (struct cluster * cltr);
+void ready_queue_shrink(struct cluster * cltr);
+
+#if !defined(__CFA_NO_STATISTICS__)
+void stats_tls_tally(struct cluster * cltr);
+#else
+static inline void stats_tls_tally(struct cluster * cltr) {}
+#endif
 
 // Local Variables: //
Index: libcfa/src/concurrency/monitor.cfa
===================================================================
--- libcfa/src/concurrency/monitor.cfa	(revision d4f1521a0b4c3222ff20073dd7fde49ad6004db2)
+++ libcfa/src/concurrency/monitor.cfa	(revision b798713f34d313e4f132850f1f6d89bdc4947d8b)
@@ -841,5 +841,5 @@
 	for(	thread_desc ** thrd_it = &entry_queue.head;
 		*thrd_it;
-		thrd_it = &(*thrd_it)->next
+		thrd_it = &(*thrd_it)->link.next
 	) {
 		// For each acceptable check if it matches
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision d4f1521a0b4c3222ff20073dd7fde49ad6004db2)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision b798713f34d313e4f132850f1f6d89bdc4947d8b)
@@ -55,4 +55,52 @@
 }
 
+static inline unsigned rand_bit(unsigned rnum, size_t mask) {
+	verify(sizeof(mask) == 8);
+	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
+#if !defined(__BMI2__)
+	uint64_t v = mask;   // Input value to find position with rank r.
+	unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
+	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
+	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
+	unsigned int t;      // Bit count temporary.
+
+	// Do a normal parallel bit count for a 64-bit integer,
+	// but store all intermediate steps.
+	a =  v - ((v >> 1) & ~0UL/3);
+	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
+	c = (b + (b >> 4)) & ~0UL/0x11;
+	d = (c + (c >> 8)) & ~0UL/0x101;
+
+
+	t = (d >> 32) + (d >> 48);
+	// Now do branchless select!
+	s  = 64;
+	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
+	t  = (d >> (s - 16)) & 0xff;
+	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
+	t  = (c >> (s - 8)) & 0xf;
+	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
+	t  = (b >> (s - 4)) & 0x7;
+	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
+	t  = (a >> (s - 2)) & 0x3;
+	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
+	t  = (v >> (s - 1)) & 0x1;
+	s -= ((t - r) & 256) >> 8;
+	return s - 1;
+#else
+	uint64_t picked = _pdep_u64(1ul << bit, mask);
+	return picked ? __builtin_ctzl(picked) : 0;
+#endif
+}
+
+static inline __cfa_readyQ_mask_t readyQ_mask_full       () { return (8 * sizeof(__cfa_readyQ_mask_t)) - 1; }
+static inline __cfa_readyQ_mask_t readyQ_mask_shit_length() { return (8 * sizeof(__cfa_readyQ_mask_t)) - __builtin_clzl(readyQ_mask_full()); }
+
+static inline [__cfa_readyQ_mask_t, __cfa_readyQ_mask_t] extract(__cfa_readyQ_mask_t idx) {
+	__cfa_readyQ_mask_t word = idx >> readyQ_mask_shit_length();
+	__cfa_readyQ_mask_t bit  = idx &  readyQ_mask_full();
+	return [bit, word];
+}
+
 //=======================================================================
 // Cluster wide reader-writer lock
@@ -170,10 +218,8 @@
 // Intrusive Queue used by ready queue
 //=======================================================================
-static const size_t fields_offset = offsetof( thread_desc, next );
-
 // Get the head pointer (one before the first element) from the anchor
 static inline thread_desc * head(const __intrusive_ready_queue_t & this) {
 	thread_desc * rhead = (thread_desc *)(
-		(uintptr_t)( &this.before ) - fields_offset
+		(uintptr_t)( &this.before ) - offsetof( thread_desc, link )
 	);
 	/* paranoid */ verify(rhead);
@@ -184,5 +230,5 @@
 static inline thread_desc * tail(const __intrusive_ready_queue_t & this) {
 	thread_desc * rtail = (thread_desc *)(
-		(uintptr_t)( &this.after ) - fields_offset
+		(uintptr_t)( &this.after ) - offsetof( thread_desc, link )
 	);
 	/* paranoid */ verify(rtail);
@@ -192,21 +238,32 @@
 // Ctor
 void ?{}( __intrusive_ready_queue_t & this ) {
-	this.before.prev = 0p;
-	this.before.next = tail(this);
-
-	this.after .prev = head(this);
-	this.after .next = 0p;
+	this.lock = false;
+	this.last_id = -1u;
+
+	this.before.link.prev = 0p;
+	this.before.link.next = tail(this);
+	this.before.link.ts   = 0;
+
+	this.after .link.prev = head(this);
+	this.after .link.next = 0p;
+	this.after .link.ts   = 0;
+
+	#if !defined(__CFA_NO_SCHED_STATS__)
+		this.stat.diff = 0;
+		this.stat.push = 0;
+		this.stat.pop  = 0;
+	#endif
 
 	// We add a boat-load of assertions here because the anchor code is very fragile
-	/* paranoid */ verify(((uintptr_t)( head(this) ) + fields_offset) == (uintptr_t)(&this.before));
-	/* paranoid */ verify(((uintptr_t)( tail(this) ) + fields_offset) == (uintptr_t)(&this.after ));
-	/* paranoid */ verify(head(this)->prev == 0p );
-	/* paranoid */ verify(head(this)->next == tail(this) );
-	/* paranoid */ verify(tail(this)->next == 0p );
-	/* paranoid */ verify(tail(this)->prev == head(this) );
-	/* paranoid */ verify(&head(this)->prev == &this.before.prev );
-	/* paranoid */ verify(&head(this)->next == &this.before.next );
-	/* paranoid */ verify(&tail(this)->prev == &this.after .prev );
-	/* paranoid */ verify(&tail(this)->next == &this.after .next );
+	/* paranoid */ verify(((uintptr_t)( head(this) ) + offsetof( thread_desc, link )) == (uintptr_t)(&this.before));
+	/* paranoid */ verify(((uintptr_t)( tail(this) ) + offsetof( thread_desc, link )) == (uintptr_t)(&this.after ));
+	/* paranoid */ verify(head(this)->link.prev == 0p );
+	/* paranoid */ verify(head(this)->link.next == tail(this) );
+	/* paranoid */ verify(tail(this)->link.next == 0p );
+	/* paranoid */ verify(tail(this)->link.prev == head(this) );
+	/* paranoid */ verify(&head(this)->link.prev == &this.before.link.prev );
+	/* paranoid */ verify(&head(this)->link.next == &this.before.link.next );
+	/* paranoid */ verify(&tail(this)->link.prev == &this.after .link.prev );
+	/* paranoid */ verify(&tail(this)->link.next == &this.after .link.next );
 	/* paranoid */ verify(sizeof(__intrusive_ready_queue_t) == 128);
 	/* paranoid */ verify(sizeof(this) == 128);
@@ -214,4 +271,7 @@
 	/* paranoid */ verify(__alignof__(this) == 128);
 	/* paranoid */ verifyf(((intptr_t)(&this) % 128) == 0, "Expected address to be aligned %p %% 128 == %zd", &this, ((intptr_t)(&this) % 128));
+
+	/* paranoid */ verifyf(readyQ_mask_shit_length() == 6 , "%zu", readyQ_mask_shit_length());
+	/* paranoid */ verifyf(readyQ_mask_full()        == 63, "%zu", readyQ_mask_full());
 }
 
@@ -219,8 +279,8 @@
 void ^?{}( __intrusive_ready_queue_t & this ) {
 	// Make sure the list is empty
-	/* paranoid */ verify(head(this)->prev == 0p );
-	/* paranoid */ verify(head(this)->next == tail(this) );
-	/* paranoid */ verify(tail(this)->next == 0p );
-	/* paranoid */ verify(tail(this)->prev == head(this) );
+	/* paranoid */ verify(head(this)->link.prev == 0p );
+	/* paranoid */ verify(head(this)->link.next == tail(this) );
+	/* paranoid */ verify(tail(this)->link.next == 0p );
+	/* paranoid */ verify(tail(this)->link.prev == head(this) );
 }
 
@@ -229,18 +289,24 @@
 bool push(__intrusive_ready_queue_t & this, thread_desc * node) {
 	verify(this.lock);
-	verify(node->ts != 0);
-	verify(node->next == 0p);
-	verify(node->prev == 0p);
-
+	verify(node->link.ts != 0);
+	verify(node->link.next == 0p);
+	verify(node->link.prev == 0p);
+
+	if(this.before.link.ts == 0l) {
+		verify(tail(this)->link.next == 0p);
+		verify(tail(this)->link.prev == head(this));
+		verify(head(this)->link.next == tail(this));
+		verify(head(this)->link.prev == 0p);
+	}
 
 	// Get the relevant nodes locally
 	thread_desc * tail = tail(this);
-	thread_desc * prev = tail->prev;
+	thread_desc * prev = tail->link.prev;
 
 	// Do the push
-	node->next = tail;
-	node->prev = prev;
-	prev->next = node;
-	tail->prev = node;
+	node->link.next = tail;
+	node->link.prev = prev;
+	prev->link.next = node;
+	tail->link.prev = node;
 
 	// Update stats
@@ -250,8 +316,10 @@
 	#endif
 
+	verify(node->link.next == tail(this));
+
 	// Check if the queue used to be empty
-	if(this.before.ts == 0l) {
-		this.before.ts = node->ts;
-		verify(node->prev == head(this));
+	if(this.before.link.ts == 0l) {
+		this.before.link.ts = node->link.ts;
+		verify(node->link.prev == head(this));
 		return true;
 	}
@@ -261,15 +329,24 @@
 [thread_desc *, bool] pop(__intrusive_ready_queue_t & this) {
 	verify(this.lock);
+	verify(this.before.link.ts != 0ul);
 	thread_desc * head = head(this);
 	thread_desc * tail = tail(this);
 
-	thread_desc * node = head->next;
-	thread_desc * next = node->next;
-	if(node == tail) return [0p, false];
+	thread_desc * node = head->link.next;
+	thread_desc * next = node->link.next;
+	if(node == tail) {
+		verify(false);
+		verify(this.before.link.ts == 0ul);
+		verify(tail(this)->link.next == 0p);
+		verify(tail(this)->link.prev == head(this));
+		verify(head(this)->link.next == tail(this));
+		verify(head(this)->link.prev == 0p);
+		return [0p, false];
+	}
 
 	/* paranoid */ verify(node);
 
-	head->next = next;
-	next->prev = head;
+	head->link.next = next;
+	next->link.prev = head;
 
 	#ifndef __CFA_NO_SCHED_STATS__
@@ -279,13 +356,17 @@
 
 	if(next == tail) {
-		this.before.ts = 0ul;
-		node->[next, prev] = 0p;
+		this.before.link.ts = 0ul;
+		verify(tail(this)->link.next == 0p);
+		verify(tail(this)->link.prev == head(this));
+		verify(head(this)->link.next == tail(this));
+		verify(head(this)->link.prev == 0p);
+		node->link.[next, prev] = 0p;
 		return [node, true];
 	}
 	else {
-		verify(next->ts != 0);
-		this.before.ts = next->ts;
-		verify(this.before.ts != 0);
-		node->[next, prev] = 0p;
+		verify(next->link.ts != 0);
+		this.before.link.ts = next->link.ts;
+		verify(this.before.link.ts != 0);
+		node->link.[next, prev] = 0p;
 		return [node, false];
 	}
@@ -293,4 +374,431 @@
 
 static inline unsigned long long ts(__intrusive_ready_queue_t & this) {
-	return this.before.ts;
-}
+	return this.before.link.ts;
+}
+
+//=======================================================================
+// Cforall Reqdy Queue used by ready queue
+//=======================================================================
+
+static __attribute__((aligned(128))) thread_local struct {
+	struct {
+		struct {
+			size_t attempt;
+			size_t success;
+		} push;
+		struct {
+			size_t maskrds;
+			size_t attempt;
+			size_t success;
+		} pop;
+	} pick;
+	struct {
+		size_t value;
+		size_t count;
+	} full;
+} tls = {
+	/* pick */{
+		/* push */{ 0, 0 },
+		/* pop  */{ 0, 0, 0 },
+	},
+	/* full */{ 0, 0 }
+};
+
+//-----------------------------------------------------------------------
+
+void ?{}(__ready_queue_t & this) with (this) {
+	empty.count = 0;
+	for( i ; __cfa_readyQ_mask_size ) {
+		empty.mask[i] = 0;
+	}
+
+	list.data = alloc(4);
+	for( i; 4 ) {
+		(list.data[i]){};
+	}
+	list.count = 4;
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		global_stats.pick.push.attempt = 0;
+		global_stats.pick.push.success = 0;
+		global_stats.pick.pop .maskrds = 0;
+		global_stats.pick.pop .attempt = 0;
+		global_stats.pick.pop .success = 0;
+
+		global_stats.full.value = 0;
+		global_stats.full.count = 0;
+	#endif
+}
+
+void ^?{}(__ready_queue_t & this) with (this) {
+	verify( 4  == list .count );
+	verify( 0  == empty.count );
+
+	for( i; 4 ) {
+		^(list.data[i]){};
+	}
+	free(list.data);
+
+
+	#if defined(__CFA_WITH_VERIFY__)
+		for( i ; __cfa_readyQ_mask_size ) {
+			assert( 0 == empty.mask[i] );
+		}
+	#endif
+}
+
+//-----------------------------------------------------------------------
+
+__attribute__((hot)) bool push(struct cluster * cltr, struct thread_desc * thrd) with (cltr->ready_queue) {
+	thrd->link.ts = rdtscl();
+
+	while(true) {
+		// Pick a random list
+		unsigned i = tls_rand() % list.count;
+
+		#if !defined(__CFA_NO_STATISTICS__)
+			tls.pick.push.attempt++;
+		#endif
+
+		// If we can't lock it retry
+		if( !__atomic_try_acquire( &list.data[i].lock ) ) continue;
+		verify(list.data[i].last_id == -1u);
+		list.data[i].last_id = kernelTLS.this_processor->id;
+
+		__attribute__((unused)) size_t num = __atomic_load_n( &empty.count, __ATOMIC_RELAXED );
+		bool first = false;
+
+		verify( list.data[i].last_id == kernelTLS.this_processor->id );
+		verify( list.data[i].lock );
+		// Actually push it
+		if(push(list.data[i], thrd)) {
+			size_t ret = __atomic_fetch_add( &empty.count, 1z, __ATOMIC_SEQ_CST);
+			first = (ret == 0);
+
+			__cfa_readyQ_mask_t word;
+			__cfa_readyQ_mask_t bit;
+			[bit, word] = extract(i);
+			verifyf((empty.mask[word] & (1ull << bit)) == 0, "Before set %llu:%llu (%u), %llx & %llx", word, bit, i, empty.mask[word], (1ull << bit));
+			__attribute__((unused)) bool ret = bts(&empty.mask[word], bit);
+			verify(!(bool)ret);
+			verifyf((empty.mask[word] & (1ull << bit)) != 0, "After set %llu:%llu (%u), %llx & %llx", word, bit, i, empty.mask[word], (1ull << bit));
+		}
+		verify(empty.count <= (int)list.count);
+		verify( list.data[i].last_id == kernelTLS.this_processor->id );
+		verify( list.data[i].lock );
+
+		// Unlock and return
+		list.data[i].last_id = -1u;
+		__atomic_unlock( &list.data[i].lock );
+
+		#if !defined(__CFA_NO_STATISTICS__)
+			tls.pick.push.success++;
+			tls.full.value += num;
+			tls.full.count += 1;
+		#endif
+		return first;
+	}
+}
+
+//-----------------------------------------------------------------------
+
+static struct thread_desc * try_pop(struct cluster * cltr, unsigned i, unsigned j) with (cltr->ready_queue) {
+	#if !defined(__CFA_NO_STATISTICS__)
+		tls.pick.pop.attempt++;
+	#endif
+
+	// Pick the bet list
+	int w = i;
+	if( __builtin_expect(ts(list.data[j]) != 0, true) ) {
+		w = (ts(list.data[i]) < ts(list.data[j])) ? i : j;
+	}
+
+	__intrusive_ready_queue_t & list = list.data[w];
+	// If list looks empty retry
+	if( ts(list) == 0 ) return 0p;
+
+	// If we can't get the lock retry
+	if( !__atomic_try_acquire(&list.lock) ) return 0p;
+	verify(list.last_id == -1u);
+	list.last_id = kernelTLS.this_processor->id;
+
+	verify(list.last_id == kernelTLS.this_processor->id);
+
+	__attribute__((unused)) int num = __atomic_load_n( &empty.count, __ATOMIC_RELAXED );
+
+
+	// If list is empty, unlock and retry
+	if( ts(list) == 0 ) {
+		list.last_id = -1u;
+		__atomic_unlock(&list.lock);
+		return 0p;
+	}
+	{
+		__cfa_readyQ_mask_t word;
+		__cfa_readyQ_mask_t bit;
+		[bit, word] = extract(w);
+		verify((empty.mask[word] & (1ull << bit)) != 0);
+	}
+
+	verify(list.last_id == kernelTLS.this_processor->id);
+	verify(list.lock);
+
+	// Actually pop the list
+	struct thread_desc * thrd;
+	bool emptied;
+	[thrd, emptied] = pop(list);
+	verify(thrd);
+
+	verify(list.last_id == kernelTLS.this_processor->id);
+	verify(list.lock);
+
+	if(emptied) {
+		__atomic_fetch_sub( &empty.count, 1z, __ATOMIC_SEQ_CST);
+
+		__cfa_readyQ_mask_t word;
+		__cfa_readyQ_mask_t bit;
+		[bit, word] = extract(w);
+		verify((empty.mask[word] & (1ull << bit)) != 0);
+		__attribute__((unused)) bool ret = btr(&empty.mask[word], bit);
+		verify(ret);
+		verify((empty.mask[word] & (1ull << bit)) == 0);
+	}
+
+	verify(list.lock);
+
+	// Unlock and return
+	list.last_id = -1u;
+	__atomic_unlock(&list.lock);
+	verify(empty.count >= 0);
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		tls.pick.pop.success++;
+		tls.full.value += num;
+		tls.full.count += 1;
+	#endif
+
+	return thrd;
+}
+
+__attribute__((hot)) thread_desc * pop(struct cluster * cltr) with (cltr->ready_queue) {
+	verify( list.count > 0 );
+	while( __atomic_load_n( &empty.count, __ATOMIC_RELAXED ) != 0) {
+		#if !defined(__CFA_READQ_NO_BITMASK__)
+			tls.pick.pop.maskrds++;
+			unsigned i, j;
+			{
+				#if !defined(__CFA_NO_SCHED_STATS__)
+					tls.pick.pop.maskrds++;
+				#endif
+
+				// Pick two lists at random
+				unsigned num = ((__atomic_load_n( &list.count, __ATOMIC_RELAXED ) - 1) >> 6) + 1;
+
+				unsigned ri = tls_rand();
+				unsigned rj = tls_rand();
+
+				unsigned wdxi = (ri >> 6u) % num;
+				unsigned wdxj = (rj >> 6u) % num;
+
+				size_t maski = __atomic_load_n( &empty.mask[wdxi], __ATOMIC_RELAXED );
+				size_t maskj = __atomic_load_n( &empty.mask[wdxj], __ATOMIC_RELAXED );
+
+				if(maski == 0 && maskj == 0) continue;
+
+				unsigned bi = rand_bit(ri, maski);
+				unsigned bj = rand_bit(rj, maskj);
+
+				verifyf(bi < 64, "%zu %u", maski, bi);
+				verifyf(bj < 64, "%zu %u", maskj, bj);
+
+				i = bi | (wdxi << 6);
+				j = bj | (wdxj << 6);
+
+				verifyf(i < list.count, "%u", wdxi << 6);
+				verifyf(j < list.count, "%u", wdxj << 6);
+			}
+
+			struct thread_desc * thrd = try_pop(cltr, i, j);
+			if(thrd) return thrd;
+		#else
+			// Pick two lists at random
+			int i = tls_rand() % __atomic_load_n( &list.count, __ATOMIC_RELAXED );
+			int j = tls_rand() % __atomic_load_n( &list.count, __ATOMIC_RELAXED );
+
+			struct thread_desc * thrd = try_pop(cltr, i, j);
+			if(thrd) return thrd;
+		#endif
+	}
+
+	return 0p;
+}
+
+//-----------------------------------------------------------------------
+
+static void check( __ready_queue_t & q ) with (q) {
+	#if defined(__CFA_WITH_VERIFY__)
+		{
+			int idx = 0;
+			for( w ; __cfa_readyQ_mask_size ) {
+				for( b ; 8 * sizeof(__cfa_readyQ_mask_t) ) {
+					bool is_empty = idx < list.count ? (ts(list.data[idx]) == 0) : true;
+					bool should_be_empty = 0 == (empty.mask[w] & (1z << b));
+					assertf(should_be_empty == is_empty, "Inconsistent list %d, mask expect : %d, actual is got %d", idx, should_be_empty, (bool)is_empty);
+					assert(__cfa_max_readyQs > idx);
+					idx++;
+				}
+			}
+		}
+
+		{
+			for( idx ; list.count ) {
+				__intrusive_ready_queue_t & sl = list.data[idx];
+				assert(!list.data[idx].lock);
+
+				assert(head(sl)->link.prev == 0p );
+				assert(head(sl)->link.next->link.prev == head(sl) );
+				assert(tail(sl)->link.next == 0p );
+				assert(tail(sl)->link.prev->link.next == tail(sl) );
+
+				if(sl.before.link.ts == 0l) {
+					assert(tail(sl)->link.next == 0p);
+					assert(tail(sl)->link.prev == head(sl));
+					assert(head(sl)->link.next == tail(sl));
+					assert(head(sl)->link.prev == 0p);
+				}
+			}
+		}
+	#endif
+}
+
+// Call this function of the intrusive list was moved using memcpy
+// fixes the list so that the pointers back to anchors aren't left
+// dangling
+static inline void fix(__intrusive_ready_queue_t & ll) {
+	// if the list is not empty then follow he pointer
+	// and fix its reverse
+	if(ll.before.link.ts != 0l) {
+		head(ll)->link.next->link.prev = head(ll);
+		tail(ll)->link.prev->link.next = tail(ll);
+	}
+	// Otherwise just reset the list
+	else {
+		tail(ll)->link.next = 0p;
+		tail(ll)->link.prev = head(ll);
+		head(ll)->link.next = tail(ll);
+		head(ll)->link.prev = 0p;
+	}
+}
+
+void ready_queue_grow  (struct cluster * cltr) {
+	uint_fast32_t last_size = ready_mutate_lock( *cltr );
+	check( cltr->ready_queue );
+
+	with( cltr->ready_queue ) {
+		size_t ncount = list.count;
+
+		// Check that we have some space left
+		if(ncount + 4 >= __cfa_max_readyQs) abort("Program attempted to create more than maximum number of Ready Queues (%zu)", __cfa_max_readyQs);
+
+		ncount += 4;
+
+		// Allocate new array
+		list.data = alloc(list.data, ncount);
+
+		// Fix the moved data
+		for( idx; (size_t)list.count ) {
+			fix(list.data[idx]);
+		}
+
+		// Construct new data
+		for( idx; (size_t)list.count ~ ncount) {
+			(list.data[idx]){};
+		}
+
+		// Update original
+		list.count = ncount;
+		// fields in empty don't need to change
+	}
+
+	// Make sure that everything is consistent
+	check( cltr->ready_queue );
+	ready_mutate_unlock( *cltr, last_size );
+}
+
+void ready_queue_shrink(struct cluster * cltr) {
+	uint_fast32_t last_size = ready_mutate_lock( *cltr );
+	with( cltr->ready_queue ) {
+		size_t ocount = list.count;
+		// Check that we have some space left
+		if(ocount < 8) abort("Program attempted to destroy more Ready Queues than were created");
+
+		list.count -= 4;
+
+		// redistribute old data
+		verify(ocount > list.count);
+		for( idx; (size_t)list.count ~ ocount) {
+			// This is not strictly needed but makes checking invariants much easier
+			bool locked = __atomic_try_acquire(&list.data[idx].lock);
+			verify(locked);
+			while(0 != ts(list.data[idx])) {
+				struct thread_desc * thrd;
+				__attribute__((unused)) bool _;
+				[thrd, _] = pop(list.data[idx]);
+				verify(thrd);
+				push(cltr, thrd);
+			}
+
+			__atomic_unlock(&list.data[idx].lock);
+
+			// TODO print the queue statistics here
+
+			^(list.data[idx]){};
+		}
+
+		// clear the now unused masks
+		{
+			__cfa_readyQ_mask_t fword, fbit, lword, lbit;
+			[fbit, fword] = extract(ocount);
+			[lbit, lword] = extract(list.count);
+
+			// For now assume that all queues where coverd by the same bitmask
+			// This is highly probable as long as grow and shrink use groups of 4
+			// exclusively
+			verify(fword == lword);
+			__cfa_readyQ_mask_t clears = ~0;
+
+			for( b ; fbit ~ lbit ) {
+				clears ^= 1 << b;
+			}
+
+			empty.mask[fword] &= clears;
+		}
+
+		// Allocate new array
+		list.data = alloc(list.data, list.count);
+
+		// Fix the moved data
+		for( idx; (size_t)list.count ) {
+			fix(list.data[idx]);
+		}
+	}
+
+	// Make sure that everything is consistent
+	check( cltr->ready_queue );
+	ready_mutate_unlock( *cltr, last_size );
+}
+
+//-----------------------------------------------------------------------
+
+#if !defined(__CFA_NO_STATISTICS__)
+void stats_tls_tally(struct cluster * cltr) with (cltr->ready_queue) {
+	__atomic_fetch_add( &global_stats.pick.push.attempt, tls.pick.push.attempt, __ATOMIC_SEQ_CST );
+	__atomic_fetch_add( &global_stats.pick.push.success, tls.pick.push.success, __ATOMIC_SEQ_CST );
+	__atomic_fetch_add( &global_stats.pick.pop .maskrds, tls.pick.pop .maskrds, __ATOMIC_SEQ_CST );
+	__atomic_fetch_add( &global_stats.pick.pop .attempt, tls.pick.pop .attempt, __ATOMIC_SEQ_CST );
+	__atomic_fetch_add( &global_stats.pick.pop .success, tls.pick.pop .success, __ATOMIC_SEQ_CST );
+
+	__atomic_fetch_add( &global_stats.full.value, tls.full.value, __ATOMIC_SEQ_CST );
+	__atomic_fetch_add( &global_stats.full.count, tls.full.count, __ATOMIC_SEQ_CST );
+}
+#endif
Index: libcfa/src/concurrency/thread.cfa
===================================================================
--- libcfa/src/concurrency/thread.cfa	(revision d4f1521a0b4c3222ff20073dd7fde49ad6004db2)
+++ libcfa/src/concurrency/thread.cfa	(revision b798713f34d313e4f132850f1f6d89bdc4947d8b)
@@ -41,6 +41,6 @@
 	self_mon_p = &self_mon;
 	curr_cluster = &cl;
-	next = 0p;
-	prev = 0p;
+	link.next = 0p;
+	link.prev = 0p;
 
 	node.next = NULL;
