Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision c84b4be20d06963eb1b538cfb902b7ba61e4644d)
+++ libcfa/src/concurrency/kernel.hfa	(revision dca580212e17dd47149961ae07496513513fa920)
@@ -165,7 +165,13 @@
 //-----------------------------------------------------------------------------
 // Cluster Tools
+
+// Cells used by the reader-writer lock
+// while not generic, it only relies on an opaque pointer
 struct __processor_id;
 
 // Reader-Writer lock protecting the ready-queue
+// while this lock is mostly generic, some aspects
+// have been hard-coded for the ready-queue for
+// simplicity and performance
 struct __clusterRWLock_t {
 	// total cachelines allocated
@@ -189,10 +195,8 @@
 void ^?{}(__clusterRWLock_t & this);
 
-// Underlying sub quues of the ready queue
-struct __attribute__((aligned(128))) __intrusive_ready_queue_t {
+// Intrusive lanes which are used by the relaxed ready queue
+struct __attribute__((aligned(128))) __intrusive_lane_t {
 	// spin lock protecting the queue
 	volatile bool lock;
-	unsigned int last_id;
-	unsigned int count;
 
 	// anchor for the head and the tail of the queue
@@ -204,4 +208,14 @@
 	} before, after;
 
+#if defined(__CFA_WITH_VERIFY__)
+	// id of last processor to acquire the lock
+	// needed only to check for mutual exclusion violations
+	unsigned int last_id;
+
+	// number of items on this list
+	// needed only to check for deadlocks
+	unsigned int count;
+#endif
+
 	// Optional statistic counters
 	#if !defined(__CFA_NO_SCHED_STATS__)
@@ -217,6 +231,6 @@
 };
 
-void  ?{}(__intrusive_ready_queue_t & this);
-void ^?{}(__intrusive_ready_queue_t & this);
+void  ?{}(__intrusive_lane_t & this);
+void ^?{}(__intrusive_lane_t & this);
 
 typedef unsigned long long __cfa_readyQ_mask_t;
@@ -227,36 +241,67 @@
 // };
 
-#define __cfa_readyQ_mask_size ((64 - sizeof(size_t)) / sizeof(__cfa_readyQ_mask_t))
-#define __cfa_max_readyQs (__cfa_readyQ_mask_size * 8 * sizeof(__cfa_readyQ_mask_t))
+#define __cfa_lane_mask_size ((64 - sizeof(size_t)) / sizeof(__cfa_readyQ_mask_t))
+#define __cfa_max_lanes (__cfa_lane_mask_size * 8 * sizeof(__cfa_readyQ_mask_t))
 
 //TODO adjust cache size to ARCHITECTURE
+// Structure holding the relaxed ready queue
 struct __attribute__((aligned(128))) __ready_queue_t {
+	// Data tracking how many/which lanes are used
+	// Aligned to 128 for cache locality
 	struct {
+		// number of non-empty lanes
 		volatile size_t count;
-		volatile __cfa_readyQ_mask_t mask[ __cfa_readyQ_mask_size ];
-	} empty;
-
+
+		// bit mask, set bits identify which lanes are non-empty
+		volatile __cfa_readyQ_mask_t mask[ __cfa_lane_mask_size ];
+	} used;
+
+	// Data tracking the actual lanes
+	// On a separate cacheline from the used struct since
+	// used can change on each push/pop but this data
+	// only changes on shrink/grow
 	struct __attribute__((aligned(64))) {
-		__intrusive_ready_queue_t * volatile data;
+		// Array of lanes
+		__intrusive_lane_t * volatile data;
+
+		// Number of lanes (empty or not)
 		volatile size_t count;
-	} list;
-
+	} lanes;
+
+	// Statistics
 	#if !defined(__CFA_NO_STATISTICS__)
 		__attribute__((aligned(64))) struct {
 			struct {
+				// Push statistic
 				struct {
+					// number of attempts at pushing something
 					volatile size_t attempt;
+
+					// number of successes at pushing
 					volatile size_t success;
 				} push;
+
+				// Pop statistic
 				struct {
+					// number of reads of the mask
+					// picking an empty __cfa_readyQ_mask_t counts here
+					// but not as an attempt
 					volatile size_t maskrds;
+
+					// number of attempts at popping something
 					volatile size_t attempt;
+
+					// number of successes at popping
 					volatile size_t success;
 				} pop;
 			} pick;
+
+			// stats on the "used" struct of the queue
+			// tracks average number of queues that are not empty
+			// when pushing / popping
 			struct {
 				volatile size_t value;
 				volatile size_t count;
-			} full;
+			} used;
 		} global_stats;
 
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision c84b4be20d06963eb1b538cfb902b7ba61e4644d)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision dca580212e17dd47149961ae07496513513fa920)
@@ -136,4 +136,8 @@
 // Concurrent with doregister/unregister,
 //    i.e., threads can be added at any point during or between the entry/exit
+
+//-----------------------------------------------------------------------
+// simple spinlock underlying the RWLock
+// Blocking acquire
 static inline void __atomic_acquire(volatile bool * ll) {
 	while( __builtin_expect(__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST), false) ) {
@@ -144,8 +148,10 @@
 }
 
+// Non-Blocking acquire
 static inline bool __atomic_try_acquire(volatile bool * ll) {
 	return !__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST);
 }
 
+// Release
 static inline void __atomic_unlock(volatile bool * ll) {
 	/* paranoid */ verify(*ll);
@@ -180,5 +186,5 @@
 	/*paranoid*/ verify(iproc < ready);
 	/*paranoid*/ verify(data[iproc].lock);
-	__atomic_store_n(&data[iproc].lock, false, __ATOMIC_RELEASE);
+	__atomic_unlock(&data[iproc].lock);
 }
 
@@ -188,14 +194,28 @@
 uint_fast32_t ready_mutate_lock( struct cluster & cltr );
 
-void ready_mutate_unlock( struct cluster & cltr, uint_fast32_t );
+void ready_mutate_unlock( struct cluster & cltr, uint_fast32_t /* value returned by lock */ );
 
 //=======================================================================
 // Ready-Queue API
-
+//-----------------------------------------------------------------------
+// push thread onto a ready queue for a cluster
+// returns true if the list was previously empty, false otherwise
 __attribute__((hot)) bool push(struct cluster * cltr, struct thread_desc * thrd);
+
+//-----------------------------------------------------------------------
+// pop thread from the ready queue of a cluster
+// returns 0p if empty
 __attribute__((hot)) thread_desc * pop(struct cluster * cltr);
+
+//-----------------------------------------------------------------------
+// Increase the width of the ready queue (number of lanes) by 4
 void ready_queue_grow  (struct cluster * cltr);
+
+//-----------------------------------------------------------------------
+// Decrease the width of the ready queue (number of lanes) by 4
 void ready_queue_shrink(struct cluster * cltr);
 
+//-----------------------------------------------------------------------
+// Statistics call at the end of each thread to register statistics
 #if !defined(__CFA_NO_STATISTICS__)
 void stats_tls_tally(struct cluster * cltr);
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision c84b4be20d06963eb1b538cfb902b7ba61e4644d)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision dca580212e17dd47149961ae07496513513fa920)
@@ -24,19 +24,16 @@
 static const size_t cache_line_size = 64;
 
-static inline unsigned __max_processors_fallback() {
-	#ifdef __CFA_MAX_PROCESSORS__
-		return __CFA_MAX_PROCESSORS__;
-	#else
-		// No overriden function, no environment variable, no define
-		// fall back to a magic number
-		return 128;
-	#endif
-}
-
+// No overridden function, no environment variable, no define
+// fall back to a magic number
+#ifndef __CFA_MAX_PROCESSORS__
+	#define __CFA_MAX_PROCESSORS__ 128
+#endif
+
+// returns the maximum number of processors the RWLock supports
 __attribute__((weak)) unsigned __max_processors() {
 	const char * max_cores_s = getenv("CFA_MAX_PROCESSORS");
 	if(!max_cores_s) {
 		__cfaabi_dbg_print_nolock("No CFA_MAX_PROCESSORS in ENV");
-		return __max_processors_fallback();
+		return __CFA_MAX_PROCESSORS__;
 	}
 
@@ -45,9 +42,9 @@
 	if(max_cores_l < 1 || max_cores_l > 65535) {
 		__cfaabi_dbg_print_nolock("CFA_MAX_PROCESSORS out of range : %ld", max_cores_l);
-		return __max_processors_fallback();
+		return __CFA_MAX_PROCESSORS__;
 	}
 	if('\0' != *endptr) {
 		__cfaabi_dbg_print_nolock("CFA_MAX_PROCESSORS not a decimal number : %s", max_cores_s);
-		return __max_processors_fallback();
+		return __CFA_MAX_PROCESSORS__;
 	}
 
@@ -55,49 +52,74 @@
 }
 
-static inline unsigned rand_bit(unsigned rnum, size_t mask) {
-	verify(sizeof(mask) == 8);
+// Picks a random 1 bit in 'mask' according to random number 'rnum'.
+static inline unsigned rand_bit(unsigned rnum, __cfa_readyQ_mask_t mask) {
+#if defined( __i386 )
+	static_assert(sizeof(mask) == 4);
+	unsigned bit = mask ? rnum % __builtin_popcount(mask) : 0;
+	#if !defined(__BMI2__)
+		#error rand_bit not implemented for non __BMI2__ i386
+	#else
+		uint32_t picked = _pdep_u32(1ul << bit, mask);
+		return picked ? __builtin_ctz(picked) : 0;
+	#endif
+#elif defined( __x86_64 )
+	static_assert(sizeof(mask) == 8);
 	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
-#if !defined(__BMI2__)
-	uint64_t v = mask;   // Input value to find position with rank r.
-	unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
-	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
-	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
-	unsigned int t;      // Bit count temporary.
-
-	// Do a normal parallel bit count for a 64-bit integer,
-	// but store all intermediate steps.
-	a =  v - ((v >> 1) & ~0UL/3);
-	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
-	c = (b + (b >> 4)) & ~0UL/0x11;
-	d = (c + (c >> 8)) & ~0UL/0x101;
-
-
-	t = (d >> 32) + (d >> 48);
-	// Now do branchless select!
-	s  = 64;
-	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
-	t  = (d >> (s - 16)) & 0xff;
-	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
-	t  = (c >> (s - 8)) & 0xf;
-	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
-	t  = (b >> (s - 4)) & 0x7;
-	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
-	t  = (a >> (s - 2)) & 0x3;
-	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
-	t  = (v >> (s - 1)) & 0x1;
-	s -= ((t - r) & 256) >> 8;
-	return s - 1;
+	#if !defined(__BMI2__)
+		uint64_t v = mask;   // Input value to find position with rank r.
+		unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
+		unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
+		uint64_t a, b, c, d; // Intermediate temporaries for bit count.
+		unsigned int t;      // Bit count temporary.
+
+		// Do a normal parallel bit count for a 64-bit integer,
+		// but store all intermediate steps.
+		a =  v - ((v >> 1) & ~0UL/3);
+		b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
+		c = (b + (b >> 4)) & ~0UL/0x11;
+		d = (c + (c >> 8)) & ~0UL/0x101;
+
+
+		t = (d >> 32) + (d >> 48);
+		// Now do branchless select!
+		s  = 64;
+		s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
+		t  = (d >> (s - 16)) & 0xff;
+		s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
+		t  = (c >> (s - 8)) & 0xf;
+		s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
+		t  = (b >> (s - 4)) & 0x7;
+		s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
+		t  = (a >> (s - 2)) & 0x3;
+		s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
+		t  = (v >> (s - 1)) & 0x1;
+		s -= ((t - r) & 256) >> 8;
+		return s - 1;
+	#else
+		uint64_t picked = _pdep_u64(1ul << bit, mask);
+		return picked ? __builtin_ctzl(picked) : 0;
+	#endif
+#elif defined( __ARM_ARCH )
+	#error rand_bit not implemented for arm
 #else
-	uint64_t picked = _pdep_u64(1ul << bit, mask);
-	return picked ? __builtin_ctzl(picked) : 0;
+	#error uknown hardware architecture
 #endif
 }
 
-static inline __cfa_readyQ_mask_t readyQ_mask_full       () { return (8 * sizeof(__cfa_readyQ_mask_t)) - 1; }
-static inline __cfa_readyQ_mask_t readyQ_mask_shit_length() { return (8 * sizeof(__cfa_readyQ_mask_t)) - __builtin_clzl(readyQ_mask_full()); }
-
+
+//-----------------------------------------------------------------------------
+// Helpers used by extract
+// (_mask_bitsidx() & X) returns a bit index valid for a __cfa_readyQ_mask_t, where X is any integer
+static inline __cfa_readyQ_mask_t _mask_bitsidx () { return (8 * sizeof(__cfa_readyQ_mask_t)) - 1; }
+
+// (X >> _mask_shiftidx()) returns an index into an array of __cfa_readyQ_mask_t
+static inline __cfa_readyQ_mask_t _mask_shiftidx() { return (8 * sizeof(__cfa_readyQ_mask_t)) - __builtin_clzl(_mask_bitsidx()); }
+
+
+// Treats an array of __cfa_readyQ_mask_t as one large bit mask: given a bit index 'idx' into that mask,
+// returns [bit, word], i.e. the bit position (idx & _mask_bitsidx()) inside array element (idx >> _mask_shiftidx())
 static inline [__cfa_readyQ_mask_t, __cfa_readyQ_mask_t] extract(__cfa_readyQ_mask_t idx) {
-	__cfa_readyQ_mask_t word = idx >> readyQ_mask_shit_length();
-	__cfa_readyQ_mask_t bit  = idx &  readyQ_mask_full();
+	__cfa_readyQ_mask_t word = idx >> _mask_shiftidx();
+	__cfa_readyQ_mask_t bit  = idx &  _mask_bitsidx();
 	return [bit, word];
 }
@@ -219,5 +241,5 @@
 //=======================================================================
 // Get the head pointer (one before the first element) from the anchor
-static inline thread_desc * head(const __intrusive_ready_queue_t & this) {
+static inline thread_desc * head(const __intrusive_lane_t & this) {
 	thread_desc * rhead = (thread_desc *)(
 		(uintptr_t)( &this.before ) - offsetof( thread_desc, link )
@@ -228,5 +250,5 @@
 
 // Get the tail pointer (one after the last element) from the anchor
-static inline thread_desc * tail(const __intrusive_ready_queue_t & this) {
+static inline thread_desc * tail(const __intrusive_lane_t & this) {
 	thread_desc * rtail = (thread_desc *)(
 		(uintptr_t)( &this.after ) - offsetof( thread_desc, link )
@@ -237,5 +259,5 @@
 
 // Ctor
-void ?{}( __intrusive_ready_queue_t & this ) {
+void ?{}( __intrusive_lane_t & this ) {
 	this.lock = false;
 	this.last_id = -1u;
@@ -267,16 +289,16 @@
 	/* paranoid */ verify(&tail(this)->link.prev == &this.after .link.prev );
 	/* paranoid */ verify(&tail(this)->link.next == &this.after .link.next );
-	/* paranoid */ verify(sizeof(__intrusive_ready_queue_t) == 128);
+	/* paranoid */ verify(sizeof(__intrusive_lane_t) == 128);
 	/* paranoid */ verify(sizeof(this) == 128);
-	/* paranoid */ verify(__alignof__(__intrusive_ready_queue_t) == 128);
+	/* paranoid */ verify(__alignof__(__intrusive_lane_t) == 128);
 	/* paranoid */ verify(__alignof__(this) == 128);
 	/* paranoid */ verifyf(((intptr_t)(&this) % 128) == 0, "Expected address to be aligned %p %% 128 == %zd", &this, ((intptr_t)(&this) % 128));
 
-	/* paranoid */ verifyf(readyQ_mask_shit_length() == 6 , "%zu", readyQ_mask_shit_length());
-	/* paranoid */ verifyf(readyQ_mask_full()        == 63, "%zu", readyQ_mask_full());
+	/* paranoid */ verifyf(_mask_shiftidx() == 6 , "%zu", _mask_shiftidx());
+	/* paranoid */ verifyf(_mask_bitsidx () == 63, "%zu", _mask_bitsidx());
 }
 
 // Dtor is trivial
-void ^?{}( __intrusive_ready_queue_t & this ) {
+void ^?{}( __intrusive_lane_t & this ) {
 	// Make sure the list is empty
 	/* paranoid */ verify(head(this)->link.prev == 0p );
@@ -287,20 +309,22 @@
 }
 
-
-
-bool push(__intrusive_ready_queue_t & this, thread_desc * node) {
-	verify(this.lock);
-	verify(node->link.ts != 0);
-	verify(node->link.next == 0p);
-	verify(node->link.prev == 0p);
-
-	this.count++;
-
-	if(this.before.link.ts == 0l) {
-		verify(tail(this)->link.next == 0p);
-		verify(tail(this)->link.prev == head(this));
-		verify(head(this)->link.next == tail(this));
-		verify(head(this)->link.prev == 0p);
-	}
+// Push a thread onto this lane
+// returns true if lane was empty before push, false otherwise
+bool push(__intrusive_lane_t & this, thread_desc * node) {
+	#if defined(__CFA_WITH_VERIFY__)
+		/* paranoid */ verify(this.lock);
+		/* paranoid */ verify(node->link.ts != 0);
+		/* paranoid */ verify(node->link.next == 0p);
+		/* paranoid */ verify(node->link.prev == 0p);
+
+		this.count++;
+
+		if(this.before.link.ts == 0l) {
+			/* paranoid */ verify(tail(this)->link.next == 0p);
+			/* paranoid */ verify(tail(this)->link.prev == head(this));
+			/* paranoid */ verify(head(this)->link.next == tail(this));
+			/* paranoid */ verify(head(this)->link.prev == 0p);
+		}
+	#endif
 
 	// Get the relevant nodes locally
@@ -315,5 +339,5 @@
 
 	// Update stats
-	#ifndef __CFA_NO_SCHED_STATS__
+	#if !defined(__CFA_NO_SCHED_STATS__)
 		this.stat.diff++;
 		this.stat.push++;
@@ -325,5 +349,5 @@
 	if(this.before.link.ts == 0l) {
 		this.before.link.ts = node->link.ts;
-		verify(node->link.prev == head(this));
+		/* paranoid */ verify(node->link.prev == head(this));
 		return true;
 	}
@@ -331,28 +355,34 @@
 }
 
-[thread_desc *, bool] pop(__intrusive_ready_queue_t & this) {
-	verify(this.lock);
-	verify(this.before.link.ts != 0ul);
+// Pop a thread from this lane (must be non-empty)
+// returns the popped thread
+// and true if this pop emptied the lane, false otherwise
+[thread_desc *, bool] pop(__intrusive_lane_t & this) {
+	/* paranoid */ verify(this.lock);
+	/* paranoid */ verify(this.before.link.ts != 0ul);
+
+	// Get anchors locally
 	thread_desc * head = head(this);
 	thread_desc * tail = tail(this);
 
+	// Get the relevant nodes locally
 	thread_desc * node = head->link.next;
 	thread_desc * next = node->link.next;
-	if(node == tail) {
-		verify(false);
-		verify(this.before.link.ts == 0ul);
-		verify(tail(this)->link.next == 0p);
-		verify(tail(this)->link.prev == head(this));
-		verify(head(this)->link.next == tail(this));
-		verify(head(this)->link.prev == 0p);
-		return [0p, false];
-	}
-
-	this.count--;
-	/* paranoid */ verify(node);
-
+
+	#if defined(__CFA_WITH_VERIFY__)
+		this.count--;
+		/* paranoid */ verify(node != tail);
+		/* paranoid */ verify(node);
+	#endif
+
+	// Do the pop
 	head->link.next = next;
 	next->link.prev = head;
-
+	node->link.[next, prev] = 0p;
+
+	// Update head time stamp
+	this.before.link.ts = next->link.ts;
+
+	// Update stats
 	#ifndef __CFA_NO_SCHED_STATS__
 		this.stat.diff--;
@@ -360,23 +390,29 @@
 	#endif
 
+	// Check if we emptied list and return accordingly
 	if(next == tail) {
-		this.before.link.ts = 0ul;
-		verify(tail(this)->link.next == 0p);
-		verify(tail(this)->link.prev == head(this));
-		verify(head(this)->link.next == tail(this));
-		verify(head(this)->link.prev == 0p);
-		node->link.[next, prev] = 0p;
+		/* paranoid */ verify(this.before.link.ts == 0);
+		/* paranoid */ verify(tail(this)->link.next == 0p);
+		/* paranoid */ verify(tail(this)->link.prev == head(this));
+		/* paranoid */ verify(head(this)->link.next == tail(this));
+		/* paranoid */ verify(head(this)->link.prev == 0p);
 		return [node, true];
 	}
 	else {
-		verify(next->link.ts != 0);
-		this.before.link.ts = next->link.ts;
-		verify(this.before.link.ts != 0);
-		node->link.[next, prev] = 0p;
+		/* paranoid */ verify(next->link.ts != 0);
+		/* paranoid */ verify(this.before.link.ts != 0);
 		return [node, false];
 	}
 }
 
-static inline unsigned long long ts(__intrusive_ready_queue_t & this) {
+// Check whether or not list is empty
+static inline bool is_empty(__intrusive_lane_t & this) {
+	verify( (this.before.link.ts == 0) == (this.count == 0) );
+	return this.before.link.ts == 0;
+}
+
+// Return the timestamp
+static inline unsigned long long ts(__intrusive_lane_t & this) {
+	verify( this.before.link.ts == this.before.link.next->link.ts );
 	return this.before.link.ts;
 }
@@ -386,4 +422,6 @@
 //=======================================================================
 
+// Thread local mirror of ready queue statistics
+#if !defined(__CFA_NO_STATISTICS__)
 static __attribute__((aligned(128))) thread_local struct {
 	struct {
@@ -401,5 +439,5 @@
 		size_t value;
 		size_t count;
-	} full;
+	} used;
 } tls = {
 	/* pick */{
@@ -407,20 +445,21 @@
 		/* pop  */{ 0, 0, 0 },
 	},
-	/* full */{ 0, 0 }
+	/* used */{ 0, 0 }
 };
+#endif
 
 //-----------------------------------------------------------------------
 
 void ?{}(__ready_queue_t & this) with (this) {
-	empty.count = 0;
-	for( i ; __cfa_readyQ_mask_size ) {
-		empty.mask[i] = 0;
-	}
-
-	list.data = alloc(4);
+	used.count = 0;
+	for( i ; __cfa_lane_mask_size ) {
+		used.mask[i] = 0;
+	}
+
+	lanes.data = alloc(4);
 	for( i; 4 ) {
-		(list.data[i]){};
-	}
-	list.count = 4;
+		(lanes.data[i]){};
+	}
+	lanes.count = 4;
 
 	#if !defined(__CFA_NO_STATISTICS__)
@@ -431,22 +470,22 @@
 		global_stats.pick.pop .success = 0;
 
-		global_stats.full.value = 0;
-		global_stats.full.count = 0;
+		global_stats.used.value = 0;
+		global_stats.used.count = 0;
 	#endif
 }
 
 void ^?{}(__ready_queue_t & this) with (this) {
-	verify( 4  == list .count );
-	verify( 0  == empty.count );
+	verify( 4  == lanes.count );
+	verify( 0  == used .count );
 
 	for( i; 4 ) {
-		^(list.data[i]){};
-	}
-	free(list.data);
+		^(lanes.data[i]){};
+	}
+	free(lanes.data);
 
 
 	#if defined(__CFA_WITH_VERIFY__)
-		for( i ; __cfa_readyQ_mask_size ) {
-			assert( 0 == empty.mask[i] );
+		for( i ; __cfa_lane_mask_size ) {
+			assert( 0 == used.mask[i] );
 		}
 	#endif
@@ -454,11 +493,81 @@
 
 //-----------------------------------------------------------------------
-
+enum mask_strictness {
+	STRICT,
+	NOCHECK
+};
+
+// Set a given bit in the bit mask array
+// strictness determines if the bit had to be cleared before (STRICT) or if that is not checked (NOCHECK)
+static inline void mask_set(__cfa_readyQ_mask_t * mask, unsigned index, mask_strictness strict) {
+	// Extract the array and bit indexes
+	__cfa_readyQ_mask_t word;
+	__cfa_readyQ_mask_t bit;
+	[bit, word] = extract(index);
+
+	// Conditional check
+	verifyf(
+		strict != STRICT || // Only enforced if the bit was expected to be cleared
+		((mask[word] & (1ull << bit)) == 0),
+		"Before set %llu:%llu (%u), %llx & %llx", word, bit, index, mask[word], (1ull << bit)
+	);
+
+	// Atomically set the bit
+	__attribute__((unused)) bool ret = __atomic_bts(&mask[word], bit);
+
+	// Conditional check
+	verifyf(
+		strict != STRICT || // Only enforced if the bit was expected to be cleared
+		!ret,
+		"Bit was not set but bts returned true"
+	);
+
+	// Unconditional check
+	verifyf(
+		(mask[word] & (1ull << bit)) != 0,
+		"After set %llu:%llu (%u), %llx & %llx", word, bit, index, mask[word], (1ull << bit)
+	);
+}
+// Clear a given bit in the bit mask array; strictness determines if the bit had to be set before (STRICT) or not (NOCHECK)
+static inline void mask_clear(__cfa_readyQ_mask_t * mask, unsigned index, mask_strictness strict) {
+	// Extract the array and bit indexes
+	__cfa_readyQ_mask_t word;
+	__cfa_readyQ_mask_t bit;
+	[bit, word] = extract(index);
+
+	// Conditional check
+	verifyf(
+		strict != STRICT || // Only enforced if the bit was expected to be set
+		((mask[word] & (1ull << bit)) != 0),
+		"Before clear %llu:%llu (%u), %llx & %llx", word, bit, index, mask[word], (1ull << bit)
+	);
+
+	// Atomically clear the bit
+	__attribute__((unused)) bool ret = __atomic_btr(&mask[word], bit);
+
+	// Conditional check
+	verifyf(
+		strict != STRICT || // Only enforced if the bit was expected to be set
+		ret,
+		"Bit was set but btr returned false"
+	);
+
+	// Unconditional check
+	verifyf(
+		(mask[word] & (1ull << bit)) == 0,
+		"After clear %llu:%llu (%u), %llx & %llx", word, bit, index, mask[word], (1ull << bit)
+	);
+}
+
+//-----------------------------------------------------------------------
 __attribute__((hot)) bool push(struct cluster * cltr, struct thread_desc * thrd) with (cltr->ready_queue) {
+	// write timestamp
 	thrd->link.ts = rdtscl();
 
-	while(true) {
-		// Pick a random list
-		unsigned i = tls_rand() % list.count;
+	// Try to pick a lane and lock it
+	unsigned i; // declared outside the loop: read by the loop condition and by the code after the loop
+	do {
+		// Pick the index of a lane
+		i = tls_rand() % lanes.count;
 
 		#if !defined(__CFA_NO_STATISTICS__)
@@ -467,45 +576,52 @@
 
 		// If we can't lock it retry
-		if( !__atomic_try_acquire( &list.data[i].lock ) ) continue;
-		verify(list.data[i].last_id == -1u);
-		list.data[i].last_id = kernelTLS.this_processor->id;
-
-		__attribute__((unused)) size_t num = __atomic_load_n( &empty.count, __ATOMIC_RELAXED );
-		bool first = false;
-
-		verify( list.data[i].last_id == kernelTLS.this_processor->id );
-		verify( list.data[i].lock );
-		// Actually push it
-		if(push(list.data[i], thrd)) {
-			size_t ret = __atomic_fetch_add( &empty.count, 1z, __ATOMIC_SEQ_CST);
-			first = (ret == 0);
-
-			__cfa_readyQ_mask_t word;
-			__cfa_readyQ_mask_t bit;
-			[bit, word] = extract(i);
-			verifyf((empty.mask[word] & (1ull << bit)) == 0, "Before set %llu:%llu (%u), %llx & %llx", word, bit, i, empty.mask[word], (1ull << bit));
-			__attribute__((unused)) bool ret = bts(&empty.mask[word], bit);
-			verify(!(bool)ret);
-			verifyf((empty.mask[word] & (1ull << bit)) != 0, "After set %llu:%llu (%u), %llx & %llx", word, bit, i, empty.mask[word], (1ull << bit));
-		}
-		verifyf( empty.count <= list.count, "Non-empty count (%zu) exceeds actual count (%zu)\n", empty.count, list.count );
-		verifyf( list.data[i].last_id == kernelTLS.this_processor->id, "Expected last processor to lock queue %u to be %u, was %u\n", i, list.data[i].last_id, kernelTLS.this_processor->id );
-		verifyf( list.data[i].lock, "List %u is not locked\n", i );
-
-		// Unlock and return
-		list.data[i].last_id = -1u;
-		__atomic_unlock( &list.data[i].lock );
-
-		#if !defined(__CFA_NO_STATISTICS__)
-			tls.pick.push.success++;
-			tls.full.value += num;
-			tls.full.count += 1;
-		#endif
-		return first;
-	}
+	} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+
+	#if defined(__CFA_WITH_VERIFY__)
+		/* paranoid */ verify(lanes.data[i].last_id == -1u);
+		/* paranoid */ lanes.data[i].last_id = kernelTLS.this_processor->id;
+	#endif
+
+	__attribute__((unused)) size_t num = __atomic_load_n( &used.count, __ATOMIC_RELAXED );
+	bool first = false;
+
+	// Actually push it
+	bool lane_first = push(lanes.data[i], thrd);
+
+	// If this lane used to be empty we need to do more
+	if(lane_first) {
+		// Update the global count
+		size_t ret = __atomic_fetch_add( &used.count, 1z, __ATOMIC_SEQ_CST);
+
+		// Check if the entire queue used to be empty
+		first = (ret == 0);
+
+		// Update the bit mask
+		mask_set((__cfa_readyQ_mask_t *)used.mask, i, STRICT);
+	}
+
+	#if defined(__CFA_WITH_VERIFY__)
+		/* paranoid */ verifyf( used.count <= lanes.count, "Non-empty count (%zu) exceeds actual count (%zu)\n", used.count, lanes.count );
+		/* paranoid */ verifyf( lanes.data[i].last_id == kernelTLS.this_processor->id, "Expected last processor to lock queue %u to be %u, was %u\n", i, lanes.data[i].last_id, kernelTLS.this_processor->id );
+		/* paranoid */ verifyf( lanes.data[i].lock, "List %u is not locked\n", i );
+		/* paranoid */ lanes.data[i].last_id = -1u;
+	#endif
+
+	// Unlock and return
+	__atomic_unlock( &lanes.data[i].lock );
+
+	// Update statistics
+	#if !defined(__CFA_NO_STATISTICS__)
+		tls.pick.push.success++;
+		tls.used.value += num;
+		tls.used.count += 1;
+	#endif
+
+	// return whether or not the list was empty before this push
+	return first;
 }
 
 //-----------------------------------------------------------------------
-
+// Given 2 indexes, pick the list with the oldest push and try to pop from it
 static struct thread_desc * try_pop(struct cluster * cltr, unsigned i, unsigned j) with (cltr->ready_queue) {
 	#if !defined(__CFA_NO_STATISTICS__)
@@ -515,120 +631,129 @@
 	// Pick the bet list
 	int w = i;
-	if( __builtin_expect(ts(list.data[j]) != 0, true) ) {
-		w = (ts(list.data[i]) < ts(list.data[j])) ? i : j;
-	}
-
-	__intrusive_ready_queue_t & list = list.data[w];
+	if( __builtin_expect(!is_empty(lanes.data[j]), true) ) {
+		w = (ts(lanes.data[i]) < ts(lanes.data[j])) ? i : j;
+	}
+
+	// Get relevant elements locally
+	__intrusive_lane_t & lane = lanes.data[w];
+
 	// If list looks empty retry
-	if( ts(list) == 0 ) return 0p;
+	if( is_empty(lane) ) return 0p;
 
 	// If we can't get the lock retry
-	if( !__atomic_try_acquire(&list.lock) ) return 0p;
-	verify(list.last_id == -1u);
-	list.last_id = kernelTLS.this_processor->id;
-
-	verify(list.last_id == kernelTLS.this_processor->id);
-
-	__attribute__((unused)) int num = __atomic_load_n( &empty.count, __ATOMIC_RELAXED );
+	if( !__atomic_try_acquire(&lane.lock) ) return 0p;
+
+	#if defined(__CFA_WITH_VERIFY__)
+		/* paranoid */ verify(lane.last_id == -1u);
+		/* paranoid */ lane.last_id = kernelTLS.this_processor->id;
+	#endif
 
 
 	// If list is empty, unlock and retry
-	if( ts(list) == 0 ) {
-		list.last_id = -1u;
-		__atomic_unlock(&list.lock);
+	if( is_empty(lane) ) {
+		#if defined(__CFA_WITH_VERIFY__)
+			/* paranoid */ verify(lane.last_id == kernelTLS.this_processor->id);
+			/* paranoid */ lane.last_id = -1u;
+		#endif
+
+		__atomic_unlock(&lane.lock);
 		return 0p;
 	}
-	{
-		__cfa_readyQ_mask_t word;
-		__cfa_readyQ_mask_t bit;
-		[bit, word] = extract(w);
-		verify((empty.mask[word] & (1ull << bit)) != 0);
-	}
-
-	verify(list.last_id == kernelTLS.this_processor->id);
-	verify(list.lock);
 
 	// Actually pop the list
 	struct thread_desc * thrd;
 	bool emptied;
-	[thrd, emptied] = pop(list);
-	verify(thrd);
-
-	verify(list.last_id == kernelTLS.this_processor->id);
-	verify(list.lock);
-
+	[thrd, emptied] = pop(lane);
+
+	/* paranoid */ verify(thrd);
+	/* paranoid */ verify(lane.last_id == kernelTLS.this_processor->id);
+	/* paranoid */ verify(lane.lock);
+
+	// If this was the last element in the lane
 	if(emptied) {
-		__atomic_fetch_sub( &empty.count, 1z, __ATOMIC_SEQ_CST);
-
-		__cfa_readyQ_mask_t word;
-		__cfa_readyQ_mask_t bit;
-		[bit, word] = extract(w);
-		verify((empty.mask[word] & (1ull << bit)) != 0);
-		__attribute__((unused)) bool ret = btr(&empty.mask[word], bit);
-		verify(ret);
-		verify((empty.mask[word] & (1ull << bit)) == 0);
-	}
-
-	verify(list.lock);
+		// Update the global count
+		__atomic_fetch_sub( &used.count, 1z, __ATOMIC_SEQ_CST);
+
+		// Update the bit mask
+		mask_clear((__cfa_readyQ_mask_t *)used.mask, w, STRICT);
+	}
+
+	#if defined(__CFA_WITH_VERIFY__)
+		/* paranoid */ verify(lane.last_id == kernelTLS.this_processor->id);
+		/* paranoid */ lane.last_id = -1u;
+	#endif
+
+	// For statistics, check the count before we release the lock
+	#if !defined(__CFA_NO_STATISTICS__)
+		int num = __atomic_load_n( &used.count, __ATOMIC_RELAXED );
+	#endif
 
 	// Unlock and return
-	list.last_id = -1u;
-	__atomic_unlock(&list.lock);
-	verify(empty.count >= 0);
-
+	__atomic_unlock(&lane.lock);
+
+	// Update statistics
 	#if !defined(__CFA_NO_STATISTICS__)
 		tls.pick.pop.success++;
-		tls.full.value += num;
-		tls.full.count += 1;
-	#endif
-
+		tls.used.value += num;
+		tls.used.count += 1;
+	#endif
+
+	// return the popped thread
 	return thrd;
 }
 
+// Pop from the ready queue from a given cluster
 __attribute__((hot)) thread_desc * pop(struct cluster * cltr) with (cltr->ready_queue) {
-	verify( list.count > 0 );
-	while( __atomic_load_n( &empty.count, __ATOMIC_RELAXED ) != 0) {
+	/* paranoid */ verify( lanes.count > 0 );
+
+	// As long as the list is not empty, try finding a lane that isn't empty and pop from it
+	while( __atomic_load_n( &used.count, __ATOMIC_RELAXED ) != 0) {
 		#if !defined(__CFA_READQ_NO_BITMASK__)
-			tls.pick.pop.maskrds++;
-			unsigned i, j;
-			{
-				#if !defined(__CFA_NO_SCHED_STATS__)
-					tls.pick.pop.maskrds++;
-				#endif
-
-				// Pick two lists at random
-				unsigned num = ((__atomic_load_n( &list.count, __ATOMIC_RELAXED ) - 1) >> 6) + 1;
-
-				unsigned ri = tls_rand();
-				unsigned rj = tls_rand();
-
-				unsigned wdxi = (ri >> 6u) % num;
-				unsigned wdxj = (rj >> 6u) % num;
-
-				size_t maski = __atomic_load_n( &empty.mask[wdxi], __ATOMIC_RELAXED );
-				size_t maskj = __atomic_load_n( &empty.mask[wdxj], __ATOMIC_RELAXED );
-
-				if(maski == 0 && maskj == 0) continue;
-
-				unsigned bi = rand_bit(ri, maski);
-				unsigned bj = rand_bit(rj, maskj);
-
-				verifyf(bi < 64, "%zu %u", maski, bi);
-				verifyf(bj < 64, "%zu %u", maskj, bj);
-
-				i = bi | (wdxi << 6);
-				j = bj | (wdxj << 6);
-
-				verifyf(i < list.count, "%u", wdxi << 6);
-				verifyf(j < list.count, "%u", wdxj << 6);
-			}
-
+			// If using bit masks, count the mask read (tls only exists when statistics are enabled)
+			#if !defined(__CFA_NO_STATISTICS__)
+				tls.pick.pop.maskrds++;
+			#endif
+
+			// Pick two lists at random
+			unsigned ri = tls_rand();
+			unsigned rj = tls_rand();
+
+			// Find which __cfa_readyQ_mask_t the two lists belong
+			unsigned num = ((__atomic_load_n( &lanes.count, __ATOMIC_RELAXED ) - 1) >> 6) + 1;
+			unsigned wdxi = (ri >> 6u) % num;
+			unsigned wdxj = (rj >> 6u) % num;
+
+			// Get the actual __cfa_readyQ_mask_t
+			size_t maski = __atomic_load_n( &used.mask[wdxi], __ATOMIC_RELAXED );
+			size_t maskj = __atomic_load_n( &used.mask[wdxj], __ATOMIC_RELAXED );
+
+			// If both of these masks are empty, retry
+			if(maski == 0 && maskj == 0) continue;
+
+			// Pick one of the non-zero bits in the masks and get the bit indexes
+			unsigned bi = rand_bit(ri, maski);
+			unsigned bj = rand_bit(rj, maskj);
+
+			// some checks
+			/* paranoid */ verifyf(bi < 64, "%zu %u", maski, bi);
+			/* paranoid */ verifyf(bj < 64, "%zu %u", maskj, bj);
+
+			// get the general list index
+			unsigned i = bi | (wdxi << 6);
+			unsigned j = bj | (wdxj << 6);
+
+			// some more checks
+			/* paranoid */ verifyf(i < lanes.count, "%u", wdxi << 6);
+			/* paranoid */ verifyf(j < lanes.count, "%u", wdxj << 6);
+
+			// try popping from the 2 picked lists
 			struct thread_desc * thrd = try_pop(cltr, i, j);
 			if(thrd) return thrd;
 		#else
 			// Pick two lists at random
-			int i = tls_rand() % __atomic_load_n( &list.count, __ATOMIC_RELAXED );
-			int j = tls_rand() % __atomic_load_n( &list.count, __ATOMIC_RELAXED );
-
+			int i = tls_rand() % __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+			int j = tls_rand() % __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+
+			// try popping from the 2 picked lists
 			struct thread_desc * thrd = try_pop(cltr, i, j);
 			if(thrd) return thrd;
@@ -636,4 +761,5 @@
 	}
 
+	// All lanes were empty, return 0p
 	return 0p;
 }
@@ -645,10 +771,10 @@
 		{
 			int idx = 0;
-			for( w ; __cfa_readyQ_mask_size ) {
+			for( w ; __cfa_lane_mask_size ) {
 				for( b ; 8 * sizeof(__cfa_readyQ_mask_t) ) {
-					bool is_empty = idx < list.count ? (ts(list.data[idx]) == 0) : true;
-					bool should_be_empty = 0 == (empty.mask[w] & (1z << b));
+					bool is_empty = idx < lanes.count ? (ts(lanes.data[idx]) == 0) : true;
+					bool should_be_empty = 0 == (used.mask[w] & (1z << b));
 					assertf(should_be_empty == is_empty, "Inconsistent list %d, mask expect : %d, actual is got %d", idx, should_be_empty, (bool)is_empty);
-					assert(__cfa_max_readyQs > idx);
+					assert(__cfa_max_lanes > idx);
 					idx++;
 				}
@@ -657,7 +783,7 @@
 
 		{
-			for( idx ; list.count ) {
-				__intrusive_ready_queue_t & sl = list.data[idx];
-				assert(!list.data[idx].lock);
+			for( idx ; lanes.count ) {
+				__intrusive_lane_t & sl = lanes.data[idx];
+				assert(!lanes.data[idx].lock);
 
 				assert(head(sl)->link.prev == 0p );
@@ -678,10 +804,8 @@
 
 // Call this function of the intrusive list was moved using memcpy
-// fixes the list so that the pointers back to anchors aren't left
-// dangling
-static inline void fix(__intrusive_ready_queue_t & ll) {
-	// if the list is not empty then follow he pointer
-	// and fix its reverse
-	if(ll.before.link.ts != 0l) {
+// fixes the list so that the pointers back to anchors aren't left dangling
+static inline void fix(__intrusive_lane_t & ll) {
+	// if the list is not empty then follow the pointer and fix its reverse
+	if(!is_empty(ll)) {
 		head(ll)->link.next->link.prev = head(ll);
 		tail(ll)->link.prev->link.next = tail(ll);
@@ -689,134 +813,152 @@
 	// Otherwise just reset the list
 	else {
-		tail(ll)->link.next = 0p;
+		verify(tail(ll)->link.next == 0p);
 		tail(ll)->link.prev = head(ll);
 		head(ll)->link.next = tail(ll);
-		head(ll)->link.prev = 0p;
-	}
-}
-
+		verify(head(ll)->link.prev == 0p);
+	}
+}
+
+// Grow the ready queue
 void ready_queue_grow  (struct cluster * cltr) {
+	// Lock the RWlock so no-one pushes/pops while we are changing the queue
 	uint_fast32_t last_size = ready_mutate_lock( *cltr );
+
 	__cfaabi_dbg_print_safe("Kernel : Growing ready queue\n");
-	check( cltr->ready_queue );
-
+
+	// Make sure that everything is consistent
+	/* paranoid */ check( cltr->ready_queue );
+
+	// grow the ready queue
 	with( cltr->ready_queue ) {
-		size_t ncount = list.count;
+		size_t ncount = lanes.count;
 
 		// Check that we have some space left
-		if(ncount + 4 >= __cfa_max_readyQs) abort("Program attempted to create more than maximum number of Ready Queues (%zu)", __cfa_max_readyQs);
-
+		if(ncount + 4 >= __cfa_max_lanes) abort("Program attempted to create more than maximum number of Ready Queues (%zu)", __cfa_max_lanes);
+
+		// increase count
 		ncount += 4;
 
-		// Allocate new array
-		list.data = alloc(list.data, ncount);
+		// Allocate new array (uses realloc and memcpies the data)
+		lanes.data = alloc(lanes.data, ncount);
 
 		// Fix the moved data
-		for( idx; (size_t)list.count ) {
-			fix(list.data[idx]);
+		for( idx; (size_t)lanes.count ) {
+			fix(lanes.data[idx]);
 		}
 
 		// Construct new data
-		for( idx; (size_t)list.count ~ ncount) {
-			(list.data[idx]){};
+		for( idx; (size_t)lanes.count ~ ncount) {
+			(lanes.data[idx]){};
 		}
 
 		// Update original
-		list.count = ncount;
-		// fields in empty don't need to change
+		lanes.count = ncount;
+
+		// fields in 'used' don't need to change when growing
 	}
 
 	// Make sure that everything is consistent
-	check( cltr->ready_queue );
+	/* paranoid */ check( cltr->ready_queue );
+
 	__cfaabi_dbg_print_safe("Kernel : Growing ready queue done\n");
+
+	// Unlock the RWlock
 	ready_mutate_unlock( *cltr, last_size );
 }
 
+// Shrink the ready queue
 void ready_queue_shrink(struct cluster * cltr) {
+	// Lock the RWlock so no-one pushes/pops while we are changing the queue
 	uint_fast32_t last_size = ready_mutate_lock( *cltr );
+
 	__cfaabi_dbg_print_safe("Kernel : Shrinking ready queue\n");
+
+	// Make sure that everything is consistent
+	/* paranoid */ check( cltr->ready_queue );
+
 	with( cltr->ready_queue ) {
+		// Make sure that the total thread count stays the same
 		#if defined(__CFA_WITH_VERIFY__)
 			size_t nthreads = 0;
-			for( idx; (size_t)list.count ) {
-				nthreads += list.data[idx].count;
+			for( idx; (size_t)lanes.count ) {
+				nthreads += lanes.data[idx].count;
 			}
 		#endif
 
-		size_t ocount = list.count;
+		size_t ocount = lanes.count;
 		// Check that we have some space left
 		if(ocount < 8) abort("Program attempted to destroy more Ready Queues than were created");
 
-		list.count -= 4;
+		// reduce the actual count so push doesn't use the old queues
+		lanes.count -= 4;
+		verify(ocount > lanes.count);
+
+		// for printing, count the number of displaced threads
+		#if defined(__CFA_DEBUG_PRINT__)
+			__attribute__((unused)) size_t displaced = 0;
+		#endif
 
 		// redistribute old data
-		verify(ocount > list.count);
-		__attribute__((unused)) size_t displaced = 0;
-		for( idx; (size_t)list.count ~ ocount) {
-			// This is not strictly needed but makes checking invariants much easier
-			bool locked = __atomic_try_acquire(&list.data[idx].lock);
+		for( idx; (size_t)lanes.count ~ ocount) {
+			// Lock is not strictly needed but makes checking invariants much easier
+			bool locked = __atomic_try_acquire(&lanes.data[idx].lock);
 			verify(locked);
-			while(0 != ts(list.data[idx])) {
+
+			// As long as this lane is not empty, pop from it and push the threads somewhere else in the queue
+			while(!is_empty(lanes.data[idx])) {
 				struct thread_desc * thrd;
 				__attribute__((unused)) bool _;
-				[thrd, _] = pop(list.data[idx]);
-				verify(thrd);
+				[thrd, _] = pop(lanes.data[idx]);
+
 				push(cltr, thrd);
-				displaced++;
+
+				// for printing, count the number of displaced threads
+				#if defined(__CFA_DEBUG_PRINT__)
+					displaced++;
+				#endif
 			}
 
-			__atomic_unlock(&list.data[idx].lock);
+			mask_clear((__cfa_readyQ_mask_t *)used.mask, idx, NOCHECK);
+
+			// Unlock the lane
+			__atomic_unlock(&lanes.data[idx].lock);
 
 			// TODO print the queue statistics here
 
-			^(list.data[idx]){};
+			^(lanes.data[idx]){};
 		}
 
 		__cfaabi_dbg_print_safe("Kernel : Shrinking ready queue displaced %zu threads\n", displaced);
 
-		// clear the now unused masks
-		{
-			__cfa_readyQ_mask_t fword, fbit, lword, lbit;
-			[fbit, fword] = extract(ocount);
-			[lbit, lword] = extract(list.count);
-
-			// For now assume that all queues where coverd by the same bitmask
-			// This is highly probable as long as grow and shrink use groups of 4
-			// exclusively
-			verify(fword == lword);
-			__cfa_readyQ_mask_t clears = ~0;
-
-			for( b ; lbit ~ fbit ) {
-				clears ^= 1 << b;
+		// recompute the used.count instead of maintaining it
+		used.count = 0;
+		for( i ; __cfa_lane_mask_size ) {
+			used.count += __builtin_popcountl(used.mask[i]);
+		}
+
+		// Allocate new array (uses realloc and memcpies the data)
+		lanes.data = alloc(lanes.data, lanes.count);
+
+		// Fix the moved data
+		for( idx; (size_t)lanes.count ) {
+			fix(lanes.data[idx]);
+		}
+
+		// Make sure that the total thread count stayed the same
+		#if defined(__CFA_WITH_VERIFY__)
+			for( idx; (size_t)lanes.count ) {
+				nthreads -= lanes.data[idx].count;
 			}
-
-			empty.mask[fword] &= clears;
-
-
-			empty.count = 0;
-			for( i ; 0 ~= lword ) {
-				empty.count += __builtin_popcountl(empty.mask[i]);
-			}
-		}
-
-		// Allocate new array
-		list.data = alloc(list.data, list.count);
-
-		// Fix the moved data
-		for( idx; (size_t)list.count ) {
-			fix(list.data[idx]);
-		}
-
-		#if defined(__CFA_WITH_VERIFY__)
-			for( idx; (size_t)list.count ) {
-				nthreads -= list.data[idx].count;
-			}
-			assertf(nthreads == 0, "Shrinking changed number of threads");
+			verifyf(nthreads == 0, "Shrinking changed number of threads");
 		#endif
 	}
 
 	// Make sure that everything is consistent
-	check( cltr->ready_queue );
+	/* paranoid */ check( cltr->ready_queue );
+
 	__cfaabi_dbg_print_safe("Kernel : Shrinking ready queue done\n");
+
+	// Unlock the RWlock
 	ready_mutate_unlock( *cltr, last_size );
 }
@@ -832,6 +974,6 @@
 	__atomic_fetch_add( &global_stats.pick.pop .success, tls.pick.pop .success, __ATOMIC_SEQ_CST );
 
-	__atomic_fetch_add( &global_stats.full.value, tls.full.value, __ATOMIC_SEQ_CST );
-	__atomic_fetch_add( &global_stats.full.count, tls.full.count, __ATOMIC_SEQ_CST );
+	__atomic_fetch_add( &global_stats.used.value, tls.used.value, __ATOMIC_SEQ_CST );
+	__atomic_fetch_add( &global_stats.used.count, tls.used.count, __ATOMIC_SEQ_CST );
 }
 #endif
