Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 97392b6998d47706885e39064f5f23e53dbe7496)
+++ libcfa/src/concurrency/kernel.hfa	(revision 61d7bec2829dd5abf116ff6e9af14da1d34d58be)
@@ -156,51 +156,18 @@
 
 // Intrusive lanes used by the relaxed ready queue
-struct __attribute__((aligned(128))) __intrusive_lane_t {
-	// spin lock protecting the queue
-	volatile bool lock;
-
-	// anchor for the head and the tail of the queue
-	struct __sentinel_t {
-		// Link lists fields
-		// instrusive link field for threads
-		// must be exactly as in $thread
-		__thread_desc_link link;
-	} before, after;
-
-#if defined(__CFA_WITH_VERIFY__)
-	// id of last processor to acquire the lock
-	// needed only to check for mutual exclusion violations
-	unsigned int last_id;
-
-	// number of items on this list
-	// needed only to check for deadlocks
-	unsigned int count;
-#endif
-
-	// Optional statistic counters
-	#if !defined(__CFA_NO_SCHED_STATS__)
-		struct __attribute__((aligned(64))) {
-			// difference between number of push and pops
-			ssize_t diff;
-
-			// total number of pushes and pops
-			size_t  push;
-			size_t  pop ;
-		} stat;
-	#endif
-};
-
+struct __attribute__((aligned(128))) __intrusive_lane_t;
 void  ?{}(__intrusive_lane_t & this);
 void ^?{}(__intrusive_lane_t & this);
 
-typedef unsigned long long __cfa_readyQ_mask_t;
-
-// enum {
-// 	__cfa_ready_queue_mask_size = (64 - sizeof(size_t)) / sizeof(size_t),
-// 	__cfa_max_ready_queues = __cfa_ready_queue_mask_size * 8 * sizeof(size_t)
-// };
-
-#define __cfa_lane_mask_size ((64 - sizeof(size_t)) / sizeof(__cfa_readyQ_mask_t))
-#define __cfa_max_lanes (__cfa_lane_mask_size * 8 * sizeof(__cfa_readyQ_mask_t))
+// Counter used to check whether or not the lanes are all empty
+struct __attribute__((aligned(128))) __snzi_node_t;
+struct __snzi_t {
+	unsigned mask;
+	int root;
+	__snzi_node_t * nodes;
+};
+
+void  ?{}( __snzi_t & this, unsigned depth );
+void ^?{}( __snzi_t & this );
 
 //TODO adjust cache size to ARCHITECTURE
@@ -209,11 +176,5 @@
 	// Data tracking how many/which lanes are used
 	// Aligned to 128 for cache locality
-	struct {
-		// number of non-empty lanes
-		volatile size_t count;
-
-		// bit mask, set bits indentify which lanes are non-empty
-		volatile __cfa_readyQ_mask_t mask[ __cfa_lane_mask_size ];
-	} used;
+	__snzi_t snzi;
 
 	// Data tracking the actual lanes
@@ -231,5 +192,5 @@
 	// Statistics
 	#if !defined(__CFA_NO_STATISTICS__)
-		__attribute__((aligned(64))) struct {
+		struct __attribute__((aligned(64))) {
 			struct {
 				// Push statistic
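
The header now exposes the indicator only through arrive/depart/query, with the
tree itself private to ready_queue.cfa (below). As a minimal sketch of that
contract, collapsing the tree to a single flat counter; the toy_* names are
illustrative and not part of the CFA API:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <assert.h>

	typedef struct { atomic_long surplus; } toy_snzi;

	static void toy_arrive(toy_snzi * s) { atomic_fetch_add(&s->surplus, 1); }
	static void toy_depart(toy_snzi * s) { atomic_fetch_sub(&s->surplus, 1); }
	static bool toy_query (toy_snzi * s) { return atomic_load(&s->surplus) > 0; }

	int main(void) {
		toy_snzi s = { 0 };
		toy_arrive(&s);           // a lane transitions to non-empty
		assert(  toy_query(&s) ); // ready queue reports non-empty
		toy_depart(&s);           // that lane drains
		assert( !toy_query(&s) );
		return 0;
	}
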
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision 97392b6998d47706885e39064f5f23e53dbe7496)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision 61d7bec2829dd5abf116ff6e9af14da1d34d58be)
@@ -22,4 +22,5 @@
 #define _GNU_SOURCE
 #include "stdlib.hfa"
+#include "math.hfa"
 
 static const size_t cache_line_size = 64;
@@ -51,77 +52,4 @@
 
 	return max_cores_l;
-}
-
-// Picks a random 1 bit in 'mask' according to random number 'rnum'.
-static inline unsigned rand_bit(unsigned rnum, __cfa_readyQ_mask_t mask) {
-#if defined( __i386 )
-	static_assert(sizeof(mask) == 4);
-	unsigned bit = mask ? rnum % __builtin_popcount(mask) : 0;
-	#if !defined(__BMI2__)
-		#error rand_bit not implemented for non __BMI2__ i386
-	#else
-		uint32_t picked = _pdep_u32(1ul << bit, mask);
-		return picked ? __builtin_ctz(picked) : 0;
-	#endif
-#elif defined( __x86_64 )
-	static_assert(sizeof(mask) == 8);
-	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
-	#if !defined(__BMI2__)
-		uint64_t v = mask;   // Input value to find position with rank r.
-		unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
-		unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
-		uint64_t a, b, c, d; // Intermediate temporaries for bit count.
-		unsigned int t;      // Bit count temporary.
-
-		// Do a normal parallel bit count for a 64-bit integer,
-		// but store all intermediate steps.
-		a =  v - ((v >> 1) & ~0UL/3);
-		b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
-		c = (b + (b >> 4)) & ~0UL/0x11;
-		d = (c + (c >> 8)) & ~0UL/0x101;
-
-
-		t = (d >> 32) + (d >> 48);
-		// Now do branchless select!
-		s  = 64;
-		s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
-		t  = (d >> (s - 16)) & 0xff;
-		s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
-		t  = (c >> (s - 8)) & 0xf;
-		s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
-		t  = (b >> (s - 4)) & 0x7;
-		s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
-		t  = (a >> (s - 2)) & 0x3;
-		s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
-		t  = (v >> (s - 1)) & 0x1;
-		s -= ((t - r) & 256) >> 8;
-		return s - 1;
-	#else
-		uint64_t picked = _pdep_u64(1ul << bit, mask);
-		return picked ? __builtin_ctzl(picked) : 0;
-	#endif
-#elif defined( __ARM_ARCH )
-	#error rand_bit not implemented for arm
-#else
-	#error uknown hardware architecture
-#endif
-}
-
-
-//-----------------------------------------------------------------------------
-// Helpers used by extract
-// (_mask_bitsidx() & X) returns a bit index valid for a __cfa_readyQ_mask_t, where X is any integer
-static inline __cfa_readyQ_mask_t _mask_bitsidx () __attribute__ ((const)) { return (8 * sizeof(__cfa_readyQ_mask_t)) - 1; }
-
-// (X >> _mask_shiftidx()) retuns an index into an array of __cfa_readyQ_mask_t
-static inline __cfa_readyQ_mask_t _mask_shiftidx() __attribute__ ((const)) { return (8 * sizeof(__cfa_readyQ_mask_t)) - __builtin_clzl(_mask_bitsidx()); }
-
-
-// Assuming a large bit mask represented as an array of __cfa_readyQ_mask_t
-// Given an index into the large mask, returns the bit index and which __cfa_readyQ_mask_t index in the array
-static inline [__cfa_readyQ_mask_t, __cfa_readyQ_mask_t] extract(__cfa_readyQ_mask_t idx) {
-	__cfa_readyQ_mask_t word = idx >> _mask_shiftidx();
-	__cfa_readyQ_mask_t bit  = idx &  _mask_bitsidx();
-	return [bit, word];
 }
 
@@ -247,4 +175,43 @@
 // Intrusive Queue used by ready queue
 //=======================================================================
+// Intrusive lanes used by the relaxed ready queue
+struct __attribute__((aligned(128))) __intrusive_lane_t {
+	// spin lock protecting the queue
+	volatile bool lock;
+
+	// anchor for the head and the tail of the queue
+	struct __sentinel_t {
+		// Linked list fields
+		// intrusive link field for threads
+		// must be exactly as in $thread
+		__thread_desc_link link;
+	} before, after;
+
+#if defined(__CFA_WITH_VERIFY__)
+	// id of last processor to acquire the lock
+	// needed only to check for mutual exclusion violations
+	unsigned int last_id;
+
+	// number of items on this list
+	// needed only to check for deadlocks
+	unsigned int count;
+#endif
+
+	// Optional statistic counters
+	#if !defined(__CFA_NO_SCHED_STATS__)
+		struct __attribute__((aligned(64))) {
+			// difference between the number of pushes and pops
+			ssize_t diff;
+
+			// total number of pushes and pops
+			size_t  push;
+			size_t  pop ;
+		} stat;
+	#endif
+};
+
+void  ?{}(__intrusive_lane_t & this);
+void ^?{}(__intrusive_lane_t & this);
+
 // Get the head pointer (one before the first element) from the anchor
 static inline $thread * head(const __intrusive_lane_t & this) {
@@ -303,7 +270,4 @@
 	/* paranoid */ verify(__alignof__(this) == 128);
 	/* paranoid */ verifyf(((intptr_t)(&this) % 128) == 0, "Expected address to be aligned %p %% 128 == %zd", &this, ((intptr_t)(&this) % 128));
-
-	/* paranoid */ verifyf(_mask_shiftidx() == 6 , "%llu", _mask_shiftidx());
-	/* paranoid */ verifyf(_mask_bitsidx () == 63, "%llu", _mask_bitsidx());
 }
 
@@ -430,4 +394,159 @@
 	// Cannot verify here since it may not be locked
 	return this.before.link.ts;
+}
+
+//=======================================================================
+// Scalable Non-Zero counter
+//=======================================================================
+
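+// Node state is packed into a single 64-bit word so the counter and a
+// 56-bit version number can be swapped together with one CAS; the
+// version protects the zero/half transitions against ABA.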
+union __snzi_val_t {
+	uint64_t _all;
+	struct __attribute__((packed)) {
+		char cnt;
+		uint64_t ver:56;
+	};
+};
+
+bool cas(volatile __snzi_val_t & self, __snzi_val_t & exp, char _cnt, uint64_t _ver) {
+	__snzi_val_t t;
+	t.ver = _ver;
+	t.cnt = _cnt;
+	/* paranoid */ verify(t._all == ((_ver << 8) | ((unsigned char)_cnt)));
+	return __atomic_compare_exchange_n(&self._all, &exp._all, t._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+}
+
+bool cas(volatile __snzi_val_t & self, __snzi_val_t & exp, const __snzi_val_t & tar) {
+	return __atomic_compare_exchange_n(&self._all, &exp._all, tar._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+}
+
+void ?{}( __snzi_val_t & this ) { this._all = 0; }
+void ?{}( __snzi_val_t & this, const volatile __snzi_val_t & o) { this._all = o._all; }
+
+struct __attribute__((aligned(128))) __snzi_node_t {
+	volatile __snzi_val_t value;
+	struct __snzi_node_t * parent;
+	bool is_root;
+};
+
+static inline void arrive( __snzi_node_t & );
+static inline void depart( __snzi_node_t & );
+
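+// Sentinel count marking the intermediate state of the SNZI protocol:
+// an arrival is underway but not yet reflected at the parent.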
+#define __snzi_half -1
+
+//--------------------------------------------------
+// Root node
+static void arrive_r( __snzi_node_t & this ) {
+	/* paranoid */ verify( this.is_root );
+	__atomic_fetch_add(&this.value._all, 1, __ATOMIC_SEQ_CST);
+}
+
+static void depart_r( __snzi_node_t & this ) {
+	/* paranoid */ verify( this.is_root );
+	__atomic_fetch_sub(&this.value._all, 1, __ATOMIC_SEQ_CST);
+}
+
+//--------------------------------------------------
+// Hierarchical node
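+// Arrivals follow the SNZI protocol of Ellen et al.: a node at zero
+// first moves to the intermediate "half" state (bumping the version),
+// arrives at its parent, then settles at 1. Parent arrivals made by
+// attempts that lost the final CAS are undone once the loop succeeds.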
+static void arrive_h( __snzi_node_t & this ) {
+	int undoArr = 0;
+	bool success = false;
+	while(!success) {
+		__snzi_val_t x = { this.value };
+		/* paranoid */ verify(x.cnt <= 120);
+		if( x.cnt >= 1 ) {
+			if( cas( this.value, x, x.cnt + 1, x.ver ) ) {
+				success = true;
+			}
+		}
+		/* paranoid */ verify(x.cnt <= 120);
+		if( x.cnt == 0 ) {
+			if( cas( this.value, x, __snzi_half, x.ver + 1) ) {
+				success = true;
+				x.cnt = __snzi_half;
+				x.ver = x.ver + 1;
+			}
+		}
+		/* paranoid */ verify(x.cnt <= 120);
+		if( x.cnt == __snzi_half ) {
+			/* paranoid */ verify( this.parent);
+			arrive( *this.parent );
+			if( !cas( this.value, x, 1, x.ver) ) {
+				undoArr = undoArr + 1;
+			}
+		}
+	}
+
+	for(int i = 0; i < undoArr; i++) {
+		/* paranoid */ verify( this.parent );
+		depart( *this.parent );
+	}
+}
+
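+// Departures simply decrement; the departer that takes the count from
+// 1 to 0 propagates the departure to its parent.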
+static void depart_h( __snzi_node_t & this ) {
+	while(true) {
+		const __snzi_val_t x = { this.value };
+		/* paranoid */ verifyf(x.cnt >= 1, "%d", x.cnt);
+		if( cas( this.value, x, x.cnt - 1, x.ver ) ) {
+			if( x.cnt == 1 ) {
+				/* paranoid */ verify( this.parent );
+				depart( *this.parent );
+			}
+			return;
+		}
+	}
+}
+
+//--------------------------------------------------
+// All nodes
+static inline void arrive( __snzi_node_t & this ) {
+	if(this.is_root) arrive_r( this );
+	else arrive_h( this );
+}
+
+static inline void depart( __snzi_node_t & this ) {
+	if(this.is_root) depart_r( this );
+	else depart_h( this );
+}
+
+static inline bool query( __snzi_node_t & this ) {
+	/* paranoid */ verify( this.is_root );
+	return this.value._all > 0;
+}
+
+//--------------------------------------------------
+// SNZI object
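+// The tree lives leaves-first in a flat array: the 2^depth leaves sit
+// in slots [0, 2^depth), every non-root node i has its parent at
+// (i / 2) + 2^depth, and the root occupies the last slot.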
+void  ?{}( __snzi_t & this, unsigned depth ) with( this ) {
+	mask = (1 << depth) - 1;
+	root = (1 << (depth + 1)) - 2;
+	nodes = alloc( root + 1 );
+
+	int width = 1 << depth;
+	for(int i = 0; i < root; i++) {
+		nodes[i].value._all = 0;
+		nodes[i].parent = &nodes[(i / 2) + width ];
+		nodes[i].is_root = false;
+	}
+
+	nodes[ root ].value._all = 0;
+	nodes[ root ].parent = 0p;
+	nodes[ root ].is_root = true;
+}
+
+void ^?{}( __snzi_t & this ) {
+	free( this.nodes );
+}
+
+static inline void arrive( __snzi_t & this, int idx) {
+	idx &= this.mask;
+	arrive( this.nodes[idx] );
+}
+
+static inline void depart( __snzi_t & this, int idx) {
+	idx &= this.mask;
+	depart( this.nodes[idx] );
+}
+
+static inline bool query( const __snzi_t & this ) {
+	return query( this.nodes[ this.root ] );
 }
 
@@ -466,8 +585,4 @@
 
 void ?{}(__ready_queue_t & this) with (this) {
-	used.count = 0;
-	for( i ; __cfa_lane_mask_size ) {
-		used.mask[i] = 0;
-	}
 
 	lanes.data = alloc(4);
@@ -476,4 +591,5 @@
 	}
 	lanes.count = 4;
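+	// Size the snzi tree so that roughly eight lanes share each leaf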
+	snzi{ log2( lanes.count / 8 ) };
 
 	#if !defined(__CFA_NO_STATISTICS__)
@@ -491,5 +607,7 @@
 void ^?{}(__ready_queue_t & this) with (this) {
 	verify( 4  == lanes.count );
-	verify( 0  == used .count );
+	verify( !query( snzi ) );
+
+	^(snzi){};
 
 	for( i; 4 ) {
@@ -497,86 +615,9 @@
 	}
 	free(lanes.data);
-
-
-	#if defined(__CFA_WITH_VERIFY__)
-		for( i ; __cfa_lane_mask_size ) {
-			assert( 0 == used.mask[i] );
-		}
-	#endif
-}
-
-//-----------------------------------------------------------------------
-enum mask_strictness {
-	STRICT,
-	NOCHECK
-};
-
-// Set a given bit in the bit mask array
-// strictness determines of the bit had to be cleared before
-static inline void mask_set(__cfa_readyQ_mask_t * mask, unsigned index, mask_strictness strict) {
-	// Extract the array and bit indexes
-	__cfa_readyQ_mask_t word;
-	__cfa_readyQ_mask_t bit;
-	[bit, word] = extract(index);
-
-	__cfadbg_print_safe(ready_queue, "Kernel : Ready queue extracted index %u as [bit %llu, word %llu]\n", index, bit, word);
-
-	// Conditional check
-	verifyf(
-		strict != STRICT || // Conditional check if it was expected to be cleared
-		((mask[word] & (1ull << bit)) == 0),
-		"Before set %llu:%llu (%u), %llx & %llx", word, bit, index, mask[word], (1ull << bit)
-	);
-
-	// Atomically set the bit
-	__attribute__((unused)) bool ret = __atomic_bts(&mask[word], bit);
-
-	// Conditional check
-	verifyf(
-		strict != STRICT || // Conditional check if it was expected to be cleared
-		!ret,
-		"Bit was not set but bts returned true"
-	);
-
-	// Unconditional check
-	verifyf(
-		(mask[word] & (1ull << bit)) != 0,
-		"After set %llu:%llu (%u), %llx & %llx", word, bit, index, mask[word], (1ull << bit)
-	);
-}
-
-static inline void mask_clear(__cfa_readyQ_mask_t * mask, unsigned index, mask_strictness strict) {
-	// Extract the array and bit indexes
-	__cfa_readyQ_mask_t word;
-	__cfa_readyQ_mask_t bit;
-	[bit, word] = extract(index);
-
-	// Conditional check
-	verifyf(
-		strict != STRICT || // Conditional check if it was expected to be set
-		((mask[word] & (1ull << bit)) != 0),
-		"Before clear %llu:%llu (%u), %llx & %llx", word, bit, index, mask[word], (1ull << bit)
-	);
-
-	// Atomically clear the bit
-	__attribute__((unused)) bool ret = __atomic_btr(&mask[word], bit);
-
-	// Conditional check
-	verifyf(
-		strict != STRICT || // Conditional check if it was expected to be cleared
-		ret,
-		"Bit was set but btr returned false"
-	);
-
-	// Unconditional check
-	verifyf(
-		(mask[word] & (1ull << bit)) == 0,
-		"After clear %llu:%llu (%u), %llx & %llx", word, bit, index, mask[word], (1ull << bit)
-	);
 }
 
 //-----------------------------------------------------------------------
 __attribute__((hot)) bool push(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) {
-	__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p (mask %llu)\n", thrd, cltr, used.mask[0]);
+	__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
 
 	// write timestamp
@@ -601,5 +642,4 @@
 	#endif
 
-	__attribute__((unused)) size_t num = __atomic_load_n( &used.count, __ATOMIC_RELAXED );
 	bool first = false;
 
@@ -609,16 +649,12 @@
 	// If this lane used to be empty we need to do more
 	if(lane_first) {
-		// Update the bit mask
-		mask_set((__cfa_readyQ_mask_t *)used.mask, i, STRICT);
-
-		// Update the global count
-		size_t ret = __atomic_fetch_add( &used.count, 1z, __ATOMIC_SEQ_CST);
-
 		// Check if the entire queue used to be empty
-		first = (ret == 0);
+		first = !query(snzi);
+
+		// Update the snzi
+		arrive( snzi, i );
 	}
 
 	#if defined(__CFA_WITH_VERIFY__)
-		/* paranoid */ verifyf( used.count <= lanes.count, "Non-empty count (%zu) exceeds actual count (%zu)\n", used.count, lanes.count );
 		/* paranoid */ verifyf( lanes.data[i].last_id == kernelTLS.this_processor->id, "Expected last processor to lock queue %u to be %u, was %u\n", i, lanes.data[i].last_id, kernelTLS.this_processor->id );
 		/* paranoid */ verifyf( lanes.data[i].lock, "List %u is not locked\n", i );
@@ -634,6 +670,4 @@
 	#if !defined(__CFA_NO_STATISTICS__)
 		tls.pick.push.success++;
-		tls.used.value += num;
-		tls.used.count += 1;
 	#endif
 
@@ -692,9 +726,5 @@
 	// If this was the last element in the lane
 	if(emptied) {
-		// Update the global count
-		__atomic_fetch_sub( &used.count, 1z, __ATOMIC_SEQ_CST);
-
-		// Update the bit mask
-		mask_clear((__cfa_readyQ_mask_t *)used.mask, w, STRICT);
+		depart( snzi, w );
 	}
 
@@ -704,9 +734,4 @@
 	#endif
 
-	// For statistics, check the count before we release the lock
-	#if !defined(__CFA_NO_STATISTICS__)
-		int num = __atomic_load_n( &used.count, __ATOMIC_RELAXED );
-	#endif
-
 	// Unlock and return
 	__atomic_unlock(&lane.lock);
@@ -715,6 +740,4 @@
 	#if !defined(__CFA_NO_STATISTICS__)
 		tls.pick.pop.success++;
-		tls.used.value += num;
-		tls.used.count += 1;
 	#endif
 
@@ -728,55 +751,12 @@
 
 	// As long as the list is not empty, try finding a lane that isn't empty and pop from it
-	while( __atomic_load_n( &used.count, __ATOMIC_RELAXED ) != 0) {
-		#if !defined(__CFA_READQ_NO_BITMASK__)
-			// If using bit masks
-			#if !defined(__CFA_NO_SCHED_STATS__)
-				tls.pick.pop.maskrds++;
-			#endif
-
-			// Pick two lists at random
-			unsigned ri = __tls_rand();
-			unsigned rj = __tls_rand();
-
-			// Find which __cfa_readyQ_mask_t the two lists belong
-			unsigned num = ((__atomic_load_n( &lanes.count, __ATOMIC_RELAXED ) - 1) >> 6) + 1;
-			unsigned wdxi = (ri >> 6u) % num;
-			unsigned wdxj = (rj >> 6u) % num;
-
-			// Get the actual __cfa_readyQ_mask_t
-			size_t maski = __atomic_load_n( &used.mask[wdxi], __ATOMIC_RELAXED );
-			size_t maskj = __atomic_load_n( &used.mask[wdxj], __ATOMIC_RELAXED );
-
-			// If both of these masks are empty, retry
-			if(maski == 0 && maskj == 0) continue;
-
-			// Pick one of the non-zero bits in the masks and get the bit indexes
-			unsigned bi = rand_bit(ri, maski);
-			unsigned bj = rand_bit(rj, maskj);
-
-			// some checks
-			/* paranoid */ verifyf(bi < 64, "%zu %u", maski, bi);
-			/* paranoid */ verifyf(bj < 64, "%zu %u", maskj, bj);
-
-			// get the general list index
-			unsigned i = bi | (wdxi << 6);
-			unsigned j = bj | (wdxj << 6);
-
-			// some more checks
-			/* paranoid */ verifyf(i < lanes.count, "%u", wdxi << 6);
-			/* paranoid */ verifyf(j < lanes.count, "%u", wdxj << 6);
-
-			// try popping from the 2 picked lists
-			struct $thread * thrd = try_pop(cltr, i, j);
-			if(thrd) return thrd;
-		#else
-			// Pick two lists at random
-			int i = __tls_rand() % __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
-			int j = __tls_rand() % __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
-
-			// try popping from the 2 picked lists
-			struct $thread * thrd = try_pop(cltr, i, j);
-			if(thrd) return thrd;
-		#endif
+	while( query(snzi) ) {
+		// Pick two lists at random
+		int i = __tls_rand() % __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+		int j = __tls_rand() % __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+
+		// try popping from the 2 picked lists
+		struct $thread * thrd = try_pop(cltr, i, j);
+		if(thrd) return thrd;
 	}
 
@@ -789,17 +769,4 @@
 static void check( __ready_queue_t & q ) with (q) {
 	#if defined(__CFA_WITH_VERIFY__)
-		{
-			int idx = 0;
-			for( w ; __cfa_lane_mask_size ) {
-				for( b ; 8 * sizeof(__cfa_readyQ_mask_t) ) {
-					bool is_empty = idx < lanes.count ? (ts(lanes.data[idx]) == 0) : true;
-					bool should_be_empty = 0 == (used.mask[w] & (1z << b));
-					assertf(should_be_empty == is_empty, "Inconsistent list %d, mask expect : %d, actual is got %d", idx, should_be_empty, (bool)is_empty);
-					assert(__cfa_max_lanes > idx);
-					idx++;
-				}
-			}
-		}
-
 		{
 			for( idx ; lanes.count ) {
@@ -853,8 +820,7 @@
 	// grow the ready queue
 	with( cltr->ready_queue ) {
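+		// The snzi depth depends on the lane count, so tear it down
+		// here and rebuild it once resizing is done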
+		^(snzi){};
+
 		size_t ncount = lanes.count;
-
-		// Check that we have some space left
-		if(ncount + 4 >= __cfa_max_lanes) abort("Program attempted to create more than maximum number of Ready Queues (%zu)", __cfa_max_lanes);
 
 		// increase count
@@ -877,5 +843,11 @@
 		lanes.count = ncount;
 
-		// fields in 'used' don't need to change when growing
+		// Re-create the snzi
+		snzi{ log2( lanes.count / 8 ) };
+		for( idx; (size_t)lanes.count ) {
+			if( !is_empty(lanes.data[idx]) ) {
+				arrive(snzi, idx);
+			}
+		}
 	}
 
@@ -900,4 +872,6 @@
 
 	with( cltr->ready_queue ) {
+		^(snzi){};
+
 		// Make sure that the total thread count stays the same
 		#if defined(__CFA_WITH_VERIFY__)
@@ -941,6 +915,4 @@
 			}
 
-			mask_clear((__cfa_readyQ_mask_t *)used.mask, idx, NOCHECK);
-
 			// Unlock the lane
 			__atomic_unlock(&lanes.data[idx].lock);
@@ -952,10 +924,4 @@
 
 		__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue displaced %zu threads\n", displaced);
-
-		// recompute the used.count instead of maintaining it
-		used.count = 0;
-		for( i ; __cfa_lane_mask_size ) {
-			used.count += __builtin_popcountl(used.mask[i]);
-		}
 
 		// Allocate new array (uses realloc and memcpies the data)
@@ -965,4 +931,12 @@
 		for( idx; (size_t)lanes.count ) {
 			fix(lanes.data[idx]);
+		}
+
+		// Re-create the snzi
+		snzi{ log2( lanes.count / 8 ) };
+		for( idx; (size_t)lanes.count ) {
+			if( !is_empty(lanes.data[idx]) ) {
+				arrive(snzi, idx);
+			}
 		}
 
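
For concreteness, the index arithmetic used by the snzi constructor can be
checked in isolation; a standalone sketch in plain C, mirroring (not part of)
the CFA code, for depth 2:

	#include <assert.h>

	int main(void) {
		const unsigned depth = 2;
		const unsigned mask  = (1u << depth) - 1;       // leaf = lane & mask
		const int      width = 1 << depth;              // 4 leaves
		const int      root  = (1 << (depth + 1)) - 2;  // root at index 6

		int parent[6];
		for (int i = 0; i < root; i++)
			parent[i] = (i / 2) + width;                // leaves-first layout

		// leaves 0..3 feed nodes 4..5, which feed the root at 6
		assert( parent[0] == 4 && parent[1] == 4 );
		assert( parent[2] == 5 && parent[3] == 5 );
		assert( parent[4] == 6 && parent[5] == 6 );
		assert( (5 & mask) == 1 );                      // lane 5 maps to leaf 1
		return 0;
	}
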
