Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision fde879b3d343755202ac61bdeddc27c27ddd1e9e)
+++ libcfa/src/concurrency/kernel.cfa	(revision 12daa4357a10e875ac973c95a27189ab57b7d175)
@@ -280,5 +280,5 @@
 
 				// Spin a little on I/O, just in case
-					for(5) {
+				for(5) {
 					__maybe_io_drain( this );
 					readyThread = pop_fast( this->cltr );
@@ -287,5 +287,5 @@
 
 				// no luck, try stealing a few times
-					for(5) {
+				for(5) {
 					if( __maybe_io_drain( this ) ) {
 						readyThread = pop_fast( this->cltr );
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision fde879b3d343755202ac61bdeddc27c27ddd1e9e)
+++ libcfa/src/concurrency/kernel.hfa	(revision 12daa4357a10e875ac973c95a27189ab57b7d175)
@@ -66,4 +66,5 @@
 		unsigned id;
 		unsigned target;
+		unsigned last;
 		unsigned long long int cutoff;
 	} rdq;
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision fde879b3d343755202ac61bdeddc27c27ddd1e9e)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision 12daa4357a10e875ac973c95a27189ab57b7d175)
@@ -541,4 +541,5 @@
 	this.rdq.id  = -1u;
 	this.rdq.target = -1u;
+	this.rdq.last = -1u;
 	this.rdq.cutoff = 0ull;
 	do_terminate = false;
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision fde879b3d343755202ac61bdeddc27c27ddd1e9e)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision 12daa4357a10e875ac973c95a27189ab57b7d175)
@@ -20,8 +20,9 @@
 
 
-#define USE_RELAXED_FIFO
+// #define USE_RELAXED_FIFO
 // #define USE_WORK_STEALING
 
 #include "bits/defs.hfa"
+#include "device/cpu.hfa"
 #include "kernel_private.hfa"
 
@@ -47,5 +48,7 @@
 #endif
 
-#if   defined(USE_RELAXED_FIFO)
+#if   defined(USE_CPU_WORK_STEALING)
+	#define READYQ_SHARD_FACTOR 2
+#elif defined(USE_RELAXED_FIFO)
 	#define BIAS 4
 	#define READYQ_SHARD_FACTOR 4
@@ -215,11 +218,25 @@
 //=======================================================================
 void ?{}(__ready_queue_t & this) with (this) {
-	lanes.data  = 0p;
-	lanes.tscs  = 0p;
-	lanes.count = 0;
+	#if defined(USE_CPU_WORK_STEALING)
+		lanes.count = cpu_info.hthrd_count * READYQ_SHARD_FACTOR;
+		lanes.data = alloc( lanes.count );
+		lanes.tscs = alloc( lanes.count );
+
+		for( idx; (size_t)lanes.count ) {
+			(lanes.data[idx]){};
+			lanes.tscs[idx].tv = rdtscl();
+		}
+	#else
+		lanes.data  = 0p;
+		lanes.tscs  = 0p;
+		lanes.count = 0;
+	#endif
 }
 
 void ^?{}(__ready_queue_t & this) with (this) {
-	verify( SEQUENTIAL_SHARD == lanes.count );
+	#if !defined(USE_CPU_WORK_STEALING)
+		verify( SEQUENTIAL_SHARD == lanes.count );
+	#endif
+
 	free(lanes.data);
 	free(lanes.tscs);
@@ -227,4 +244,108 @@
 
 //-----------------------------------------------------------------------
+#if defined(USE_CPU_WORK_STEALING)
+	__attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd, bool push_local) with (cltr->ready_queue) {
+		__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
+
+		processor * const proc = kernelTLS().this_processor;
+		const bool external = !push_local || (!proc) || (cltr != proc->cltr);
+
+		const int cpu = __kernel_getcpu();
+		/* paranoid */ verify(cpu >= 0);
+		/* paranoid */ verify(cpu < cpu_info.hthrd_count);
+		/* paranoid */ verify(cpu * READYQ_SHARD_FACTOR < lanes.count);
+
+		const int start = cpu * READYQ_SHARD_FACTOR;
+		unsigned i;
+		do {
+			unsigned r;
+			if(unlikely(external)) { r = __tls_rand(); }
+			else { r = proc->rdq.its++; }
+			i = start + (r % READYQ_SHARD_FACTOR);
+			// If we can't lock it retry
+		} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+
+		// Actually push it
+		push(lanes.data[i], thrd);
+
+		// Unlock and return
+		__atomic_unlock( &lanes.data[i].lock );
+
+		#if !defined(__CFA_NO_STATISTICS__)
+			if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED);
+			else __tls_stats()->ready.push.local.success++;
+		#endif
+
+		__cfadbg_print_safe(ready_queue, "Kernel : Pushed %p on cluster %p (idx: %u, mask %llu, first %d)\n", thrd, cltr, i, used.mask[0], lane_first);
+
+	}
+
+	// Pop from the ready queue from a given cluster
+	__attribute__((hot)) $thread * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
+		/* paranoid */ verify( lanes.count > 0 );
+		/* paranoid */ verify( kernelTLS().this_processor );
+
+		const int cpu = __kernel_getcpu();
+		/* paranoid */ verify(cpu >= 0);
+		/* paranoid */ verify(cpu * READYQ_SHARD_FACTOR < lanes.count);
+		/* paranoid */ verify(cpu < cpu_info.hthrd_count);
+
+		processor * const proc = kernelTLS().this_processor;
+		const int start = cpu * READYQ_SHARD_FACTOR;
+
+		// Did we already have a help target
+		if(proc->rdq.target == -1u) {
+			// if We don't have a
+			unsigned long long min = ts(lanes.data[start]);
+			for(i; READYQ_SHARD_FACTOR) {
+				unsigned long long tsc = ts(lanes.data[start + i]);
+				if(tsc < min) min = tsc;
+			}
+			proc->rdq.cutoff = min;
+			proc->rdq.target = __tls_rand() % lanes.count;
+		}
+		else {
+			const unsigned long long bias = 0; //2_500_000_000;
+			const unsigned long long cutoff = proc->rdq.cutoff > bias ? proc->rdq.cutoff - bias : proc->rdq.cutoff;
+			{
+				unsigned target = proc->rdq.target;
+				proc->rdq.target = -1u;
+				if(lanes.tscs[target].tv < cutoff && ts(lanes.data[target]) < cutoff) {
+					$thread * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help));
+					proc->rdq.last = target;
+					if(t) return t;
+				}
+			}
+
+			unsigned last = proc->rdq.last;
+			if(last != -1u && lanes.tscs[last].tv < cutoff && ts(lanes.data[last]) < cutoff) {
+				$thread * t = try_pop(cltr, last __STATS(, __tls_stats()->ready.pop.help));
+				if(t) return t;
+			}
+			else {
+				proc->rdq.last = -1u;
+			}
+		}
+
+		for(READYQ_SHARD_FACTOR) {
+			unsigned i = start + (proc->rdq.itr++ % READYQ_SHARD_FACTOR);
+			if($thread * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t;
+		}
+
+		// All lanes where empty return 0p
+		return 0p;
+	}
+
+	__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
+		processor * const proc = kernelTLS().this_processor;
+		unsigned last = proc->rdq.last;
+
+		unsigned i = __tls_rand() % lanes.count;
+		return try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.steal));
+	}
+	__attribute__((hot)) struct $thread * pop_search(struct cluster * cltr) {
+		return search(cltr);
+	}
+#endif
 #if defined(USE_RELAXED_FIFO)
 	//-----------------------------------------------------------------------
@@ -580,128 +701,134 @@
 }
 
-// Grow the ready queue
-void ready_queue_grow(struct cluster * cltr) {
-	size_t ncount;
-	int target = cltr->procs.total;
-
-	/* paranoid */ verify( ready_mutate_islocked() );
-	__cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue\n");
-
-	// Make sure that everything is consistent
-	/* paranoid */ check( cltr->ready_queue );
-
-	// grow the ready queue
-	with( cltr->ready_queue ) {
-		// Find new count
-		// Make sure we always have atleast 1 list
-		if(target >= 2) {
-			ncount = target * READYQ_SHARD_FACTOR;
-		} else {
-			ncount = SEQUENTIAL_SHARD;
-		}
-
-		// Allocate new array (uses realloc and memcpies the data)
-		lanes.data = alloc( ncount, lanes.data`realloc );
-
-		// Fix the moved data
-		for( idx; (size_t)lanes.count ) {
-			fix(lanes.data[idx]);
-		}
-
-		// Construct new data
-		for( idx; (size_t)lanes.count ~ ncount) {
-			(lanes.data[idx]){};
-		}
-
-		// Update original
-		lanes.count = ncount;
-	}
-
-	fix_times(cltr);
-
-	reassign_cltr_id(cltr);
-
-	// Make sure that everything is consistent
-	/* paranoid */ check( cltr->ready_queue );
-
-	__cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue done\n");
-
-	/* paranoid */ verify( ready_mutate_islocked() );
-}
-
-// Shrink the ready queue
-void ready_queue_shrink(struct cluster * cltr) {
-	/* paranoid */ verify( ready_mutate_islocked() );
-	__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue\n");
-
-	// Make sure that everything is consistent
-	/* paranoid */ check( cltr->ready_queue );
-
-	int target = cltr->procs.total;
-
-	with( cltr->ready_queue ) {
-		// Remember old count
-		size_t ocount = lanes.count;
-
-		// Find new count
-		// Make sure we always have atleast 1 list
-		lanes.count = target >= 2 ? target * READYQ_SHARD_FACTOR: SEQUENTIAL_SHARD;
-		/* paranoid */ verify( ocount >= lanes.count );
-		/* paranoid */ verify( lanes.count == target * READYQ_SHARD_FACTOR || target < 2 );
-
-		// for printing count the number of displaced threads
-		#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
-			__attribute__((unused)) size_t displaced = 0;
-		#endif
-
-		// redistribute old data
-		for( idx; (size_t)lanes.count ~ ocount) {
-			// Lock is not strictly needed but makes checking invariants much easier
-			__attribute__((unused)) bool locked = __atomic_try_acquire(&lanes.data[idx].lock);
-			verify(locked);
-
-			// As long as we can pop from this lane to push the threads somewhere else in the queue
-			while(!is_empty(lanes.data[idx])) {
-				struct $thread * thrd;
-				unsigned long long _;
-				[thrd, _] = pop(lanes.data[idx]);
-
-				push(cltr, thrd, true);
-
-				// for printing count the number of displaced threads
-				#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
-					displaced++;
-				#endif
-			}
-
-			// Unlock the lane
-			__atomic_unlock(&lanes.data[idx].lock);
-
-			// TODO print the queue statistics here
-
-			^(lanes.data[idx]){};
-		}
-
-		__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue displaced %zu threads\n", displaced);
-
-		// Allocate new array (uses realloc and memcpies the data)
-		lanes.data = alloc( lanes.count, lanes.data`realloc );
-
-		// Fix the moved data
-		for( idx; (size_t)lanes.count ) {
-			fix(lanes.data[idx]);
-		}
-	}
-
-	fix_times(cltr);
-
-	reassign_cltr_id(cltr);
-
-	// Make sure that everything is consistent
-	/* paranoid */ check( cltr->ready_queue );
-
-	__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue done\n");
-	/* paranoid */ verify( ready_mutate_islocked() );
-}
+#if defined(USE_CPU_WORK_STEALING)
+	// ready_queue size is fixed in this case
+	void ready_queue_grow(struct cluster * cltr) {}
+	void ready_queue_shrink(struct cluster * cltr) {}
+#else
+	// Grow the ready queue
+	void ready_queue_grow(struct cluster * cltr) {
+		size_t ncount;
+		int target = cltr->procs.total;
+
+		/* paranoid */ verify( ready_mutate_islocked() );
+		__cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue\n");
+
+		// Make sure that everything is consistent
+		/* paranoid */ check( cltr->ready_queue );
+
+		// grow the ready queue
+		with( cltr->ready_queue ) {
+			// Find new count
+			// Make sure we always have atleast 1 list
+			if(target >= 2) {
+				ncount = target * READYQ_SHARD_FACTOR;
+			} else {
+				ncount = SEQUENTIAL_SHARD;
+			}
+
+			// Allocate new array (uses realloc and memcpies the data)
+			lanes.data = alloc( ncount, lanes.data`realloc );
+
+			// Fix the moved data
+			for( idx; (size_t)lanes.count ) {
+				fix(lanes.data[idx]);
+			}
+
+			// Construct new data
+			for( idx; (size_t)lanes.count ~ ncount) {
+				(lanes.data[idx]){};
+			}
+
+			// Update original
+			lanes.count = ncount;
+		}
+
+		fix_times(cltr);
+
+		reassign_cltr_id(cltr);
+
+		// Make sure that everything is consistent
+		/* paranoid */ check( cltr->ready_queue );
+
+		__cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue done\n");
+
+		/* paranoid */ verify( ready_mutate_islocked() );
+	}
+
+	// Shrink the ready queue
+	void ready_queue_shrink(struct cluster * cltr) {
+		/* paranoid */ verify( ready_mutate_islocked() );
+		__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue\n");
+
+		// Make sure that everything is consistent
+		/* paranoid */ check( cltr->ready_queue );
+
+		int target = cltr->procs.total;
+
+		with( cltr->ready_queue ) {
+			// Remember old count
+			size_t ocount = lanes.count;
+
+			// Find new count
+			// Make sure we always have atleast 1 list
+			lanes.count = target >= 2 ? target * READYQ_SHARD_FACTOR: SEQUENTIAL_SHARD;
+			/* paranoid */ verify( ocount >= lanes.count );
+			/* paranoid */ verify( lanes.count == target * READYQ_SHARD_FACTOR || target < 2 );
+
+			// for printing count the number of displaced threads
+			#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
+				__attribute__((unused)) size_t displaced = 0;
+			#endif
+
+			// redistribute old data
+			for( idx; (size_t)lanes.count ~ ocount) {
+				// Lock is not strictly needed but makes checking invariants much easier
+				__attribute__((unused)) bool locked = __atomic_try_acquire(&lanes.data[idx].lock);
+				verify(locked);
+
+				// As long as we can pop from this lane to push the threads somewhere else in the queue
+				while(!is_empty(lanes.data[idx])) {
+					struct $thread * thrd;
+					unsigned long long _;
+					[thrd, _] = pop(lanes.data[idx]);
+
+					push(cltr, thrd, true);
+
+					// for printing count the number of displaced threads
+					#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
+						displaced++;
+					#endif
+				}
+
+				// Unlock the lane
+				__atomic_unlock(&lanes.data[idx].lock);
+
+				// TODO print the queue statistics here
+
+				^(lanes.data[idx]){};
+			}
+
+			__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue displaced %zu threads\n", displaced);
+
+			// Allocate new array (uses realloc and memcpies the data)
+			lanes.data = alloc( lanes.count, lanes.data`realloc );
+
+			// Fix the moved data
+			for( idx; (size_t)lanes.count ) {
+				fix(lanes.data[idx]);
+			}
+		}
+
+		fix_times(cltr);
+
+		reassign_cltr_id(cltr);
+
+		// Make sure that everything is consistent
+		/* paranoid */ check( cltr->ready_queue );
+
+		__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue done\n");
+		/* paranoid */ verify( ready_mutate_islocked() );
+	}
+#endif
 
 #if !defined(__CFA_NO_STATISTICS__)
