Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision 5614552a09dc6362003f31a5630722adea74aeb4)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision 953827abfbb0f0ef12872840f84ea27ff7bc6a22)
@@ -20,6 +20,7 @@
 
 
-#define USE_RELAXED_FIFO
+// #define USE_RELAXED_FIFO
 // #define USE_WORK_STEALING
+#define USE_CPU_WORK_STEALING
 
 #include "bits/defs.hfa"
@@ -341,5 +342,25 @@
 			}
 			proc->rdq.cutoff = min;
-			proc->rdq.target = (map.start * READYQ_SHARD_FACTOR) + (__tls_rand() % (map.count* READYQ_SHARD_FACTOR));
+
+			/* paranoid */ verify(lanes.count < 65536); // The following code assumes max 65536 cores.
+			/* paranoid */ verify(map.count < 65536); // The following code assumes max 65536 cores.
+			uint64_t chaos = __tls_rand();
+			uint64_t high_chaos = (chaos >> 32);
+			uint64_t  mid_chaos = (chaos >> 16) & 0xffff;
+			uint64_t  low_chaos = chaos & 0xffff;
+
+			unsigned me = map.self;
+			unsigned cpu_chaos = map.start + (mid_chaos % map.count);
+			bool global = cpu_chaos == me;
+
+			if(global) {
+				proc->rdq.target = high_chaos % lanes.count;
+			} else {
+				proc->rdq.target = (cpu_chaos * READYQ_SHARD_FACTOR) + (low_chaos % READYQ_SHARD_FACTOR);
+				/* paranoid */ verify(proc->rdq.target >= (map.start * READYQ_SHARD_FACTOR));
+				/* paranoid */ verify(proc->rdq.target <  ((map.start + map.count) * READYQ_SHARD_FACTOR));
+			}
+
+			/* paranoid */ verify(proc->rdq.target != -1u);
 		}
 		else {
@@ -378,4 +399,9 @@
 		processor * const proc = kernelTLS().this_processor;
 		unsigned last = proc->rdq.last;
+		if(last != -1u) {
+			struct $thread * t = try_pop(cltr, last __STATS(, __tls_stats()->ready.pop.steal));
+			if(t) return t;
+			proc->rdq.last = -1u;
+		}
 
 		unsigned i = __tls_rand() % lanes.count;
