Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision d529ad03a0b0731081f300848ff6c33dc074d2fc)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision 3c4bf050c37d1391ba25a62e7b3d185e99ef7f44)
@@ -20,7 +20,4 @@
 
 
-// #define USE_RELAXED_FIFO
-// #define USE_WORK_STEALING
-// #define USE_CPU_WORK_STEALING
 #define USE_AWARE_STEALING
 
@@ -56,24 +53,10 @@
 #endif
 
-#if   defined(USE_AWARE_STEALING)
-	#define READYQ_SHARD_FACTOR 2
-	#define SEQUENTIAL_SHARD 2
-#elif defined(USE_CPU_WORK_STEALING)
-	#define READYQ_SHARD_FACTOR 2
-#elif defined(USE_RELAXED_FIFO)
-	#define BIAS 4
-	#define READYQ_SHARD_FACTOR 4
-	#define SEQUENTIAL_SHARD 1
-#elif defined(USE_WORK_STEALING)
-	#define READYQ_SHARD_FACTOR 2
-	#define SEQUENTIAL_SHARD 2
-#else
-	#error no scheduling strategy selected
-#endif
+#define READYQ_SHARD_FACTOR 2
+#define SEQUENTIAL_SHARD 2
 
 static inline struct thread$ * try_pop(struct cluster * cltr, unsigned w __STATS(, __stats_readyQ_pop_t & stats));
 static inline struct thread$ * try_pop(struct cluster * cltr, unsigned i, unsigned j __STATS(, __stats_readyQ_pop_t & stats));
 static inline struct thread$ * search(struct cluster * cltr);
-static inline [unsigned, bool] idx_from_r(unsigned r, unsigned preferred);
 
 
@@ -248,32 +231,4 @@
 
 //=======================================================================
-// caches handling
-
-struct __attribute__((aligned(128))) __ready_queue_caches_t {
-	// Count States:
-	// - 0  : No one is looking after this cache
-	// - 1  : No one is looking after this cache, BUT it's not empty
-	// - 2+ : At least one processor is looking after this cache
-	volatile unsigned count;
-};
-
-void  ?{}(__ready_queue_caches_t & this) { this.count = 0; }
-void ^?{}(__ready_queue_caches_t & this) {}
-
-static inline void depart(__ready_queue_caches_t & cache) {
-	/* paranoid */ verify( cache.count > 1);
-	__atomic_fetch_add(&cache.count, -1, __ATOMIC_SEQ_CST);
-	/* paranoid */ verify( cache.count != 0);
-	/* paranoid */ verify( cache.count < 65536 ); // This verify assumes no cluster will have more than 65000 kernel threads mapped to a single cache, which could be correct but is super weird.
-}
-
-static inline void arrive(__ready_queue_caches_t & cache) {
-	// for() {
-	// 	unsigned expected = cache.count;
-	// 	unsigned desired  = 0 == expected ? 2 : expected + 1;
-	// }
-}
-
-//=======================================================================
 // Cforall Ready Queue used for scheduling
 //=======================================================================
@@ -292,34 +247,12 @@
 
 void ?{}(__ready_queue_t & this) with (this) {
-	#if defined(USE_CPU_WORK_STEALING)
-		lanes.count = cpu_info.hthrd_count * READYQ_SHARD_FACTOR;
-		lanes.data = alloc( lanes.count );
-		lanes.tscs = alloc( lanes.count );
-		lanes.help = alloc( cpu_info.hthrd_count );
-
-		for( idx; (size_t)lanes.count ) {
-			(lanes.data[idx]){};
-			lanes.tscs[idx].tv = rdtscl();
-			lanes.tscs[idx].ma = rdtscl();
-		}
-		for( idx; (size_t)cpu_info.hthrd_count ) {
-			lanes.help[idx].src = 0;
-			lanes.help[idx].dst = 0;
-			lanes.help[idx].tri = 0;
-		}
-	#else
-		lanes.data   = 0p;
-		lanes.tscs   = 0p;
-		lanes.caches = 0p;
-		lanes.help   = 0p;
-		lanes.count  = 0;
-	#endif
+	lanes.data   = 0p;
+	lanes.tscs   = 0p;
+	lanes.caches = 0p;
+	lanes.help   = 0p;
+	lanes.count  = 0;
 }
 
 void ^?{}(__ready_queue_t & this) with (this) {
-	#if !defined(USE_CPU_WORK_STEALING)
-		verify( SEQUENTIAL_SHARD == lanes.count );
-	#endif
-
 	free(lanes.data);
 	free(lanes.tscs);
@@ -329,486 +262,126 @@
 
 //-----------------------------------------------------------------------
-#if defined(USE_AWARE_STEALING)
-	__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
-		processor * const proc = kernelTLS().this_processor;
-		const bool external = (!proc) || (cltr != proc->cltr);
-		const bool remote   = hint == UNPARK_REMOTE;
-
-		unsigned i;
-		if( external || remote ) {
-			// Figure out where thread was last time and make sure it's valid
-			/* paranoid */ verify(thrd->preferred >= 0);
-			if(thrd->preferred * READYQ_SHARD_FACTOR < lanes.count) {
-				/* paranoid */ verify(thrd->preferred * READYQ_SHARD_FACTOR < lanes.count);
-				unsigned start = thrd->preferred * READYQ_SHARD_FACTOR;
-				do {
-					unsigned r = __tls_rand();
-					i = start + (r % READYQ_SHARD_FACTOR);
-					/* paranoid */ verify( i < lanes.count );
-					// If we can't lock it retry
-				} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
-			} else {
-				do {
-					i = __tls_rand() % lanes.count;
-				} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
-			}
-		} else {
+__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
+	processor * const proc = kernelTLS().this_processor;
+	const bool external = (!proc) || (cltr != proc->cltr);
+	const bool remote   = hint == UNPARK_REMOTE;
+
+	unsigned i;
+	if( external || remote ) {
+		// Figure out where thread was last time and make sure it's valid
+		/* paranoid */ verify(thrd->preferred >= 0);
+		if(thrd->preferred * READYQ_SHARD_FACTOR < lanes.count) {
+			/* paranoid */ verify(thrd->preferred * READYQ_SHARD_FACTOR < lanes.count);
+			unsigned start = thrd->preferred * READYQ_SHARD_FACTOR;
 			do {
-				unsigned r = proc->rdq.its++;
-				i = proc->rdq.id + (r % READYQ_SHARD_FACTOR);
+				unsigned r = __tls_rand();
+				i = start + (r % READYQ_SHARD_FACTOR);
 				/* paranoid */ verify( i < lanes.count );
 				// If we can't lock it retry
 			} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
-		}
-
-		// Actually push it
-		push(lanes.data[i], thrd);
-
-		// Unlock and return
-		__atomic_unlock( &lanes.data[i].lock );
-
-		#if !defined(__CFA_NO_STATISTICS__)
-			if(unlikely(external || remote)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED);
-			else __tls_stats()->ready.push.local.success++;
-		#endif
-	}
-
-	static inline unsigned long long calc_cutoff(const unsigned long long ctsc, const processor * proc, __ready_queue_t & rdq) {
-		unsigned start = proc->rdq.id;
-		unsigned long long max = 0;
-		for(i; READYQ_SHARD_FACTOR) {
-			unsigned long long ptsc = ts(rdq.lanes.data[start + i]);
-			if(ptsc != -1ull) {
-				/* paranoid */ verify( start + i < rdq.lanes.count );
-				unsigned long long tsc = moving_average(ctsc, ptsc, rdq.lanes.tscs[start + i].ma);
-				if(tsc > max) max = tsc;
-			}
-		}
-		return (max + 2 * max) / 2;
-	}
-
-	__attribute__((hot)) struct thread$ * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
-		/* paranoid */ verify( lanes.count > 0 );
-		/* paranoid */ verify( kernelTLS().this_processor );
-		/* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes.count );
-
-		processor * const proc = kernelTLS().this_processor;
-		unsigned this = proc->rdq.id;
-		/* paranoid */ verify( this < lanes.count );
-		__cfadbg_print_safe(ready_queue, "Kernel : pop from %u\n", this);
-
-		// Figure out the current cpu and make sure it is valid
-		const int cpu = __kernel_getcpu();
-		/* paranoid */ verify(cpu >= 0);
-		/* paranoid */ verify(cpu < cpu_info.hthrd_count);
-		unsigned this_cache = cpu_info.llc_map[cpu].cache;
-
-		// Super important: don't write the same value over and over again
-		// We want to maximise our chances that his particular values stays in cache
-		if(lanes.caches[this / READYQ_SHARD_FACTOR].id != this_cache)
-			__atomic_store_n(&lanes.caches[this / READYQ_SHARD_FACTOR].id, this_cache, __ATOMIC_RELAXED);
-
-		const unsigned long long ctsc = rdtscl();
-
-		if(proc->rdq.target == MAX) {
-			uint64_t chaos = __tls_rand();
-			unsigned ext = chaos & 0xff;
-			unsigned other  = (chaos >> 8) % (lanes.count);
-
-			if(ext < 3 || __atomic_load_n(&lanes.caches[other / READYQ_SHARD_FACTOR].id, __ATOMIC_RELAXED) == this_cache) {
-				proc->rdq.target = other;
-			}
-		}
-		else {
-			const unsigned target = proc->rdq.target;
-			__cfadbg_print_safe(ready_queue, "Kernel : %u considering helping %u, tcsc %llu\n", this, target, lanes.tscs[target].tv);
-			/* paranoid */ verify( lanes.tscs[target].tv != MAX );
-			if(target < lanes.count) {
-				const unsigned long long cutoff = calc_cutoff(ctsc, proc, cltr->ready_queue);
-				const unsigned long long age = moving_average(ctsc, lanes.tscs[target].tv, lanes.tscs[target].ma);
-				__cfadbg_print_safe(ready_queue, "Kernel : Help attempt on %u from %u, age %'llu vs cutoff %'llu, %s\n", target, this, age, cutoff, age > cutoff ? "yes" : "no");
-				if(age > cutoff) {
-					thread$ * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help));
-					if(t) return t;
-				}
-			}
-			proc->rdq.target = MAX;
-		}
-
-		for(READYQ_SHARD_FACTOR) {
-			unsigned i = this + (proc->rdq.itr++ % READYQ_SHARD_FACTOR);
-			if(thread$ * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t;
-		}
-
-		// All lanes where empty return 0p
-		return 0p;
-
-	}
-	__attribute__((hot)) struct thread$ * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
-		unsigned i = __tls_rand() % lanes.count;
-		return try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.steal));
-	}
-	__attribute__((hot)) struct thread$ * pop_search(struct cluster * cltr) {
-		return search(cltr);
-	}
-#endif
-#if defined(USE_CPU_WORK_STEALING)
-	__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
-		__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
-
-		processor * const proc = kernelTLS().this_processor;
-		const bool external = (!proc) || (cltr != proc->cltr);
-
-		// Figure out the current cpu and make sure it is valid
-		const int cpu = __kernel_getcpu();
-		/* paranoid */ verify(cpu >= 0);
-		/* paranoid */ verify(cpu < cpu_info.hthrd_count);
-		/* paranoid */ verify(cpu * READYQ_SHARD_FACTOR < lanes.count);
-
-		// Figure out where thread was last time and make sure it's
-		/* paranoid */ verify(thrd->preferred >= 0);
-		/* paranoid */ verify(thrd->preferred < cpu_info.hthrd_count);
-		/* paranoid */ verify(thrd->preferred * READYQ_SHARD_FACTOR < lanes.count);
-		const int prf = thrd->preferred * READYQ_SHARD_FACTOR;
-
-		const cpu_map_entry_t & map;
-		choose(hint) {
-			case UNPARK_LOCAL : &map = &cpu_info.llc_map[cpu];
-			case UNPARK_REMOTE: &map = &cpu_info.llc_map[prf];
-		}
-		/* paranoid */ verify(map.start * READYQ_SHARD_FACTOR < lanes.count);
-		/* paranoid */ verify(map.self * READYQ_SHARD_FACTOR < lanes.count);
-		/* paranoid */ verifyf((map.start + map.count) * READYQ_SHARD_FACTOR <= lanes.count, "have %zu lanes but map can go up to %u", lanes.count, (map.start + map.count) * READYQ_SHARD_FACTOR);
-
-		const int start = map.self * READYQ_SHARD_FACTOR;
-		unsigned i;
+		} else {
+			do {
+				i = __tls_rand() % lanes.count;
+			} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+		}
+	} else {
 		do {
-			unsigned r;
-			if(unlikely(external)) { r = __tls_rand(); }
-			else { r = proc->rdq.its++; }
-			choose(hint) {
-				case UNPARK_LOCAL : i = start + (r % READYQ_SHARD_FACTOR);
-				case UNPARK_REMOTE: i = prf   + (r % READYQ_SHARD_FACTOR);
-			}
+			unsigned r = proc->rdq.its++;
+			i = proc->rdq.id + (r % READYQ_SHARD_FACTOR);
+			/* paranoid */ verify( i < lanes.count );
 			// If we can't lock it retry
 		} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
-
-		// Actually push it
-		push(lanes.data[i], thrd);
-
-		// Unlock and return
-		__atomic_unlock( &lanes.data[i].lock );
-
-		#if !defined(__CFA_NO_STATISTICS__)
-			if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED);
-			else __tls_stats()->ready.push.local.success++;
-		#endif
-
-		__cfadbg_print_safe(ready_queue, "Kernel : Pushed %p on cluster %p (idx: %u, mask %llu, first %d)\n", thrd, cltr, i, used.mask[0], lane_first);
-
-	}
-
-	// Pop from the ready queue from a given cluster
-	__attribute__((hot)) thread$ * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
-		/* paranoid */ verify( lanes.count > 0 );
-		/* paranoid */ verify( kernelTLS().this_processor );
-
-		processor * const proc = kernelTLS().this_processor;
-		const int cpu = __kernel_getcpu();
-		/* paranoid */ verify(cpu >= 0);
-		/* paranoid */ verify(cpu < cpu_info.hthrd_count);
-		/* paranoid */ verify(cpu * READYQ_SHARD_FACTOR < lanes.count);
-
-		const cpu_map_entry_t & map = cpu_info.llc_map[cpu];
-		/* paranoid */ verify(map.start * READYQ_SHARD_FACTOR < lanes.count);
-		/* paranoid */ verify(map.self * READYQ_SHARD_FACTOR < lanes.count);
-		/* paranoid */ verifyf((map.start + map.count) * READYQ_SHARD_FACTOR <= lanes.count, "have %zu lanes but map can go up to %u", lanes.count, (map.start + map.count) * READYQ_SHARD_FACTOR);
-
-		const int start = map.self * READYQ_SHARD_FACTOR;
-		const unsigned long long ctsc = rdtscl();
-
-		// Did we already have a help target
-		if(proc->rdq.target == MAX) {
-			unsigned long long max = 0;
-			for(i; READYQ_SHARD_FACTOR) {
-				unsigned long long tsc = moving_average(ctsc, ts(lanes.data[start + i]), lanes.tscs[start + i].ma);
-				if(tsc > max) max = tsc;
-			}
-			//  proc->rdq.cutoff = (max + 2 * max) / 2;
-			/* paranoid */ verify(lanes.count < 65536); // The following code assumes max 65536 cores.
-			/* paranoid */ verify(map.count < 65536); // The following code assumes max 65536 cores.
-
-			if(0 == (__tls_rand() % 100)) {
-				proc->rdq.target = __tls_rand() % lanes.count;
-			} else {
-				unsigned cpu_chaos = map.start + (__tls_rand() % map.count);
-				proc->rdq.target = (cpu_chaos * READYQ_SHARD_FACTOR) + (__tls_rand() % READYQ_SHARD_FACTOR);
-				/* paranoid */ verify(proc->rdq.target >= (map.start * READYQ_SHARD_FACTOR));
-				/* paranoid */ verify(proc->rdq.target <  ((map.start + map.count) * READYQ_SHARD_FACTOR));
-			}
-
-			/* paranoid */ verify(proc->rdq.target != MAX);
-		}
-		else {
-			unsigned long long max = 0;
-			for(i; READYQ_SHARD_FACTOR) {
-				unsigned long long tsc = moving_average(ctsc, ts(lanes.data[start + i]), lanes.tscs[start + i].ma);
-				if(tsc > max) max = tsc;
-			}
-			const unsigned long long cutoff = (max + 2 * max) / 2;
-			{
-				unsigned target = proc->rdq.target;
-				proc->rdq.target = MAX;
-				lanes.help[target / READYQ_SHARD_FACTOR].tri++;
-				if(moving_average(ctsc, lanes.tscs[target].tv, lanes.tscs[target].ma) > cutoff) {
-					thread$ * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help));
-					proc->rdq.last = target;
-					if(t) return t;
-				}
-				proc->rdq.target = MAX;
-			}
-
-			unsigned last = proc->rdq.last;
-			if(last != MAX && moving_average(ctsc, lanes.tscs[last].tv, lanes.tscs[last].ma) > cutoff) {
-				thread$ * t = try_pop(cltr, last __STATS(, __tls_stats()->ready.pop.help));
-				if(t) return t;
-			}
-			else {
-				proc->rdq.last = MAX;
-			}
-		}
-
-		for(READYQ_SHARD_FACTOR) {
-			unsigned i = start + (proc->rdq.itr++ % READYQ_SHARD_FACTOR);
-			if(thread$ * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t;
-		}
-
-		// All lanes where empty return 0p
-		return 0p;
-	}
-
-	__attribute__((hot)) struct thread$ * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
-		processor * const proc = kernelTLS().this_processor;
-		unsigned last = proc->rdq.last;
-		if(last != MAX) {
-			struct thread$ * t = try_pop(cltr, last __STATS(, __tls_stats()->ready.pop.steal));
-			if(t) return t;
-			proc->rdq.last = MAX;
-		}
-
-		unsigned i = __tls_rand() % lanes.count;
-		return try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.steal));
-	}
-	__attribute__((hot)) struct thread$ * pop_search(struct cluster * cltr) {
-		return search(cltr);
-	}
-#endif
-#if defined(USE_RELAXED_FIFO)
-	//-----------------------------------------------------------------------
-	// get index from random number with or without bias towards queues
-	static inline [unsigned, bool] idx_from_r(unsigned r, unsigned preferred) {
-		unsigned i;
-		bool local;
-		unsigned rlow  = r % BIAS;
-		unsigned rhigh = r / BIAS;
-		if((0 != rlow) && preferred >= 0) {
-			// (BIAS - 1) out of BIAS chances
-			// Use perferred queues
-			i = preferred + (rhigh % READYQ_SHARD_FACTOR);
-			local = true;
-		}
-		else {
-			// 1 out of BIAS chances
-			// Use all queues
-			i = rhigh;
-			local = false;
-		}
-		return [i, local];
-	}
-
-	__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
-		__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
-
-		const bool external = (hint != UNPARK_LOCAL) || (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
-		/* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );
-
-		bool local;
-		int preferred = external ? -1 : kernelTLS().this_processor->rdq.id;
-
-		// Try to pick a lane and lock it
-		unsigned i;
-		do {
-			// Pick the index of a lane
-			unsigned r = __tls_rand_fwd();
-			[i, local] = idx_from_r(r, preferred);
-
-			i %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
-
-			#if !defined(__CFA_NO_STATISTICS__)
-				if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.attempt, 1, __ATOMIC_RELAXED);
-				else if(local) __tls_stats()->ready.push.local.attempt++;
-				else __tls_stats()->ready.push.share.attempt++;
-			#endif
-
-			// If we can't lock it retry
-		} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
-
-		// Actually push it
-		push(lanes.data[i], thrd);
-
-		// Unlock and return
-		__atomic_unlock( &lanes.data[i].lock );
-
-		// Mark the current index in the tls rng instance as having an item
-		__tls_rand_advance_bck();
-
-		__cfadbg_print_safe(ready_queue, "Kernel : Pushed %p on cluster %p (idx: %u, mask %llu, first %d)\n", thrd, cltr, i, used.mask[0], lane_first);
-
-		// Update statistics
-		#if !defined(__CFA_NO_STATISTICS__)
-			if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED);
-			else if(local) __tls_stats()->ready.push.local.success++;
-			else __tls_stats()->ready.push.share.success++;
-		#endif
-	}
-
-	// Pop from the ready queue from a given cluster
-	__attribute__((hot)) thread$ * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
-		/* paranoid */ verify( lanes.count > 0 );
-		/* paranoid */ verify( kernelTLS().this_processor );
-		/* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes.count );
-
-		unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
-		int preferred = kernelTLS().this_processor->rdq.id;
-
-
-		// As long as the list is not empty, try finding a lane that isn't empty and pop from it
-		for(25) {
-			// Pick two lists at random
-			unsigned ri = __tls_rand_bck();
-			unsigned rj = __tls_rand_bck();
-
-			unsigned i, j;
-			__attribute__((unused)) bool locali, localj;
-			[i, locali] = idx_from_r(ri, preferred);
-			[j, localj] = idx_from_r(rj, preferred);
-
-			i %= count;
-			j %= count;
-
-			// try popping from the 2 picked lists
-			struct thread$ * thrd = try_pop(cltr, i, j __STATS(, *(locali || localj ? &__tls_stats()->ready.pop.local : &__tls_stats()->ready.pop.help)));
-			if(thrd) {
-				return thrd;
-			}
-		}
-
-		// All lanes where empty return 0p
-		return 0p;
-	}
-
-	__attribute__((hot)) struct thread$ * pop_slow(struct cluster * cltr) { return pop_fast(cltr); }
-	__attribute__((hot)) struct thread$ * pop_search(struct cluster * cltr) {
-		return search(cltr);
-	}
-#endif
-#if defined(USE_WORK_STEALING)
-	__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
-		__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
-
-		// #define USE_PREFERRED
-		#if !defined(USE_PREFERRED)
-		const bool external = (hint != UNPARK_LOCAL) || (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
-		/* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );
-		#else
-			unsigned preferred = thrd->preferred;
-			const bool external = (hint != UNPARK_LOCAL) || (!kernelTLS().this_processor) || preferred == MAX || thrd->curr_cluster != cltr;
-			/* paranoid */ verifyf(external || preferred < lanes.count, "Invalid preferred queue %u for %u lanes", preferred, lanes.count );
-
-			unsigned r = preferred % READYQ_SHARD_FACTOR;
-			const unsigned start = preferred - r;
-		#endif
-
-		// Try to pick a lane and lock it
-		unsigned i;
-		do {
-			#if !defined(__CFA_NO_STATISTICS__)
-				if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.attempt, 1, __ATOMIC_RELAXED);
-				else __tls_stats()->ready.push.local.attempt++;
-			#endif
-
-			if(unlikely(external)) {
-				i = __tls_rand() % lanes.count;
-			}
-			else {
-				#if !defined(USE_PREFERRED)
-					processor * proc = kernelTLS().this_processor;
-					unsigned r = proc->rdq.its++;
-					i =  proc->rdq.id + (r % READYQ_SHARD_FACTOR);
-				#else
-					i = start + (r++ % READYQ_SHARD_FACTOR);
-				#endif
-			}
-			// If we can't lock it retry
-		} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
-
-		// Actually push it
-		push(lanes.data[i], thrd);
-
-		// Unlock and return
-		__atomic_unlock( &lanes.data[i].lock );
-
-		#if !defined(__CFA_NO_STATISTICS__)
-			if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED);
-			else __tls_stats()->ready.push.local.success++;
-		#endif
-
-		__cfadbg_print_safe(ready_queue, "Kernel : Pushed %p on cluster %p (idx: %u, mask %llu, first %d)\n", thrd, cltr, i, used.mask[0], lane_first);
-	}
-
-	// Pop from the ready queue from a given cluster
-	__attribute__((hot)) thread$ * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
-		/* paranoid */ verify( lanes.count > 0 );
-		/* paranoid */ verify( kernelTLS().this_processor );
-		/* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes.count );
-
-		processor * proc = kernelTLS().this_processor;
-
-		if(proc->rdq.target == MAX) {
-			unsigned long long min = ts(lanes.data[proc->rdq.id]);
-			for(int i = 0; i < READYQ_SHARD_FACTOR; i++) {
-				unsigned long long tsc = ts(lanes.data[proc->rdq.id + i]);
-				if(tsc < min) min = tsc;
-			}
-			proc->rdq.cutoff = min;
-			proc->rdq.target = __tls_rand() % lanes.count;
-		}
-		else {
-			unsigned target = proc->rdq.target;
-			proc->rdq.target = MAX;
-			const unsigned long long bias = 0; //2_500_000_000;
-			const unsigned long long cutoff = proc->rdq.cutoff > bias ? proc->rdq.cutoff - bias : proc->rdq.cutoff;
-			if(lanes.tscs[target].tv < cutoff && ts(lanes.data[target]) < cutoff) {
+	}
+
+	// Actually push it
+	push(lanes.data[i], thrd);
+
+	// Unlock and return
+	__atomic_unlock( &lanes.data[i].lock );
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		if(unlikely(external || remote)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED);
+		else __tls_stats()->ready.push.local.success++;
+	#endif
+}
+
+static inline unsigned long long calc_cutoff(const unsigned long long ctsc, const processor * proc, __ready_queue_t & rdq) {
+	unsigned start = proc->rdq.id;
+	unsigned long long max = 0;
+	for(i; READYQ_SHARD_FACTOR) {
+		unsigned long long ptsc = ts(rdq.lanes.data[start + i]);
+		if(ptsc != -1ull) {
+			/* paranoid */ verify( start + i < rdq.lanes.count );
+			unsigned long long tsc = moving_average(ctsc, ptsc, rdq.lanes.tscs[start + i].ma);
+			if(tsc > max) max = tsc;
+		}
+	}
+	return (max + 2 * max) / 2;
+}
+
+__attribute__((hot)) struct thread$ * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
+	/* paranoid */ verify( lanes.count > 0 );
+	/* paranoid */ verify( kernelTLS().this_processor );
+	/* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes.count );
+
+	processor * const proc = kernelTLS().this_processor;
+	unsigned this = proc->rdq.id;
+	/* paranoid */ verify( this < lanes.count );
+	__cfadbg_print_safe(ready_queue, "Kernel : pop from %u\n", this);
+
+	// Figure out the current cpu and make sure it is valid
+	const int cpu = __kernel_getcpu();
+	/* paranoid */ verify(cpu >= 0);
+	/* paranoid */ verify(cpu < cpu_info.hthrd_count);
+	unsigned this_cache = cpu_info.llc_map[cpu].cache;
+
+	// Super important: don't write the same value over and over again
+	// We want to maximise our chances that this particular value stays in cache
+	if(lanes.caches[this / READYQ_SHARD_FACTOR].id != this_cache)
+		__atomic_store_n(&lanes.caches[this / READYQ_SHARD_FACTOR].id, this_cache, __ATOMIC_RELAXED);
+
+	const unsigned long long ctsc = rdtscl();
+
+	if(proc->rdq.target == MAX) {
+		uint64_t chaos = __tls_rand();
+		unsigned ext = chaos & 0xff;
+		unsigned other  = (chaos >> 8) % (lanes.count);
+
+		if(ext < 3 || __atomic_load_n(&lanes.caches[other / READYQ_SHARD_FACTOR].id, __ATOMIC_RELAXED) == this_cache) {
+			proc->rdq.target = other;
+		}
+	}
+	else {
+		const unsigned target = proc->rdq.target;
+		__cfadbg_print_safe(ready_queue, "Kernel : %u considering helping %u, tcsc %llu\n", this, target, lanes.tscs[target].tv);
+		/* paranoid */ verify( lanes.tscs[target].tv != MAX );
+		if(target < lanes.count) {
+			const unsigned long long cutoff = calc_cutoff(ctsc, proc, cltr->ready_queue);
+			const unsigned long long age = moving_average(ctsc, lanes.tscs[target].tv, lanes.tscs[target].ma);
+			__cfadbg_print_safe(ready_queue, "Kernel : Help attempt on %u from %u, age %'llu vs cutoff %'llu, %s\n", target, this, age, cutoff, age > cutoff ? "yes" : "no");
+			if(age > cutoff) {
 				thread$ * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help));
 				if(t) return t;
 			}
 		}
-
-		for(READYQ_SHARD_FACTOR) {
-			unsigned i = proc->rdq.id + (proc->rdq.itr++ % READYQ_SHARD_FACTOR);
-			if(thread$ * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t;
-		}
-		return 0p;
-	}
-
-	__attribute__((hot)) struct thread$ * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
-		unsigned i = __tls_rand() % lanes.count;
-		return try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.steal));
-	}
-
-	__attribute__((hot)) struct thread$ * pop_search(struct cluster * cltr) with (cltr->ready_queue) {
-		return search(cltr);
-	}
-#endif
+		proc->rdq.target = MAX;
+	}
+
+	for(READYQ_SHARD_FACTOR) {
+		unsigned i = this + (proc->rdq.itr++ % READYQ_SHARD_FACTOR);
+		if(thread$ * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t;
+	}
+
+	// All lanes were empty, return 0p
+	return 0p;
+
+}
+__attribute__((hot)) struct thread$ * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
+	unsigned i = __tls_rand() % lanes.count;
+	return try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.steal));
+}
+__attribute__((hot)) struct thread$ * pop_search(struct cluster * cltr) {
+	return search(cltr);
+}
 
 //=======================================================================
@@ -845,7 +418,5 @@
 	// Actually pop the list
 	struct thread$ * thrd;
-	#if defined(USE_AWARE_STEALING) || defined(USE_WORK_STEALING) || defined(USE_CPU_WORK_STEALING)
-		unsigned long long tsc_before = ts(lane);
-	#endif
+	unsigned long long tsc_before = ts(lane);
 	unsigned long long tsv;
 	[thrd, tsv] = pop(lane);
@@ -861,18 +432,12 @@
 	__STATS( stats.success++; )
 
-	#if defined(USE_AWARE_STEALING) || defined(USE_WORK_STEALING) || defined(USE_CPU_WORK_STEALING)
-		if (tsv != MAX) {
-			unsigned long long now = rdtscl();
-			unsigned long long pma = __atomic_load_n(&lanes.tscs[w].ma, __ATOMIC_RELAXED);
-			__atomic_store_n(&lanes.tscs[w].tv, tsv, __ATOMIC_RELAXED);
-			__atomic_store_n(&lanes.tscs[w].ma, moving_average(now, tsc_before, pma), __ATOMIC_RELAXED);
-		}
-	#endif
-
-	#if defined(USE_AWARE_STEALING) || defined(USE_CPU_WORK_STEALING)
-		thrd->preferred = w / READYQ_SHARD_FACTOR;
-	#else
-		thrd->preferred = w;
-	#endif
+	if (tsv != MAX) {
+		unsigned long long now = rdtscl();
+		unsigned long long pma = __atomic_load_n(&lanes.tscs[w].ma, __ATOMIC_RELAXED);
+		__atomic_store_n(&lanes.tscs[w].tv, tsv, __ATOMIC_RELAXED);
+		__atomic_store_n(&lanes.tscs[w].ma, moving_average(now, tsc_before, pma), __ATOMIC_RELAXED);
+	}
+
+	thrd->preferred = w / READYQ_SHARD_FACTOR;
 
 	// return the popped thread
@@ -902,18 +467,8 @@
 // get preferred ready for new thread
 unsigned ready_queue_new_preferred() {
-	unsigned pref = 0;
+	unsigned pref = MAX;
 	if(struct thread$ * thrd = publicTLS_get( this_thread )) {
 		pref = thrd->preferred;
 	}
-	else {
-		#if defined(USE_CPU_WORK_STEALING)
-			pref = __kernel_getcpu();
-		#endif
-	}
-
-	#if defined(USE_CPU_WORK_STEALING)
-		/* paranoid */ verify(pref >= 0);
-		/* paranoid */ verify(pref < cpu_info.hthrd_count);
-	#endif
 
 	return pref;
@@ -982,148 +537,140 @@
 
 static void fix_times( struct cluster * cltr ) with( cltr->ready_queue ) {
-	#if defined(USE_AWARE_STEALING) || defined(USE_WORK_STEALING)
-		lanes.tscs = alloc(lanes.count, lanes.tscs`realloc);
-		for(i; lanes.count) {
-			lanes.tscs[i].tv = rdtscl();
-			lanes.tscs[i].ma = 0;
-		}
-	#endif
-}
-
-#if defined(USE_CPU_WORK_STEALING)
-	// ready_queue size is fixed in this case
-	void ready_queue_grow(struct cluster * cltr) {}
-	void ready_queue_shrink(struct cluster * cltr) {}
-#else
-	// Grow the ready queue
-	void ready_queue_grow(struct cluster * cltr) {
-		size_t ncount;
-		int target = cltr->procs.total;
-
-		/* paranoid */ verify( ready_mutate_islocked() );
-		__cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue\n");
-
-		// Make sure that everything is consistent
-		/* paranoid */ check( cltr->ready_queue );
-
-		// grow the ready queue
-		with( cltr->ready_queue ) {
-			// Find new count
-			// Make sure we always have atleast 1 list
-			if(target >= 2) {
-				ncount = target * READYQ_SHARD_FACTOR;
-			} else {
-				ncount = SEQUENTIAL_SHARD;
+	lanes.tscs = alloc(lanes.count, lanes.tscs`realloc);
+	for(i; lanes.count) {
+		lanes.tscs[i].tv = rdtscl();
+		lanes.tscs[i].ma = 0;
+	}
+}
+
+// Grow the ready queue
+void ready_queue_grow(struct cluster * cltr) {
+	size_t ncount;
+	int target = cltr->procs.total;
+
+	/* paranoid */ verify( ready_mutate_islocked() );
+	__cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue\n");
+
+	// Make sure that everything is consistent
+	/* paranoid */ check( cltr->ready_queue );
+
+	// grow the ready queue
+	with( cltr->ready_queue ) {
+		// Find new count
+		// Make sure we always have at least 1 list
+		if(target >= 2) {
+			ncount = target * READYQ_SHARD_FACTOR;
+		} else {
+			ncount = SEQUENTIAL_SHARD;
+		}
+
+		// Allocate new array (uses realloc and memcpies the data)
+		lanes.data = alloc( ncount, lanes.data`realloc );
+
+		// Fix the moved data
+		for( idx; (size_t)lanes.count ) {
+			fix(lanes.data[idx]);
+		}
+
+		// Construct new data
+		for( idx; (size_t)lanes.count ~ ncount) {
+			(lanes.data[idx]){};
+		}
+
+		// Update original
+		lanes.count = ncount;
+
+		lanes.caches = alloc( target, lanes.caches`realloc );
+	}
+
+	fix_times(cltr);
+
+	reassign_cltr_id(cltr);
+
+	// Make sure that everything is consistent
+	/* paranoid */ check( cltr->ready_queue );
+
+	__cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue done\n");
+
+	/* paranoid */ verify( ready_mutate_islocked() );
+}
+
+// Shrink the ready queue
+void ready_queue_shrink(struct cluster * cltr) {
+	/* paranoid */ verify( ready_mutate_islocked() );
+	__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue\n");
+
+	// Make sure that everything is consistent
+	/* paranoid */ check( cltr->ready_queue );
+
+	int target = cltr->procs.total;
+
+	with( cltr->ready_queue ) {
+		// Remember old count
+		size_t ocount = lanes.count;
+
+		// Find new count
+		// Make sure we always have at least 1 list
+		lanes.count = target >= 2 ? target * READYQ_SHARD_FACTOR: SEQUENTIAL_SHARD;
+		/* paranoid */ verify( ocount >= lanes.count );
+		/* paranoid */ verify( lanes.count == target * READYQ_SHARD_FACTOR || target < 2 );
+
+		// For debug printing, count the number of displaced threads
+		#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
+			__attribute__((unused)) size_t displaced = 0;
+		#endif
+
+		// redistribute old data
+		for( idx; (size_t)lanes.count ~ ocount) {
+			// Lock is not strictly needed but makes checking invariants much easier
+			__attribute__((unused)) bool locked = __atomic_try_acquire(&lanes.data[idx].lock);
+			verify(locked);
+
+			// Keep popping threads from this lane and pushing them somewhere else in the queue until the lane is empty
+			while(!is_empty(lanes.data[idx])) {
+				struct thread$ * thrd;
+				unsigned long long _;
+				[thrd, _] = pop(lanes.data[idx]);
+
+				push(cltr, thrd, true);
+
+				// For debug printing, count the number of displaced threads
+				#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
+					displaced++;
+				#endif
 			}
 
-			// Allocate new array (uses realloc and memcpies the data)
-			lanes.data = alloc( ncount, lanes.data`realloc );
-
-			// Fix the moved data
-			for( idx; (size_t)lanes.count ) {
-				fix(lanes.data[idx]);
-			}
-
-			// Construct new data
-			for( idx; (size_t)lanes.count ~ ncount) {
-				(lanes.data[idx]){};
-			}
-
-			// Update original
-			lanes.count = ncount;
-
-			lanes.caches = alloc( target, lanes.caches`realloc );
-		}
-
-		fix_times(cltr);
-
-		reassign_cltr_id(cltr);
-
-		// Make sure that everything is consistent
-		/* paranoid */ check( cltr->ready_queue );
-
-		__cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue done\n");
-
-		/* paranoid */ verify( ready_mutate_islocked() );
-	}
-
-	// Shrink the ready queue
-	void ready_queue_shrink(struct cluster * cltr) {
-		/* paranoid */ verify( ready_mutate_islocked() );
-		__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue\n");
-
-		// Make sure that everything is consistent
-		/* paranoid */ check( cltr->ready_queue );
-
-		int target = cltr->procs.total;
-
-		with( cltr->ready_queue ) {
-			// Remember old count
-			size_t ocount = lanes.count;
-
-			// Find new count
-			// Make sure we always have atleast 1 list
-			lanes.count = target >= 2 ? target * READYQ_SHARD_FACTOR: SEQUENTIAL_SHARD;
-			/* paranoid */ verify( ocount >= lanes.count );
-			/* paranoid */ verify( lanes.count == target * READYQ_SHARD_FACTOR || target < 2 );
-
-			// for printing count the number of displaced threads
-			#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
-				__attribute__((unused)) size_t displaced = 0;
-			#endif
-
-			// redistribute old data
-			for( idx; (size_t)lanes.count ~ ocount) {
-				// Lock is not strictly needed but makes checking invariants much easier
-				__attribute__((unused)) bool locked = __atomic_try_acquire(&lanes.data[idx].lock);
-				verify(locked);
-
-				// As long as we can pop from this lane to push the threads somewhere else in the queue
-				while(!is_empty(lanes.data[idx])) {
-					struct thread$ * thrd;
-					unsigned long long _;
-					[thrd, _] = pop(lanes.data[idx]);
-
-					push(cltr, thrd, true);
-
-					// for printing count the number of displaced threads
-					#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
-						displaced++;
-					#endif
-				}
-
-				// Unlock the lane
-				__atomic_unlock(&lanes.data[idx].lock);
-
-				// TODO print the queue statistics here
-
-				^(lanes.data[idx]){};
-			}
-
-			__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue displaced %zu threads\n", displaced);
-
-			// Allocate new array (uses realloc and memcpies the data)
-			lanes.data = alloc( lanes.count, lanes.data`realloc );
-
-			// Fix the moved data
-			for( idx; (size_t)lanes.count ) {
-				fix(lanes.data[idx]);
-			}
-
-			lanes.caches = alloc( target, lanes.caches`realloc );
-		}
-
-		fix_times(cltr);
-
-
-		reassign_cltr_id(cltr);
-
-		// Make sure that everything is consistent
-		/* paranoid */ check( cltr->ready_queue );
-
-		__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue done\n");
-		/* paranoid */ verify( ready_mutate_islocked() );
-	}
-#endif
+			// Unlock the lane
+			__atomic_unlock(&lanes.data[idx].lock);
+
+			// TODO print the queue statistics here
+
+			^(lanes.data[idx]){};
+		}
+
+		__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue displaced %zu threads\n", displaced);
+
+		// Allocate new array (uses realloc and memcpies the data)
+		lanes.data = alloc( lanes.count, lanes.data`realloc );
+
+		// Fix the moved data
+		for( idx; (size_t)lanes.count ) {
+			fix(lanes.data[idx]);
+		}
+
+		lanes.caches = alloc( target, lanes.caches`realloc );
+	}
+
+	fix_times(cltr);
+
+
+	reassign_cltr_id(cltr);
+
+	// Make sure that everything is consistent
+	/* paranoid */ check( cltr->ready_queue );
+
+	__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue done\n");
+	/* paranoid */ verify( ready_mutate_islocked() );
+}
 
 #if !defined(__CFA_NO_STATISTICS__)