Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 0c51f9ad2f8684f00fd1f4b15c11d47001f992b2)
+++ libcfa/src/concurrency/kernel.hfa	(revision a77f25bdb5a25184f01c1a1af685629803d572dc)
@@ -67,6 +67,6 @@
 		unsigned target;
 		unsigned last;
-		unsigned cnt;
-		unsigned long long int cutoff;
+		signed   cpu;
+		// unsigned long long int cutoff;
 	} rdq;
 
@@ -152,4 +152,8 @@
 	volatile unsigned long long tv;
 	volatile unsigned long long ma;
+};
+
+struct __attribute__((aligned(128))) __cache_id_t {
+	volatile unsigned id;
 };
 
@@ -164,6 +168,10 @@
 static inline void ^?{}(__timestamp_t & this) {}
 
+struct __attribute__((aligned(128))) __ready_queue_caches_t;
+void  ?{}(__ready_queue_caches_t & this);
+void ^?{}(__ready_queue_caches_t & this);
+
 //TODO adjust cache size to ARCHITECTURE
-// Structure holding the relaxed ready queue
+// Structure holding the ready queue
 struct __ready_queue_t {
 	// Data tracking the actual lanes
@@ -177,4 +185,6 @@
 		// Array of times
 		__timestamp_t * volatile tscs;
+
+		__cache_id_t * volatile caches;
 
 		// Array of stats
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision 0c51f9ad2f8684f00fd1f4b15c11d47001f992b2)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision a77f25bdb5a25184f01c1a1af685629803d572dc)
@@ -34,4 +34,5 @@
 #include "kernel_private.hfa"
 #include "startup.hfa"          // STARTUP_PRIORITY_XXX
+#include "limits.hfa"
 #include "math.hfa"
 
@@ -514,8 +515,9 @@
 	this.rdq.its = 0;
 	this.rdq.itr = 0;
-	this.rdq.id  = -1u;
-	this.rdq.target = -1u;
-	this.rdq.last = -1u;
-	this.rdq.cutoff = 0ull;
+	this.rdq.id  = MAX;
+	this.rdq.target = MAX;
+	this.rdq.last = MAX;
+	this.rdq.cpu = 0;
+	// this.rdq.cutoff = 0ull;
 	do_terminate = false;
 	preemption_alarm = 0p;
@@ -685,4 +687,6 @@
 	uint_fast32_t last_size;
 	[this->unique_id, last_size] = ready_mutate_register();
+
+		this->rdq.cpu = __kernel_getcpu();
 
 		this->cltr->procs.total += 1u;
Index: libcfa/src/concurrency/locks.hfa
===================================================================
--- libcfa/src/concurrency/locks.hfa	(revision 0c51f9ad2f8684f00fd1f4b15c11d47001f992b2)
+++ libcfa/src/concurrency/locks.hfa	(revision a77f25bdb5a25184f01c1a1af685629803d572dc)
@@ -29,103 +29,4 @@
 #include "time_t.hfa"
 #include "time.hfa"
-
-//-----------------------------------------------------------------------------
-// Semaphores
-
-// '0-nary' semaphore
-// Similar to a counting semaphore except the value of one is never reached
-// as a consequence, a V() that would bring the value to 1 *spins* until
-// a P consumes it
-struct Semaphore0nary {
-	__spinlock_t lock; // needed to protect
-	mpsc_queue(thread$) queue;
-};
-
-static inline bool P(Semaphore0nary & this, thread$ * thrd) {
-	/* paranoid */ verify(!thrd`next);
-	/* paranoid */ verify(!(&(*thrd)`next));
-
-	push(this.queue, thrd);
-	return true;
-}
-
-static inline bool P(Semaphore0nary & this) {
-    thread$ * thrd = active_thread();
-    P(this, thrd);
-    park();
-    return true;
-}
-
-static inline thread$ * V(Semaphore0nary & this, bool doUnpark = true) {
-	thread$ * next;
-	lock(this.lock __cfaabi_dbg_ctx2);
-		for (;;) {
-			next = pop(this.queue);
-			if (next) break;
-			Pause();
-		}
-	unlock(this.lock);
-
-	if (doUnpark) unpark(next);
-	return next;
-}
-
-// Wrapper used on top of any sempahore to avoid potential locking
-struct BinaryBenaphore {
-	volatile ssize_t counter;
-};
-
-static inline {
-	void ?{}(BinaryBenaphore & this) { this.counter = 0; }
-	void ?{}(BinaryBenaphore & this, zero_t) { this.counter = 0; }
-	void ?{}(BinaryBenaphore & this, one_t ) { this.counter = 1; }
-
-	// returns true if no blocking needed
-	bool P(BinaryBenaphore & this) {
-		return __atomic_fetch_sub(&this.counter, 1, __ATOMIC_SEQ_CST) > 0;
-	}
-
-	bool tryP(BinaryBenaphore & this) {
-		ssize_t c = this.counter;
-		/* paranoid */ verify( c > MIN );
-		return (c >= 1) && __atomic_compare_exchange_n(&this.counter, &c, c-1, false, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
-	}
-
-	// returns true if notify needed
-	bool V(BinaryBenaphore & this) {
-		ssize_t c = 0;
-		for () {
-			/* paranoid */ verify( this.counter < MAX );
-			if (__atomic_compare_exchange_n(&this.counter, &c, c+1, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-				if (c == 0) return true;
-				/* paranoid */ verify(c < 0);
-				return false;
-			} else {
-				if (c == 1) return true;
-				/* paranoid */ verify(c < 1);
-				Pause();
-			}
-		}
-	}
-}
-
-// Binary Semaphore based on the BinaryBenaphore on top of the 0-nary Semaphore
-struct ThreadBenaphore {
-	BinaryBenaphore ben;
-	Semaphore0nary  sem;
-};
-
-static inline void ?{}(ThreadBenaphore & this) {}
-static inline void ?{}(ThreadBenaphore & this, zero_t) { (this.ben){ 0 }; }
-static inline void ?{}(ThreadBenaphore & this, one_t ) { (this.ben){ 1 }; }
-
-static inline bool P(ThreadBenaphore & this)              { return P(this.ben) ? false : P(this.sem); }
-static inline bool tryP(ThreadBenaphore & this)           { return tryP(this.ben); }
-static inline bool P(ThreadBenaphore & this, bool wait)   { return wait ? P(this) : tryP(this); }
-
-static inline thread$ * V(ThreadBenaphore & this, bool doUnpark = true) {
-	if (V(this.ben)) return 0p;
-	return V(this.sem, doUnpark);
-}
 
 //-----------------------------------------------------------------------------
@@ -171,51 +72,4 @@
 static inline void   on_wakeup( owner_lock & this, size_t v ) { on_wakeup ( (blocking_lock &)this, v ); }
 static inline void   on_notify( owner_lock & this, struct thread$ * t ) { on_notify( (blocking_lock &)this, t ); }
-
-struct fast_lock {
-	thread$ * volatile owner;
-	ThreadBenaphore sem;
-};
-
-static inline void ?{}(fast_lock & this) __attribute__((deprecated("use linear_backoff_then_block_lock instead")));
-static inline void ?{}(fast_lock & this) { this.owner = 0p; }
-
-static inline bool $try_lock(fast_lock & this, thread$ * thrd) {
-    thread$ * exp = 0p;
-    return __atomic_compare_exchange_n(&this.owner, &exp, thrd, false, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
-}
-
-static inline void lock( fast_lock & this ) __attribute__((deprecated("use linear_backoff_then_block_lock instead"), artificial));
-static inline void lock( fast_lock & this ) {
-	thread$ * thrd = active_thread();
-	/* paranoid */verify(thrd != this.owner);
-
-	for (;;) {
-		if ($try_lock(this, thrd)) return;
-		P(this.sem);
-	}
-}
-
-static inline bool try_lock( fast_lock & this ) __attribute__((deprecated("use linear_backoff_then_block_lock instead"), artificial));
-static inline bool try_lock ( fast_lock & this ) {
-	thread$ * thrd = active_thread();
-	/* paranoid */ verify(thrd != this.owner);
-	return $try_lock(this, thrd);
-}
-
-static inline thread$ * unlock( fast_lock & this ) __attribute__((deprecated("use linear_backoff_then_block_lock instead"), artificial));
-static inline thread$ * unlock( fast_lock & this ) {
-	/* paranoid */ verify(active_thread() == this.owner);
-
-	// open 'owner' before unlocking anyone
-	// so new and unlocked threads don't park incorrectly.
-	// This may require additional fencing on ARM.
-	this.owner = 0p;
-
-	return V(this.sem);
-}
-
-static inline size_t on_wait( fast_lock & this ) { unlock(this); return 0; }
-static inline void on_wakeup( fast_lock & this, size_t ) { lock(this); }
-static inline void on_notify( fast_lock &, struct thread$ * t ) { unpark(t); }
 
 struct mcs_node {
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision 0c51f9ad2f8684f00fd1f4b15c11d47001f992b2)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision a77f25bdb5a25184f01c1a1af685629803d572dc)
@@ -20,7 +20,8 @@
 
 
-#define USE_RELAXED_FIFO
+// #define USE_RELAXED_FIFO
 // #define USE_WORK_STEALING
 // #define USE_CPU_WORK_STEALING
+#define USE_AWARE_STEALING
 
 #include "bits/defs.hfa"
@@ -29,4 +30,5 @@
 
 #include "stdlib.hfa"
+#include "limits.hfa"
 #include "math.hfa"
 
@@ -54,5 +56,8 @@
 #endif
 
-#if   defined(USE_CPU_WORK_STEALING)
+#if   defined(USE_AWARE_STEALING)
+	#define READYQ_SHARD_FACTOR 2
+	#define SEQUENTIAL_SHARD 2
+#elif defined(USE_CPU_WORK_STEALING)
 	#define READYQ_SHARD_FACTOR 2
 #elif defined(USE_RELAXED_FIFO)
@@ -138,5 +143,4 @@
 	__kernel_rseq_register();
 
-	__cfadbg_print_safe(ready_queue, "Kernel : Registering proc %p for RW-Lock\n", proc);
 	bool * handle = (bool *)&kernelTLS().sched_lock;
 
@@ -174,6 +178,4 @@
 	}
 
-	__cfadbg_print_safe(ready_queue, "Kernel : Registering proc %p done, id %lu\n", proc, n);
-
 	// Return new spot.
 	/* paranoid */ verify(n < ready);
@@ -190,6 +192,4 @@
 
 	__atomic_store_n(cell, 0p, __ATOMIC_RELEASE);
-
-	__cfadbg_print_safe(ready_queue, "Kernel : Unregister proc %p\n", proc);
 
 	__kernel_rseq_unregister();
@@ -244,11 +244,45 @@
 
 //=======================================================================
+// caches handling
+
+struct __attribute__((aligned(128))) __ready_queue_caches_t {
+	// Count States:
+	// - 0  : No one is looking after this cache
+	// - 1  : No one is looking after this cache, BUT it's not empty
+	// - 2+ : At least one processor is looking after this cache
+	volatile unsigned count;
+};
+
+void  ?{}(__ready_queue_caches_t & this) { this.count = 0; }
+void ^?{}(__ready_queue_caches_t & this) {}
+
+static inline void depart(__ready_queue_caches_t & cache) {
+	/* paranoid */ verify( cache.count > 1);
+	__atomic_fetch_add(&cache.count, -1, __ATOMIC_SEQ_CST);
+	/* paranoid */ verify( cache.count != 0);
+	/* paranoid */ verify( cache.count < 65536 ); // This verify assumes no cluster will have more than 65000 kernel threads mapped to a single cache, which could be correct but is super weird.
+}
+
+static inline void arrive(__ready_queue_caches_t & cache) {
+	// Transition 0 -> 2 (first arriver claims the cache and marks it watched), otherwise +1; matches depart()'s verify(count > 1).
+	unsigned expected = cache.count;
+	unsigned desired  = 0 == expected ? 2 : expected + 1;
+	while(!__atomic_compare_exchange_n(&cache.count, &expected, desired, false, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) desired = 0 == expected ? 2 : expected + 1;
+}
+
+//=======================================================================
 // Cforall Ready Queue used for scheduling
 //=======================================================================
-unsigned long long moving_average(unsigned long long nval, unsigned long long oval) {
-	const unsigned long long tw = 16;
-	const unsigned long long nw = 4;
-	const unsigned long long ow = tw - nw;
-	return ((nw * nval) + (ow * oval)) / tw;
+unsigned long long moving_average(unsigned long long currtsc, unsigned long long instsc, unsigned long long old_avg) {
+	/* paranoid */ verifyf( currtsc < 45000000000000000, "Suspiciously large current time: %'llu (%llx)\n", currtsc, currtsc );
+	/* paranoid */ verifyf( instsc  < 45000000000000000, "Suspiciously large insert time: %'llu (%llx)\n", instsc, instsc );
+	/* paranoid */ verifyf( old_avg < 15000000000000, "Suspiciously large previous average: %'llu (%llx)\n", old_avg, old_avg );
+
+	const unsigned long long new_val = currtsc > instsc ? currtsc - instsc : 0;
+	const unsigned long long total_weight = 16;
+	const unsigned long long new_weight   = 4;
+	const unsigned long long old_weight = total_weight - new_weight;
+	const unsigned long long ret = ((new_weight * new_val) + (old_weight * old_avg)) / total_weight;
+	return ret;
 }
 
@@ -270,9 +304,15 @@
 			lanes.help[idx].tri = 0;
 		}
+
+		caches = alloc( cpu_info.llc_count );
+		for( idx; (size_t)cpu_info.llc_count ) {
+			(caches[idx]){};
+		}
 	#else
-		lanes.data  = 0p;
-		lanes.tscs  = 0p;
-		lanes.help  = 0p;
-		lanes.count = 0;
+		lanes.data   = 0p;
+		lanes.tscs   = 0p;
+		lanes.caches = 0p;
+		lanes.help   = 0p;
+		lanes.count  = 0;
 	#endif
 }
@@ -285,8 +325,129 @@
 	free(lanes.data);
 	free(lanes.tscs);
+	free(lanes.caches);
 	free(lanes.help);
 }
 
 //-----------------------------------------------------------------------
+#if defined(USE_AWARE_STEALING)
+	__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
+		processor * const proc = kernelTLS().this_processor;
+		const bool external = (!proc) || (cltr != proc->cltr);
+		const bool remote   = hint == UNPARK_REMOTE;
+
+		unsigned i;
+		if( external || remote ) {
+			// Figure out where thread was last time and make sure it's valid
+			/* paranoid */ verify(thrd->preferred >= 0);
+			if(thrd->preferred * READYQ_SHARD_FACTOR < lanes.count) {
+				/* paranoid */ verify(thrd->preferred * READYQ_SHARD_FACTOR < lanes.count);
+				unsigned start = thrd->preferred * READYQ_SHARD_FACTOR;
+				do {
+					unsigned r = __tls_rand();
+					i = start + (r % READYQ_SHARD_FACTOR);
+					/* paranoid */ verify( i < lanes.count );
+					// If we can't lock it retry
+				} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+			} else {
+				do {
+					i = __tls_rand() % lanes.count;
+				} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+			}
+		} else {
+			do {
+				unsigned r = proc->rdq.its++;
+				i = proc->rdq.id + (r % READYQ_SHARD_FACTOR);
+				/* paranoid */ verify( i < lanes.count );
+				// If we can't lock it retry
+			} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+		}
+
+		// Actually push it
+		push(lanes.data[i], thrd);
+
+		// Unlock and return
+		__atomic_unlock( &lanes.data[i].lock );
+
+		#if !defined(__CFA_NO_STATISTICS__)
+			if(unlikely(external || remote)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED);
+			else __tls_stats()->ready.push.local.success++;
+		#endif
+	}
+
+	static inline unsigned long long calc_cutoff(const unsigned long long ctsc, const processor * proc, __ready_queue_t & rdq) {
+		unsigned start = proc->rdq.id;
+		unsigned long long max = 0;
+		for(i; READYQ_SHARD_FACTOR) {
+			unsigned long long ptsc = ts(rdq.lanes.data[start + i]);
+			if(ptsc != -1ull) {
+				/* paranoid */ verify( start + i < rdq.lanes.count );
+				unsigned long long tsc = moving_average(ctsc, ptsc, rdq.lanes.tscs[start + i].ma);
+				if(tsc > max) max = tsc;
+			}
+		}
+		return (max + 2 * max) / 2;
+	}
+
+	__attribute__((hot)) struct thread$ * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
+		/* paranoid */ verify( lanes.count > 0 );
+		/* paranoid */ verify( kernelTLS().this_processor );
+		/* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes.count );
+
+		processor * const proc = kernelTLS().this_processor;
+		unsigned this = proc->rdq.id;
+		/* paranoid */ verify( this < lanes.count );
+		__cfadbg_print_safe(ready_queue, "Kernel : pop from %u\n", this);
+
+		// Figure out the current cpu and make sure it is valid
+		const int cpu = __kernel_getcpu();
+		/* paranoid */ verify(cpu >= 0);
+		/* paranoid */ verify(cpu < cpu_info.hthrd_count);
+		unsigned this_cache = cpu_info.llc_map[cpu].cache;
+		__atomic_store_n(&lanes.caches[this / READYQ_SHARD_FACTOR].id, this_cache, __ATOMIC_RELAXED);
+
+		const unsigned long long ctsc = rdtscl();
+
+		if(proc->rdq.target == MAX) {
+			uint64_t chaos = __tls_rand();
+			unsigned ext = chaos & 0xff;
+			unsigned other  = (chaos >> 8) % (lanes.count);
+
+			if(ext < 3 || __atomic_load_n(&lanes.caches[other / READYQ_SHARD_FACTOR].id, __ATOMIC_RELAXED) == this_cache) {
+				proc->rdq.target = other;
+			}
+		}
+		else {
+			const unsigned target = proc->rdq.target;
+			if(target < lanes.count) {
+				// Only index lanes.tscs after the bounds check: lanes.count can shrink, leaving a stale target.
+				__cfadbg_print_safe(ready_queue, "Kernel : %u considering helping %u, tcsc %llu\n", this, target, lanes.tscs[target].tv);
+				/* paranoid */ verify( lanes.tscs[target].tv != MAX );
+				const unsigned long long cutoff = calc_cutoff(ctsc, proc, cltr->ready_queue);
+				const unsigned long long age = moving_average(ctsc, lanes.tscs[target].tv, lanes.tscs[target].ma);
+				__cfadbg_print_safe(ready_queue, "Kernel : Help attempt on %u from %u, age %'llu vs cutoff %'llu, %s\n", target, this, age, cutoff, age > cutoff ? "yes" : "no");
+				if(age > cutoff) {
+					thread$ * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help));
+					if(t) return t;
+				}
+			}
+			proc->rdq.target = MAX;
+		}
+
+		for(READYQ_SHARD_FACTOR) {
+			unsigned i = this + (proc->rdq.itr++ % READYQ_SHARD_FACTOR);
+			if(thread$ * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t;
+		}
+
+		// All lanes where empty return 0p
+		return 0p;
+
+	}
+	__attribute__((hot)) struct thread$ * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
+		unsigned i = __tls_rand() % lanes.count;
+		return try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.steal));
+	}
+	__attribute__((hot)) struct thread$ * pop_search(struct cluster * cltr) {
+		return search(cltr);
+	}
+#endif
 #if defined(USE_CPU_WORK_STEALING)
 	__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
@@ -345,4 +506,22 @@
 	}
 
+	static inline int pop_getcpu(processor * proc, __ready_queue_caches_t * caches) {
+		const int prv = proc->rdq.cpu;
+		const int cpu = __kernel_getcpu();
+		if( prv != cpu ) {
+			unsigned pidx = cpu_info.llc_map[prv].cache;
+			/* paranoid */ verify(pidx < cpu_info.llc_count);
+
+			unsigned nidx = cpu_info.llc_map[cpu].cache;
+			/* paranoid */ verify(nidx < cpu_info.llc_count);
+
+			depart(caches[pidx]);
+			arrive(caches[nidx]);
+
+			__STATS( /* cpu migs++ */ )
+		}
+		return proc->rdq.cpu = cpu;
+	}
+
 	// Pop from the ready queue from a given cluster
 	__attribute__((hot)) thread$ * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
@@ -350,5 +529,7 @@
 		/* paranoid */ verify( kernelTLS().this_processor );
 
-		const int cpu = __kernel_getcpu();
+		processor * const proc = kernelTLS().this_processor;
+		const int cpu = pop_getcpu( proc, caches );
+		// const int cpu = __kernel_getcpu();
 		/* paranoid */ verify(cpu >= 0);
 		/* paranoid */ verify(cpu < cpu_info.hthrd_count);
@@ -360,10 +541,9 @@
 		/* paranoid */ verifyf((map.start + map.count) * READYQ_SHARD_FACTOR <= lanes.count, "have %zu lanes but map can go up to %u", lanes.count, (map.start + map.count) * READYQ_SHARD_FACTOR);
 
-		processor * const proc = kernelTLS().this_processor;
 		const int start = map.self * READYQ_SHARD_FACTOR;
 		const unsigned long long ctsc = rdtscl();
 
 		// Did we already have a help target
-		if(proc->rdq.target == -1u) {
+		if(proc->rdq.target == MAX) {
 			unsigned long long max = 0;
 			for(i; READYQ_SHARD_FACTOR) {
@@ -371,5 +551,5 @@
 				if(tsc > max) max = tsc;
 			}
-			 proc->rdq.cutoff = (max + 2 * max) / 2;
+			//  proc->rdq.cutoff = (max + 2 * max) / 2;
 			/* paranoid */ verify(lanes.count < 65536); // The following code assumes max 65536 cores.
 			/* paranoid */ verify(map.count < 65536); // The following code assumes max 65536 cores.
@@ -384,5 +564,5 @@
 			}
 
-			/* paranoid */ verify(proc->rdq.target != -1u);
+			/* paranoid */ verify(proc->rdq.target != MAX);
 		}
 		else {
@@ -395,22 +575,23 @@
 			{
 				unsigned target = proc->rdq.target;
-				proc->rdq.target = -1u;
+				proc->rdq.target = MAX;
 				lanes.help[target / READYQ_SHARD_FACTOR].tri++;
 				if(moving_average(ctsc - lanes.tscs[target].tv, lanes.tscs[target].ma) > cutoff) {
+					__STATS( __tls_stats()->ready.pop.helped[target]++; )
 					thread$ * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help));
 					proc->rdq.last = target;
 					if(t) return t;
-					else proc->rdq.target = -1u;
 				}
-				else proc->rdq.target = -1u;
+				proc->rdq.target = MAX;
 			}
 
 			unsigned last = proc->rdq.last;
-			if(last != -1u && lanes.tscs[last].tv < cutoff && ts(lanes.data[last]) < cutoff) {
+			if(last != MAX && moving_average(ctsc - lanes.tscs[last].tv, lanes.tscs[last].ma) > cutoff) {
+				__STATS( __tls_stats()->ready.pop.helped[last]++; )
 				thread$ * t = try_pop(cltr, last __STATS(, __tls_stats()->ready.pop.help));
 				if(t) return t;
 			}
 			else {
-				proc->rdq.last = -1u;
+				proc->rdq.last = MAX;
 			}
 		}
@@ -428,8 +609,8 @@
 		processor * const proc = kernelTLS().this_processor;
 		unsigned last = proc->rdq.last;
-		if(last != -1u) {
+		if(last != MAX) {
 			struct thread$ * t = try_pop(cltr, last __STATS(, __tls_stats()->ready.pop.steal));
 			if(t) return t;
-			proc->rdq.last = -1u;
+			proc->rdq.last = MAX;
 		}
 
@@ -560,5 +741,5 @@
 		#else
 			unsigned preferred = thrd->preferred;
-			const bool external = (hint != UNPARK_LOCAL) || (!kernelTLS().this_processor) || preferred == -1u || thrd->curr_cluster != cltr;
+			const bool external = (hint != UNPARK_LOCAL) || (!kernelTLS().this_processor) || preferred == MAX || thrd->curr_cluster != cltr;
 			/* paranoid */ verifyf(external || preferred < lanes.count, "Invalid preferred queue %u for %u lanes", preferred, lanes.count );
 
@@ -612,5 +793,5 @@
 		processor * proc = kernelTLS().this_processor;
 
-		if(proc->rdq.target == -1u) {
+		if(proc->rdq.target == MAX) {
 			unsigned long long min = ts(lanes.data[proc->rdq.id]);
 			for(int i = 0; i < READYQ_SHARD_FACTOR; i++) {
@@ -623,5 +804,5 @@
 		else {
 			unsigned target = proc->rdq.target;
-			proc->rdq.target = -1u;
+			proc->rdq.target = MAX;
 			const unsigned long long bias = 0; //2_500_000_000;
 			const unsigned long long cutoff = proc->rdq.cutoff > bias ? proc->rdq.cutoff - bias : proc->rdq.cutoff;
@@ -658,4 +839,5 @@
 // try to pop from a lane given by index w
 static inline struct thread$ * try_pop(struct cluster * cltr, unsigned w __STATS(, __stats_readyQ_pop_t & stats)) with (cltr->ready_queue) {
+	/* paranoid */ verify( w < lanes.count );
 	__STATS( stats.attempt++; )
 
@@ -681,5 +863,5 @@
 	// Actually pop the list
 	struct thread$ * thrd;
-	#if defined(USE_WORK_STEALING) || defined(USE_CPU_WORK_STEALING)
+	#if defined(USE_AWARE_STEALING) || defined(USE_WORK_STEALING) || defined(USE_CPU_WORK_STEALING)
 		unsigned long long tsc_before = ts(lane);
 	#endif
@@ -697,11 +879,14 @@
 	__STATS( stats.success++; )
 
-	#if defined(USE_WORK_STEALING) || defined(USE_CPU_WORK_STEALING)
-		unsigned long long now = rdtscl();
-		lanes.tscs[w].tv = tsv;
-		lanes.tscs[w].ma = moving_average(now > tsc_before ? now - tsc_before : 0, lanes.tscs[w].ma);
+	#if defined(USE_AWARE_STEALING) || defined(USE_WORK_STEALING) || defined(USE_CPU_WORK_STEALING)
+		if (tsv != MAX) {
+			unsigned long long now = rdtscl();
+			unsigned long long pma = __atomic_load_n(&lanes.tscs[w].ma, __ATOMIC_RELAXED);
+			__atomic_store_n(&lanes.tscs[w].tv, tsv, __ATOMIC_RELAXED);
+			__atomic_store_n(&lanes.tscs[w].ma, moving_average(now, tsc_before, pma), __ATOMIC_RELAXED);
+		}
 	#endif
 
-	#if defined(USE_CPU_WORK_STEALING)
+	#if defined(USE_AWARE_STEALING) || defined(USE_CPU_WORK_STEALING)
 		thrd->preferred = w / READYQ_SHARD_FACTOR;
 	#else
@@ -802,5 +987,5 @@
 		/* paranoid */ verifyf( it, "Unexpected null iterator, at index %u of %u\n", i, count);
 		it->rdq.id = value;
-		it->rdq.target = -1u;
+		it->rdq.target = MAX;
 		value += READYQ_SHARD_FACTOR;
 		it = &(*it)`next;
@@ -815,10 +1000,9 @@
 
 static void fix_times( struct cluster * cltr ) with( cltr->ready_queue ) {
-	#if defined(USE_WORK_STEALING)
+	#if defined(USE_AWARE_STEALING) || defined(USE_WORK_STEALING)
 		lanes.tscs = alloc(lanes.count, lanes.tscs`realloc);
 		for(i; lanes.count) {
-			unsigned long long tsc1 = ts(lanes.data[i]);
-			unsigned long long tsc2 = rdtscl();
-			lanes.tscs[i].tv = min(tsc1, tsc2);
+			lanes.tscs[i].tv = rdtscl();
+			lanes.tscs[i].ma = 0;
 		}
 	#endif
@@ -866,4 +1050,6 @@
 			// Update original
 			lanes.count = ncount;
+
+			lanes.caches = alloc( target, lanes.caches`realloc );
 		}
 
@@ -942,7 +1128,10 @@
 				fix(lanes.data[idx]);
 			}
+
+			lanes.caches = alloc( target, lanes.caches`realloc );
 		}
 
 		fix_times(cltr);
+
 
 		reassign_cltr_id(cltr);
Index: tests/concurrent/.expect/semaphore.txt
===================================================================
--- tests/concurrent/.expect/semaphore.txt	(revision 0c51f9ad2f8684f00fd1f4b15c11d47001f992b2)
+++ 	(revision )
@@ -1,3 +1,0 @@
-Starting
-Done!
-Match!
Index: tests/concurrent/.expect/spinaphore.txt
===================================================================
--- tests/concurrent/.expect/spinaphore.txt	(revision 0c51f9ad2f8684f00fd1f4b15c11d47001f992b2)
+++ 	(revision )
@@ -1,3 +1,0 @@
-Starting
-Done!
-Match!
Index: tests/concurrent/semaphore.cfa
===================================================================
--- tests/concurrent/semaphore.cfa	(revision 0c51f9ad2f8684f00fd1f4b15c11d47001f992b2)
+++ 	(revision )
@@ -1,78 +1,0 @@
-#include <fstream.hfa>
-#include <locks.hfa>
-#include <thread.hfa>
-#include <mutex_stmt.hfa>
-
-enum { num_blockers = 17, num_unblockers = 13 };
-
-void thrash() {
-	unsigned t[100];
-	for(i; 100) {
-		t[i] = 0xDEADBEEF;
-	}
-}
-
-ThreadBenaphore ben;
-
-// const unsigned int num_blocks = 25000;
-const unsigned int num_blocks = 5;
-
-thread Blocker {
-	size_t sum;
-};
-
-void main(Blocker & this) {
-	thread$ * me = active_thread();
-	this.sum = 0;
-	for(num_blocks) {
-		this.sum += (unsigned)me;
-		thrash();
-		P(ben);
-		if(((thread&)this).seqable.next != 0p) mutex(sout) sout | "Link not invalidated";
-		thrash();
-	}
-}
-
-thread Unblocker {
-	size_t sum;
-};
-
-void main(Unblocker & this) {
-	this.sum = 0;
-	LOOP: for() {
-		waitfor( ^?{} : this) {
-			break LOOP;
-		}
-		or else {}
-
-		thread$ * t = V(ben, false);
-		if(t) {
-			this.sum += (unsigned)t;
-			unpark(t);
-		}
-		yield(random(10));
-	}
-}
-
-int main() {
-	size_t usum = 0;
-	size_t bsum = 0;
-
-	sout | "Starting";
-	{
-		Blocker   blockers  [num_blockers  ];
-		Unblocker unblockers[num_unblockers];
-
-		for(i;num_blockers) {
-			bsum += join(blockers[i]).sum;
-		}
-
-		sout | "Done!";
-
-		for(i;num_unblockers) {
-			usum += join(unblockers[i]).sum;
-		}
-	}
-	if(bsum == usum) sout | "Match!";
-	else sout | "No Match!" | usum | "!=" | bsum;
-}
Index: tests/concurrent/spinaphore.cfa
===================================================================
--- tests/concurrent/spinaphore.cfa	(revision 0c51f9ad2f8684f00fd1f4b15c11d47001f992b2)
+++ 	(revision )
@@ -1,94 +1,0 @@
-#include <fstream.hfa>
-#include <locks.hfa>
-#include <thread.hfa>
-
-enum { num_blockers = 17, num_unblockers = 13 };
-
-void thrash() {
-	unsigned t[100];
-	for(i; 100) {
-		t[i] = 0xDEADBEEF;
-	}
-}
-
-Semaphore0nary sem;
-
-const unsigned int num_blocks = 1625;
-
-thread Blocker {
-	size_t sum;
-};
-void main(Blocker & this);
-
-Blocker * from_thread(thread$ * t) {
-	Blocker & nullb = *(Blocker*)0p;
-	thread$ & nullt = (thread&)nullb;
-	uintptr_t offset  = (uintptr_t)&nullt;
-	uintptr_t address = ((uintptr_t)t) - offset;
-	return (Blocker*)address;
-}
-
-void main(Blocker & this) {
-	thread$ * me = active_thread();
-	Blocker * me1 = &this;
-	Blocker * me2 = from_thread(me);
-	if( me1 != me2 ) sout | "Bad casting!" | me1 | "vs" | me2;
-	this.sum = 0;
-	for(num_blocks) {
-		P(sem);
-		if(((thread&)this).seqable.next != 0p) sout | "Link not invalidated";
-	}
-}
-
-const unsigned int num_unblocks = 2125;
-
-thread Unblocker {
-	size_t sum;
-};
-
-void main(Unblocker & this) {
-	this.sum = 0;
-	unsigned me = (unsigned)(uintptr_t)&this;
-	for(num_unblocks) {
-		thread$ * t = V(sem, false);
-		Blocker * b = from_thread(t);
-		b->sum += me;
-		this.sum += (unsigned)(uintptr_t)b;
-		unpark(t);
-		yield(random(10));
-	}
-}
-
-int main() {
-	size_t usum = 0;
-	size_t bsum = 0;
-
-	if((num_unblocks * num_unblockers) != (num_blocks * num_blockers)) sout | "Mismatched Operations: " | (num_unblocks * num_unblockers) | "vs" | (num_blocks * num_blockers);
-
-	sout | "Starting";
-	{
-		Blocker   blockers  [num_blockers  ];
-		Unblocker unblockers[num_unblockers];
-
-		for(i;num_blockers) {
-			for(num_blocks)
-				usum += (unsigned)(uintptr_t)&blockers[i];
-		}
-
-		for(i;num_unblockers) {
-			for(num_unblocks)
-				bsum += (unsigned)(uintptr_t)&unblockers[i];
-		}
-
-		for(i;num_blockers) {
-			bsum -= join(blockers[i]).sum;
-		}
-
-		for(i;num_unblockers) {
-			usum -= join(unblockers[i]).sum;
-		}
-	}
-	sout | "Done!";
-	if(bsum == 0 && usum == 0) sout | "Match!";
-	else sout | "No Match!" | usum | "/" | bsum;
-}
Index: tests/device/cpu.cfa
===================================================================
--- tests/device/cpu.cfa	(revision 0c51f9ad2f8684f00fd1f4b15c11d47001f992b2)
+++ tests/device/cpu.cfa	(revision a77f25bdb5a25184f01c1a1af685629803d572dc)
@@ -15,6 +15,7 @@
 
 
+#include <device/cpu.hfa>
+#include <limits.hfa>
 #include <fstream.hfa>
-#include <device/cpu.hfa>
 #include <stdlib.hfa>
 
@@ -118,5 +119,5 @@
 
 	unsigned found_level = 0;
-	unsigned found = -1u;
+	unsigned found = MAX;
 	for(i; idxs) {
 		unsigned idx = idxs - 1 - i;
@@ -136,5 +137,5 @@
 	}
 
-	/* paranoid */ verify(found != -1u);
+	/* paranoid */ verify(found != MAX);
 	return found;
 }
Index: tests/unified_locking/.expect/locks.txt
===================================================================
--- tests/unified_locking/.expect/locks.txt	(revision 0c51f9ad2f8684f00fd1f4b15c11d47001f992b2)
+++ tests/unified_locking/.expect/locks.txt	(revision a77f25bdb5a25184f01c1a1af685629803d572dc)
@@ -11,18 +11,14 @@
 Start Test 6: owner lock and condition variable 3 wait/notify all
 Done Test 6
-Start Test 7: fast lock and condition variable single wait/notify
+Start Test 7: linear backoff lock and condition variable single wait/notify
 Done Test 7
-Start Test 8: fast lock and condition variable 3 wait/notify all
+Start Test 8: linear backoff lock and condition variable 3 wait/notify all
 Done Test 8
-Start Test 9: linear backoff lock and condition variable single wait/notify
+Start Test 9: multi acquisiton lock and condition variable multiple acquire and wait/notify
 Done Test 9
-Start Test 10: linear backoff lock and condition variable 3 wait/notify all
+Start Test 10: owner lock and condition variable multiple acquire and wait/notify
 Done Test 10
-Start Test 11: multi acquisiton lock and condition variable multiple acquire and wait/notify
+Start Test 11: no lock condition variable wait/notify
 Done Test 11
-Start Test 12: owner lock and condition variable multiple acquire and wait/notify
+Start Test 12: locked condition variable wait/notify with front()
 Done Test 12
-Start Test 13: no lock condition variable wait/notify
-Done Test 13
-Start Test 14: locked condition variable wait/notify with front()
-Done Test 14
Index: tests/unified_locking/locks.cfa
===================================================================
--- tests/unified_locking/locks.cfa	(revision 0c51f9ad2f8684f00fd1f4b15c11d47001f992b2)
+++ tests/unified_locking/locks.cfa	(revision a77f25bdb5a25184f01c1a1af685629803d572dc)
@@ -15,7 +15,4 @@
 condition_variable( owner_lock ) c_o;
 
-fast_lock f;
-condition_variable( fast_lock ) c_f;
-
 linear_backoff_then_block_lock l;
 condition_variable( linear_backoff_then_block_lock ) c_l;
@@ -74,32 +71,4 @@
 		}
 		unlock(s);
-	}
-}
-
-thread T_C_F_WS1 {};
-
-void main( T_C_F_WS1 & this ) {
-	for (unsigned int i = 0; i < num_times; i++) {
-		lock(f);
-		if(empty(c_f) && i != num_times - 1) {
-			wait(c_f,f);
-		}else{
-			notify_one(c_f);
-		}
-		unlock(f);
-	}
-}
-
-thread T_C_F_WB1 {};
-
-void main( T_C_F_WB1 & this ) {
-	for (unsigned int i = 0; i < num_times; i++) {
-		lock(f);
-		if(counter(c_f) == 3 || i == num_times - 1) {
-			notify_all(c_f);
-		}else{
-			wait(c_f,f);
-		}
-		unlock(f);
 	}
 }
@@ -317,51 +286,39 @@
 	printf("Done Test 6\n");
 
-	printf("Start Test 7: fast lock and condition variable single wait/notify\n");
-	{
-		T_C_F_WS1 t1[2];
+	printf("Start Test 7: linear backoff lock and condition variable single wait/notify\n");
+	{
+		T_C_L_WS1 t1[2];
 	}
 	printf("Done Test 7\n");
 
-	printf("Start Test 8: fast lock and condition variable 3 wait/notify all\n");
-	{
-		T_C_F_WB1 t1[4];
+	printf("Start Test 8: linear backoff lock and condition variable 3 wait/notify all\n");
+	{
+		T_C_L_WB1 t1[4];
 	}
 	printf("Done Test 8\n");
 
-	printf("Start Test 9: linear backoff lock and condition variable single wait/notify\n");
-	{
-		T_C_L_WS1 t1[2];
+	printf("Start Test 9: multi acquisiton lock and condition variable multiple acquire and wait/notify\n");
+	{
+		T_C_M_WS2 t1[2];
 	}
 	printf("Done Test 9\n");
 
-	printf("Start Test 10: linear backoff lock and condition variable 3 wait/notify all\n");
-	{
-		T_C_L_WB1 t1[4];
+	printf("Start Test 10: owner lock and condition variable multiple acquire and wait/notify\n");
+	{
+		T_C_O_WS2 t1[2];
 	}
 	printf("Done Test 10\n");
 
-	printf("Start Test 11: multi acquisiton lock and condition variable multiple acquire and wait/notify\n");
-	{
-		T_C_M_WS2 t1[2];
-	}
-	printf("Done Test 11\n");
-
-	printf("Start Test 12: owner lock and condition variable multiple acquire and wait/notify\n");
-	{
-		T_C_O_WS2 t1[2];
-	}
-	printf("Done Test 12\n");
-
-	printf("Start Test 13: no lock condition variable wait/notify\n");
+	printf("Start Test 11: no lock condition variable wait/notify\n");
 	{
 		T_C_NLW t1;
 		T_C_NLS t2;
 	}
-	printf("Done Test 13\n");
-
-	printf("Start Test 14: locked condition variable wait/notify with front()\n");
+	printf("Done Test 11\n");
+
+	printf("Start Test 12: locked condition variable wait/notify with front()\n");
 	{
 		T_C_S_WNF t1[2];
 	}
-	printf("Done Test 14\n");
-}
+	printf("Done Test 12\n");
+}
Index: tests/zombies/fastlock.cfa
===================================================================
--- tests/zombies/fastlock.cfa	(revision 0c51f9ad2f8684f00fd1f4b15c11d47001f992b2)
+++ 	(revision )
@@ -1,8 +1,0 @@
-#include <locks.hfa>
-
-#define LOCK fast_lock
-#include "mutex_test.hfa"
-
-int main() {
-    test();
-}
