Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision bc4a43356f310b9a005c19fd080a2a54f6385c31)
+++ libcfa/src/concurrency/kernel.hfa	(revision 089d30c78871e1c30101300a99735d1f7c29787d)
@@ -151,7 +151,15 @@
 struct __attribute__((aligned(128))) __timestamp_t {
 	volatile unsigned long long tv;
-};
-
-static inline void  ?{}(__timestamp_t & this) { this.tv = 0; }
+	volatile unsigned long long ma;	// moving average of (rdtscl() - ts(lane)), refreshed by moving_average() when the lane is popped
+};
+
+// Aligned helping counters ("stats") kept by the ready queue; allocated with one entry per cpu_info.hthrd_count
+struct __attribute__((aligned(128))) __help_cnts_t {
+	volatile unsigned long long src;	// only zero-initialized in the visible code; presumably helps issued from this entry - TODO confirm
+	volatile unsigned long long dst;	// only zero-initialized in the visible code; presumably helps received by this entry - TODO confirm
+	volatile unsigned long long tri;	// incremented each time a processor examines a stored help target
+};
+
+static inline void  ?{}(__timestamp_t & this) { this.tv = 0; this.ma = 0; } // Zero both fields. NOTE(review): the ready-queue constructor seeds tv and ma with rdtscl() instead - confirm the differing initial values are intended
 static inline void ^?{}(__timestamp_t & this) {}
 
@@ -169,4 +177,7 @@
 		// Array of times
 		__timestamp_t * volatile tscs;
+
+		// Array of stats
+		__help_cnts_t * volatile help;
 
 		// Number of lanes (empty or not)
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision bc4a43356f310b9a005c19fd080a2a54f6385c31)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision 089d30c78871e1c30101300a99735d1f7c29787d)
@@ -246,4 +246,11 @@
 // Cforall Ready Queue used for scheduling
 //=======================================================================
+unsigned long long moving_average(unsigned long long nval, unsigned long long oval) { // Blend new sample nval into previous average oval and return the updated average
+	const unsigned long long tw = 16;	// total weight, i.e. the denominator
+	const unsigned long long nw = 4;	// weight given to the new sample (4/16)
+	const unsigned long long ow = tw - nw;	// weight given to the previous average (12/16)
+	return ((nw * nval) + (ow * oval)) / tw;	// integer division truncates; the products wrap modulo 2^64 for very large inputs
+}
+
 void ?{}(__ready_queue_t & this) with (this) {
 	#if defined(USE_CPU_WORK_STEALING)
@@ -251,12 +258,20 @@
 		lanes.data = alloc( lanes.count );
 		lanes.tscs = alloc( lanes.count );
+		lanes.help = alloc( cpu_info.hthrd_count );
 
 		for( idx; (size_t)lanes.count ) {
 			(lanes.data[idx]){};
 			lanes.tscs[idx].tv = rdtscl();
+			lanes.tscs[idx].ma = rdtscl();
+		}
+		for( idx; (size_t)cpu_info.hthrd_count ) {
+			lanes.help[idx].src = 0;
+			lanes.help[idx].dst = 0;
+			lanes.help[idx].tri = 0;
 		}
 	#else
 		lanes.data  = 0p;
 		lanes.tscs  = 0p;
+		lanes.help  = 0p;
 		lanes.count = 0;
 	#endif
@@ -270,4 +285,5 @@
 	free(lanes.data);
 	free(lanes.tscs);
+	free(lanes.help);
 }
 
@@ -332,19 +348,18 @@
 		processor * const proc = kernelTLS().this_processor;
 		const int start = map.self * READYQ_SHARD_FACTOR;
+		const unsigned long long ctsc = rdtscl();
 
 		// Did we already have a help target
 		if(proc->rdq.target == -1u) {
-			// if We don't have a
-			unsigned long long min = ts(lanes.data[start]);
+			unsigned long long max = 0;
 			for(i; READYQ_SHARD_FACTOR) {
-				unsigned long long tsc = ts(lanes.data[start + i]);
-				if(tsc < min) min = tsc;
-			}
-			proc->rdq.cutoff = min;
-
+				unsigned long long tsc = moving_average(ctsc - ts(lanes.data[start + i]), lanes.tscs[start + i].ma);
+				if(tsc > max) max = tsc;
+			}
+			 proc->rdq.cutoff = (max + 2 * max) / 2;
 			/* paranoid */ verify(lanes.count < 65536); // The following code assumes max 65536 cores.
 			/* paranoid */ verify(map.count < 65536); // The following code assumes max 65536 cores.
 
-			if(0 == (__tls_rand() % 10_000)) {
+			if(0 == (__tls_rand() % 100)) {
 				proc->rdq.target = __tls_rand() % lanes.count;
 			} else {
@@ -358,14 +373,21 @@
 		}
 		else {
-			const unsigned long long bias = 0; //2_500_000_000;
-			const unsigned long long cutoff = proc->rdq.cutoff > bias ? proc->rdq.cutoff - bias : proc->rdq.cutoff;
+			unsigned long long max = 0;
+			for(i; READYQ_SHARD_FACTOR) {
+				unsigned long long tsc = moving_average(ctsc - ts(lanes.data[start + i]), lanes.tscs[start + i].ma);
+				if(tsc > max) max = tsc;
+			}
+			const unsigned long long cutoff = (max + 2 * max) / 2;
 			{
 				unsigned target = proc->rdq.target;
 				proc->rdq.target = -1u;
-				if(lanes.tscs[target].tv < cutoff && ts(lanes.data[target]) < cutoff) {
+				lanes.help[target / READYQ_SHARD_FACTOR].tri++;	// help has one entry per hardware thread while target is a lane index: map the lane to its owner (assumes READYQ_SHARD_FACTOR lanes per hardware thread)
+				if(moving_average(ctsc - lanes.tscs[target].tv, lanes.tscs[target].ma) > cutoff) {	// only help when the target's averaged age exceeds the local cutoff
 					thread$ * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help));
 					proc->rdq.last = target;
 					if(t) return t;
+					else proc->rdq.target = -1u;
 				}
+				else proc->rdq.target = -1u;
 			}
 
@@ -645,4 +667,5 @@
 	// Actually pop the list
 	struct thread$ * thrd;
+	unsigned long long tsc_before = ts(lane);
 	unsigned long long tsv;
 	[thrd, tsv] = pop(lane);
@@ -658,6 +681,8 @@
 	__STATS( stats.success++; )
 
-	#if defined(USE_WORK_STEALING)
+	#if defined(USE_WORK_STEALING) || defined(USE_CPU_WORK_STEALING)
+		unsigned long long now = rdtscl();
 		lanes.tscs[w].tv = tsv;
+		lanes.tscs[w].ma = moving_average(now > tsc_before ? now - tsc_before : 0, lanes.tscs[w].ma);
 	#endif
 
Index: libcfa/src/concurrency/thread.cfa
===================================================================
--- libcfa/src/concurrency/thread.cfa	(revision bc4a43356f310b9a005c19fd080a2a54f6385c31)
+++ libcfa/src/concurrency/thread.cfa	(revision 089d30c78871e1c30101300a99735d1f7c29787d)
@@ -25,4 +25,6 @@
 #include "invoke.h"
 
+uint64_t thread_rand();	// forward declaration only (no body in this file's visible portion); used by the thread constructor to choose `preferred`
+
 //-----------------------------------------------------------------------------
 // Thread ctors and dtors
@@ -41,5 +43,5 @@
 	link.next = 0p;
 	link.ts   = -1llu;
-	preferred = -1u;
+	preferred = cl.ready_queue.lanes.count != 0 ? thread_rand() % cl.ready_queue.lanes.count : -1u;	// random preferred lane; keep the old -1u sentinel while the cluster has no lanes, avoiding a modulo by zero
 	last_proc = 0p;
 	#if defined( __CFA_WITH_VERIFY__ )