Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 9cc3a185c1994c15462c67f498788eaade4cc34c)
+++ libcfa/src/concurrency/kernel.cfa	(revision 431cd4fc38b74e3dd30d07d84dd46cb8017afe4e)
@@ -474,5 +474,5 @@
 
 	ready_schedule_lock();
-		$thread * thrd = pop( this );
+		$thread * thrd = pop_fast( this );
 	ready_schedule_unlock();
 
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 9cc3a185c1994c15462c67f498788eaade4cc34c)
+++ libcfa/src/concurrency/kernel.hfa	(revision 431cd4fc38b74e3dd30d07d84dd46cb8017afe4e)
@@ -69,6 +69,12 @@
 	struct cluster * cltr;
 
-	// Id within the cluster
-	unsigned cltr_id;
+	// Ready Queue state per processor
+	struct {
+		unsigned short its;
+		unsigned short itr;
+		unsigned id;
+		unsigned target;
+		unsigned long long int cutoff;
+	} rdq;
 
 	// Set to true to notify the processor should terminate
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision 9cc3a185c1994c15462c67f498788eaade4cc34c)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision 431cd4fc38b74e3dd30d07d84dd46cb8017afe4e)
@@ -469,5 +469,9 @@
 	this.name = name;
 	this.cltr = &_cltr;
-	this.cltr_id = -1u;
+	this.rdq.its = 0;
+	this.rdq.itr = 0;
+	this.rdq.id  = -1u;
+	this.rdq.target = -1u;
+	this.rdq.cutoff = -1ull;
 	do_terminate = false;
 	preemption_alarm = 0p;
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 9cc3a185c1994c15462c67f498788eaade4cc34c)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision 431cd4fc38b74e3dd30d07d84dd46cb8017afe4e)
@@ -292,5 +292,5 @@
 // returns 0p if empty
 // May return 0p spuriously
-__attribute__((hot)) struct $thread * pop(struct cluster * cltr);
+__attribute__((hot)) struct $thread * pop_fast(struct cluster * cltr);
 
 //-----------------------------------------------------------------------
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision 9cc3a185c1994c15462c67f498788eaade4cc34c)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision 431cd4fc38b74e3dd30d07d84dd46cb8017afe4e)
@@ -52,5 +52,6 @@
 static inline [unsigned, bool] idx_from_r(unsigned r, unsigned preferred);
 static inline struct $thread * try_pop(struct cluster * cltr, unsigned w);
-static struct $thread * try_pop(struct cluster * cltr, unsigned i, unsigned j);
+static inline struct $thread * try_pop(struct cluster * cltr, unsigned i, unsigned j);
+static inline struct $thread * search(struct cluster * cltr);
 
 
@@ -221,120 +222,224 @@
 
 //-----------------------------------------------------------------------
-__attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) {
-	__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
-
-	const bool external = (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
-	/* paranoid */ verify(external || kernelTLS().this_processor->cltr_id < lanes.count );
-
-	// write timestamp
-	thrd->link.ts = rdtscl();
-
-	bool local;
-	int preferred = external ? -1 : kernelTLS().this_processor->cltr_id;
-
-	// Try to pick a lane and lock it
-	unsigned i;
-	do {
-		// Pick the index of a lane
-		unsigned r = __tls_rand_fwd();
-		[i, local] = idx_from_r(r, preferred);
-
-		i %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
-
+#if defined(USE_RELAXED_FIFO)
+	//-----------------------------------------------------------------------
+	// get index from random number with or without bias towards queues
+	// returns [raw lane index (caller still applies % lanes.count), whether a preferred lane was used]
+	static inline [unsigned, bool] idx_from_r(unsigned r, unsigned preferred) {
+		unsigned i;
+		bool local;
+		unsigned rlow  = r % BIAS;
+		unsigned rhigh = r / BIAS;
+		if((0 != rlow) && preferred != -1u) { // -1u is the "no preference" sentinel (callers pass int -1); 'preferred >= 0' is always true for unsigned and mis-classified external pushes as local
+			// (BIAS - 1) out of BIAS chances
+			// Use preferred queues
+			i = preferred + (rhigh % READYQ_SHARD_FACTOR);
+			local = true;
+		}
+		else {
+			// 1 out of BIAS chances
+			// Use all queues
+			i = rhigh;
+			local = false;
+		}
+		return [i, local];
+	}
+
+	__attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) {
+		__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
+
+		const bool external = (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr); // no processor TLS, or pushing onto another cluster
+		/* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );
+
+		// write timestamp
+		thrd->link.ts = rdtscl();
+
+		bool local;
+		int preferred = external ? -1 : kernelTLS().this_processor->rdq.id; // -1 = no preferred lane (see idx_from_r)
+
+		// Try to pick a lane and lock it
+		unsigned i;
+		do {
+			// Pick the index of a lane
+			unsigned r = __tls_rand_fwd();
+			[i, local] = idx_from_r(r, preferred);
+
+			i %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED ); // lane count can change concurrently; re-read each attempt
+
+			#if !defined(__CFA_NO_STATISTICS__)
+				if(external) {
+					if(local) __atomic_fetch_add(&cltr->stats->ready.pick.ext.local, 1, __ATOMIC_RELAXED);
+					__atomic_fetch_add(&cltr->stats->ready.pick.ext.attempt, 1, __ATOMIC_RELAXED);
+				}
+				else {
+					if(local) __tls_stats()->ready.pick.push.local++;
+					__tls_stats()->ready.pick.push.attempt++;
+				}
+			#endif
+
+		#if defined(USE_MPSC)
+			// mpsc always succeeds
+		} while( false );
+		#else
+			// If we can't lock it retry
+		} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+		#endif
+
+		// Actually push it
+		push(lanes.data[i], thrd);
+
+		#if !defined(USE_MPSC)
+			// Unlock and return
+			__atomic_unlock( &lanes.data[i].lock );
+		#endif
+
+		// Mark the current index in the tls rng instance as having an item
+		__tls_rand_advance_bck();
+
+		__cfadbg_print_safe(ready_queue, "Kernel : Pushed %p on cluster %p (idx: %u, mask %llu, first %d)\n", thrd, cltr, i, used.mask[0], lane_first);
+
+		// Update statistics
 		#if !defined(__CFA_NO_STATISTICS__)
 			if(external) {
-				if(local) __atomic_fetch_add(&cltr->stats->ready.pick.ext.local, 1, __ATOMIC_RELAXED);
-				__atomic_fetch_add(&cltr->stats->ready.pick.ext.attempt, 1, __ATOMIC_RELAXED);
+				if(local) __atomic_fetch_add(&cltr->stats->ready.pick.ext.lsuccess, 1, __ATOMIC_RELAXED);
+				__atomic_fetch_add(&cltr->stats->ready.pick.ext.success, 1, __ATOMIC_RELAXED);
 			}
 			else {
-				if(local) __tls_stats()->ready.pick.push.local++;
-				__tls_stats()->ready.pick.push.attempt++;
+				if(local) __tls_stats()->ready.pick.push.lsuccess++;
+				__tls_stats()->ready.pick.push.success++;
 			}
 		#endif
-
-	#if defined(USE_MPSC)
-		// mpsc always succeeds
-	} while( false );
-	#else
-		// If we can't lock it retry
-	} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
-	#endif
-
-	// Actually push it
-	push(lanes.data[i], thrd);
-
-	#if !defined(USE_MPSC)
-		// Unlock and return
-		__atomic_unlock( &lanes.data[i].lock );
-	#endif
-
-	// Mark the current index in the tls rng instance as having an item
-	__tls_rand_advance_bck();
-
-	__cfadbg_print_safe(ready_queue, "Kernel : Pushed %p on cluster %p (idx: %u, mask %llu, first %d)\n", thrd, cltr, i, used.mask[0], lane_first);
-
-	// Update statistics
-	#if !defined(__CFA_NO_STATISTICS__)
-		if(external) {
-			if(local) __atomic_fetch_add(&cltr->stats->ready.pick.ext.lsuccess, 1, __ATOMIC_RELAXED);
-			__atomic_fetch_add(&cltr->stats->ready.pick.ext.success, 1, __ATOMIC_RELAXED);
-		}
-		else {
-			if(local) __tls_stats()->ready.pick.push.lsuccess++;
-			__tls_stats()->ready.pick.push.success++;
-		}
-	#endif
-}
-
-// Pop from the ready queue from a given cluster
-__attribute__((hot)) $thread * pop(struct cluster * cltr) with (cltr->ready_queue) {
-	/* paranoid */ verify( lanes.count > 0 );
-	/* paranoid */ verify(kernelTLS().this_processor->cltr_id < lanes.count );
-
-	unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
-	int preferred = kernelTLS().this_processor->cltr_id;
-
-
-	// As long as the list is not empty, try finding a lane that isn't empty and pop from it
-	for(25) {
-		// Pick two lists at random
-		unsigned ri = __tls_rand_bck();
-		unsigned rj = __tls_rand_bck();
-
-		unsigned i, j;
-		__attribute__((unused)) bool locali, localj;
-		[i, locali] = idx_from_r(ri, preferred);
-		[j, localj] = idx_from_r(rj, preferred);
-
-		#if !defined(__CFA_NO_STATISTICS__)
-			if(locali && localj) {
-				__tls_stats()->ready.pick.pop.local++;
+	}
+
+	// Pop from the ready queue from a given cluster
+	__attribute__((hot)) $thread * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
+		/* paranoid */ verify( lanes.count > 0 );
+		/* paranoid */ verify( kernelTLS().this_processor );
+		/* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes.count );
+
+		unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED ); // snapshot of the lane count for this pop attempt
+		int preferred = kernelTLS().this_processor->rdq.id; // bias towards this processor's shard of lanes
+
+
+		// As long as the list is not empty, try finding a lane that isn't empty and pop from it
+		for(25) { // bounded attempts; returning 0p spuriously is allowed (see declaration)
+			// Pick two lists at random
+			unsigned ri = __tls_rand_bck();
+			unsigned rj = __tls_rand_bck();
+
+			unsigned i, j;
+			__attribute__((unused)) bool locali, localj;
+			[i, locali] = idx_from_r(ri, preferred);
+			[j, localj] = idx_from_r(rj, preferred);
+
+			#if !defined(__CFA_NO_STATISTICS__)
+				if(locali && localj) {
+					__tls_stats()->ready.pick.pop.local++;
+				}
+			#endif
+
+			i %= count;
+			j %= count;
+
+			// try popping from the 2 picked lists
+			struct $thread * thrd = try_pop(cltr, i, j);
+			if(thrd) {
+				#if !defined(__CFA_NO_STATISTICS__)
+					if( locali || localj ) __tls_stats()->ready.pick.pop.lsuccess++;
+				#endif
+				return thrd;
 			}
+		}
+
+		// All lanes where empty return 0p
+		return 0p;
+	}
+
+	__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr) {
+		return search(cltr); // exhaustive fallback when pop_fast repeatedly fails
+	}
+#endif
+#if defined(USE_WORK_STEALING)
+	__attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) {
+		__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
+
+		const bool external = (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr); // no processor TLS, or pushing onto another cluster
+		/* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );
+
+		// write timestamp
+		thrd->link.ts = rdtscl();
+
+		// Try to pick a lane and lock it
+		unsigned i;
+		do {
+			if(unlikely(external)) {
+				i = __tls_rand() % lanes.count; // external pushes pick any lane at random
+			}
+			else {
+				processor * proc = kernelTLS().this_processor;
+				unsigned r = proc->rdq.its++; // round-robin over this processor's READYQ_SHARD_FACTOR lanes
+				i =  proc->rdq.id + (r % READYQ_SHARD_FACTOR);
+			}
+
+
+		#if defined(USE_MPSC)
+			// mpsc always succeeds
+		} while( false );
+		#else
+			// If we can't lock it retry
+		} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
 		#endif
 
-		i %= count;
-		j %= count;
-
-		// try popping from the 2 picked lists
-		struct $thread * thrd = try_pop(cltr, i, j);
-		if(thrd) {
-			#if !defined(__CFA_NO_STATISTICS__)
-				if( locali || localj ) __tls_stats()->ready.pick.pop.lsuccess++;
-			#endif
-			return thrd;
-		}
-	}
-
-	// All lanes where empty return 0p
-	return 0p;
-}
-
-static void fix_times( struct cluster * cltr ) with( cltr->ready_queue ) {
-	lanes.tscs = alloc(lanes.count, lanes.tscs`realloc);
-	for(i; lanes.count) {
-		lanes.tscs[i].tv = ts(lanes.data[i]);
-	}
-
-}
+		// Actually push it
+		push(lanes.data[i], thrd);
+
+		#if !defined(USE_MPSC)
+			// Unlock and return
+			__atomic_unlock( &lanes.data[i].lock );
+		#endif
+
+		__cfadbg_print_safe(ready_queue, "Kernel : Pushed %p on cluster %p (idx: %u, mask %llu, first %d)\n", thrd, cltr, i, used.mask[0], lane_first);
+	}
+
+	// Pop from the ready queue from a given cluster
+	__attribute__((hot)) $thread * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
+		/* paranoid */ verify( lanes.count > 0 );
+		/* paranoid */ verify( kernelTLS().this_processor );
+		/* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes.count );
+
+		processor * proc = kernelTLS().this_processor;
+
+		if(proc->rdq.target == -1u) {
+			proc->rdq.target = __tls_rand() % lanes.count; // pick a random lane as the next steal candidate
+			unsigned it1  = proc->rdq.itr;
+			unsigned it2  = proc->rdq.itr + 1;
+			unsigned idx1 = proc->rdq.id + (it1 % READYQ_SHARD_FACTOR);
+			unsigned idx2 = proc->rdq.id + (it2 % READYQ_SHARD_FACTOR); // BUGFIX: was 'it1', making idx2 == idx1 and it2 unused, so the cutoff sampled only one local lane
+			unsigned long long tsc1 = ts(lanes.data[idx1]);
+			unsigned long long tsc2 = ts(lanes.data[idx2]);
+			proc->rdq.cutoff = min(tsc1, tsc2); // only steal work older than the oldest of two local lanes
+		}
+		else if(lanes.tscs[proc->rdq.target].tv < proc->rdq.cutoff) {
+			$thread * t = try_pop(cltr, proc->rdq.target);
+			proc->rdq.target = -1u; // re-pick a steal candidate on the next call
+			if(t) return t;
+		}
+
+		for(READYQ_SHARD_FACTOR) {
+			unsigned i = proc->rdq.id + (--proc->rdq.itr % READYQ_SHARD_FACTOR);
+			if($thread * t = try_pop(cltr, i)) return t;
+		}
+		return 0p; // local lanes (and any steal candidate) were empty
+	}
+
+	__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
+		for(25) { // bounded random probes before falling back to the exhaustive search
+			unsigned i = __tls_rand() % lanes.count;
+			$thread * t = try_pop(cltr, i);
+			if(t) return t;
+		}
+
+		return search(cltr); // scan all lanes so no pushed thread is missed
+	}
+#endif
 
 //=======================================================================
@@ -345,26 +450,4 @@
 
 //-----------------------------------------------------------------------
-// get index from random number with or without bias towards queues
-static inline [unsigned, bool] idx_from_r(unsigned r, unsigned preferred) {
-	unsigned i;
-	bool local;
-	unsigned rlow  = r % BIAS;
-	unsigned rhigh = r / BIAS;
-	if((0 != rlow) && preferred >= 0) {
-		// (BIAS - 1) out of BIAS chances
-		// Use perferred queues
-		i = preferred + (rhigh % READYQ_SHARD_FACTOR);
-		local = true;
-	}
-	else {
-		// 1 out of BIAS chances
-		// Use all queues
-		i = rhigh;
-		local = false;
-	}
-	return [i, local];
-}
-
-//-----------------------------------------------------------------------
 // try to pop from a lane given by index w
 static inline struct $thread * try_pop(struct cluster * cltr, unsigned w) with (cltr->ready_queue) {
@@ -399,6 +482,6 @@
 	#endif
 
-	#if defined(USE_WORKSTEALING)
-		lanes.times[i].val = thrd->links.ts;
+	#if defined(USE_WORK_STEALING)
+		lanes.tscs[w].tv = thrd->link.ts;
 	#endif
 
@@ -410,5 +493,5 @@
 // try to pop from any lanes making sure you don't miss any threads push
 // before the start of the function
-__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
+static inline struct $thread * search(struct cluster * cltr) with (cltr->ready_queue) {
 	/* paranoid */ verify( lanes.count > 0 );
 	unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
@@ -491,5 +574,6 @@
 	for(unsigned i = 0; i < count; i++) {
 		/* paranoid */ verifyf( it, "Unexpected null iterator, at index %u of %u\n", i, count);
-		it->cltr_id = value;
+		it->rdq.id = value;
+		it->rdq.target = -1u;
 		value += READYQ_SHARD_FACTOR;
 		it = &(*it)`next;
@@ -501,4 +585,13 @@
 	assign_list(preferred, cltr->procs.actives, cltr->procs.total - cltr->procs.idle);
 	assign_list(preferred, cltr->procs.idles  , cltr->procs.idle );
+}
+
+static void fix_times( struct cluster * cltr ) with( cltr->ready_queue ) {
+	#if defined(USE_WORK_STEALING)
+		lanes.tscs = alloc(lanes.count, lanes.tscs`realloc); // (re)size the per-lane timestamp array to match lanes.count
+		for(i; lanes.count) {
+			lanes.tscs[i].tv = ts(lanes.data[i]); // seed each slot from the lane's current head timestamp
+		}
+	#endif
+}
 
