Index: libcfa/src/concurrency/invoke.h
===================================================================
--- libcfa/src/concurrency/invoke.h	(revision 57b36754852fad4ae8e509a553d6d510ed886b7b)
+++ libcfa/src/concurrency/invoke.h	(revision 9cc3a185c1994c15462c67f498788eaade4cc34c)
@@ -148,5 +148,4 @@
 		struct $thread * prev;
 		volatile unsigned long long ts;
-		int preferred;
 	};
 
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 57b36754852fad4ae8e509a553d6d510ed886b7b)
+++ libcfa/src/concurrency/kernel.hfa	(revision 9cc3a185c1994c15462c67f498788eaade4cc34c)
@@ -140,8 +140,13 @@
 // Cluster Tools
 
-// Intrusives lanes which are used by the relaxed ready queue
+// Intrusive lanes which are used by the ready queue
 struct __attribute__((aligned(128))) __intrusive_lane_t;
 void  ?{}(__intrusive_lane_t & this);
 void ^?{}(__intrusive_lane_t & this);
+
+// Aligned timestamps which are used by the relaxed ready queue
+struct __attribute__((aligned(128))) __timestamp_t;
+void  ?{}(__timestamp_t & this);
+void ^?{}(__timestamp_t & this);
 
 //TODO adjust cache size to ARCHITECTURE
@@ -155,4 +160,7 @@
 		// Arary of lanes
 		__intrusive_lane_t * volatile data;
+
+		// Array of times
+		__timestamp_t * volatile tscs;
 
 		// Number of lanes (empty or not)
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 57b36754852fad4ae8e509a553d6d510ed886b7b)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision 9cc3a185c1994c15462c67f498788eaade4cc34c)
@@ -286,5 +286,5 @@
 // push thread onto a ready queue for a cluster
 // returns true if the list was previously empty, false otherwise
-__attribute__((hot)) bool push(struct cluster * cltr, struct $thread * thrd);
+__attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd);
 
 //-----------------------------------------------------------------------
@@ -301,9 +301,4 @@
 
 //-----------------------------------------------------------------------
-// remove thread from the ready queue of a cluster
-// returns bool if it wasn't found
-bool remove_head(struct cluster * cltr, struct $thread * thrd);
-
-//-----------------------------------------------------------------------
 // Increase the width of the ready queue (number of lanes) by 4
 void ready_queue_grow  (struct cluster * cltr);
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision 57b36754852fad4ae8e509a553d6d510ed886b7b)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision 9cc3a185c1994c15462c67f498788eaade4cc34c)
@@ -19,4 +19,7 @@
 // #define USE_MPSC
 
+#define USE_RELAXED_FIFO
+// #define USE_WORK_STEALING
+
 #include "bits/defs.hfa"
 #include "kernel_private.hfa"
@@ -38,5 +41,17 @@
 #endif
 
-#define BIAS 4
+#if   defined(USE_RELAXED_FIFO)
+	#define BIAS 4
+	#define READYQ_SHARD_FACTOR 4
+#elif defined(USE_WORK_STEALING)
+	#define READYQ_SHARD_FACTOR 2
+#else
+	#error no scheduling strategy selected
+#endif
+
+static inline [unsigned, bool] idx_from_r(unsigned r, unsigned preferred);
+static inline struct $thread * try_pop(struct cluster * cltr, unsigned w);
+static struct $thread * try_pop(struct cluster * cltr, unsigned i, unsigned j);
+
 
 // returns the maximum number of processors the RWLock support
@@ -191,8 +206,9 @@
 
 //=======================================================================
-// Cforall Reqdy Queue used for scheduling
+// Cforall Ready Queue used for scheduling
 //=======================================================================
 void ?{}(__ready_queue_t & this) with (this) {
 	lanes.data  = 0p;
+	lanes.tscs  = 0p;
 	lanes.count = 0;
 }
@@ -201,53 +217,19 @@
 	verify( 1 == lanes.count );
 	free(lanes.data);
+	free(lanes.tscs);
 }
 
 //-----------------------------------------------------------------------
-static inline [unsigned, bool] idx_from_r(unsigned r, unsigned preferred) {
-	unsigned i;
-	bool local;
-	#if defined(BIAS)
-		unsigned rlow  = r % BIAS;
-		unsigned rhigh = r / BIAS;
-		if((0 != rlow) && preferred >= 0) {
-			// (BIAS - 1) out of BIAS chances
-			// Use perferred queues
-			i = preferred + (rhigh % 4);
-			local = true;
-		}
-		else {
-			// 1 out of BIAS chances
-			// Use all queues
-			i = rhigh;
-			local = false;
-		}
-	#else
-		i = r;
-		local = false;
-	#endif
-	return [i, local];
-}
-
-//-----------------------------------------------------------------------
-__attribute__((hot)) bool push(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) {
+__attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) {
 	__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
 
 	const bool external = (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
+	/* paranoid */ verify(external || kernelTLS().this_processor->cltr_id < lanes.count );
 
 	// write timestamp
 	thrd->link.ts = rdtscl();
 
-	bool first = false;
-	__attribute__((unused)) bool local;
-	__attribute__((unused)) int preferred;
-	#if defined(BIAS)
-		/* paranoid */ verify(external || kernelTLS().this_processor->cltr_id < lanes.count );
-		preferred =
-			//*
-			external ? -1 : kernelTLS().this_processor->cltr_id;
-			/*/
-			thrd->link.preferred * 4;
-			//*/
-	#endif
+	bool local;
+	int preferred = external ? -1 : kernelTLS().this_processor->cltr_id;
 
 	// Try to pick a lane and lock it
@@ -255,5 +237,4 @@
 	do {
 		// Pick the index of a lane
-		// unsigned r = __tls_rand();
 		unsigned r = __tls_rand_fwd();
 		[i, local] = idx_from_r(r, preferred);
@@ -304,21 +285,13 @@
 		}
 	#endif
-
-	// return whether or not the list was empty before this push
-	return first;
-}
-
-static struct $thread * try_pop(struct cluster * cltr, unsigned i, unsigned j);
-static struct $thread * try_pop(struct cluster * cltr, unsigned i);
+}
 
 // Pop from the ready queue from a given cluster
 __attribute__((hot)) $thread * pop(struct cluster * cltr) with (cltr->ready_queue) {
 	/* paranoid */ verify( lanes.count > 0 );
+	/* paranoid */ verify(kernelTLS().this_processor->cltr_id < lanes.count );
+
 	unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
-	int preferred;
-	#if defined(BIAS)
-		/* paranoid */ verify(kernelTLS().this_processor->cltr_id < lanes.count );
-		preferred = kernelTLS().this_processor->cltr_id;
-	#endif
+	int preferred = kernelTLS().this_processor->cltr_id;
 
 
@@ -326,6 +299,4 @@
 	for(25) {
 		// Pick two lists at random
-		// unsigned ri = __tls_rand();
-		// unsigned rj = __tls_rand();
 		unsigned ri = __tls_rand_bck();
 		unsigned rj = __tls_rand_bck();
@@ -348,5 +319,5 @@
 		struct $thread * thrd = try_pop(cltr, i, j);
 		if(thrd) {
-			#if defined(BIAS) && !defined(__CFA_NO_STATISTICS__)
+			#if !defined(__CFA_NO_STATISTICS__)
 				if( locali || localj ) __tls_stats()->ready.pick.pop.lsuccess++;
 			#endif
@@ -359,4 +330,84 @@
 }
 
+static void fix_times( struct cluster * cltr ) with( cltr->ready_queue ) {
+	lanes.tscs = alloc(lanes.count, lanes.tscs`realloc);
+	for(i; lanes.count) {
+		lanes.tscs[i].tv = ts(lanes.data[i]);
+	}
+
+}
+
+//=======================================================================
+// Various Ready Queue utilities
+//=======================================================================
+// these functions work the same or almost the same
+// whether they are using work-stealing or relaxed fifo scheduling
+
+//-----------------------------------------------------------------------
+// get index from random number with or without bias towards queues
+static inline [unsigned, bool] idx_from_r(unsigned r, unsigned preferred) {
+	unsigned i;
+	bool local;
+	unsigned rlow  = r % BIAS;
+	unsigned rhigh = r / BIAS;
+	if((0 != rlow) && preferred >= 0) {
+		// (BIAS - 1) out of BIAS chances
+		// Use preferred queues
+		i = preferred + (rhigh % READYQ_SHARD_FACTOR);
+		local = true;
+	}
+	else {
+		// 1 out of BIAS chances
+		// Use all queues
+		i = rhigh;
+		local = false;
+	}
+	return [i, local];
+}
+
+//-----------------------------------------------------------------------
+// try to pop from a lane given by index w
+static inline struct $thread * try_pop(struct cluster * cltr, unsigned w) with (cltr->ready_queue) {
+	// Get relevant elements locally
+	__intrusive_lane_t & lane = lanes.data[w];
+
+	// If list looks empty retry
+	if( is_empty(lane) ) return 0p;
+
+	// If we can't get the lock retry
+	if( !__atomic_try_acquire(&lane.lock) ) return 0p;
+
+	// If list is empty, unlock and retry
+	if( is_empty(lane) ) {
+		__atomic_unlock(&lane.lock);
+		return 0p;
+	}
+
+	// Actually pop the list
+	struct $thread * thrd;
+	thrd = pop(lane);
+
+	/* paranoid */ verify(thrd);
+	/* paranoid */ verify(lane.lock);
+
+	// Unlock and return
+	__atomic_unlock(&lane.lock);
+
+	// Update statistics
+	#if !defined(__CFA_NO_STATISTICS__)
+		__tls_stats()->ready.pick.pop.success++;
+	#endif
+
+	#if defined(USE_WORK_STEALING)
+		lanes.tscs[w].tv = thrd->link.ts;
+	#endif
+
+	// return the popped thread
+	return thrd;
+}
+
+//-----------------------------------------------------------------------
+// try to pop from any lanes making sure you don't miss any threads pushed
+// before the start of the function
 __attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
 	/* paranoid */ verify( lanes.count > 0 );
@@ -375,85 +426,6 @@
 }
 
-
 //-----------------------------------------------------------------------
-// Given 2 indexes, pick the list with the oldest push an try to pop from it
-static inline struct $thread * try_pop(struct cluster * cltr, unsigned i, unsigned j) with (cltr->ready_queue) {
-	#if !defined(__CFA_NO_STATISTICS__)
-		__tls_stats()->ready.pick.pop.attempt++;
-	#endif
-
-	// Pick the bet list
-	int w = i;
-	if( __builtin_expect(!is_empty(lanes.data[j]), true) ) {
-		w = (ts(lanes.data[i]) < ts(lanes.data[j])) ? i : j;
-	}
-
-	return try_pop(cltr, w);
-}
-
-static inline struct $thread * try_pop(struct cluster * cltr, unsigned w) with (cltr->ready_queue) {
-	// Get relevant elements locally
-	__intrusive_lane_t & lane = lanes.data[w];
-
-	// If list looks empty retry
-	if( is_empty(lane) ) return 0p;
-
-	// If we can't get the lock retry
-	if( !__atomic_try_acquire(&lane.lock) ) return 0p;
-
-
-	// If list is empty, unlock and retry
-	if( is_empty(lane) ) {
-		__atomic_unlock(&lane.lock);
-		return 0p;
-	}
-
-	// Actually pop the list
-	struct $thread * thrd;
-	thrd = pop(lane);
-
-	/* paranoid */ verify(thrd);
-	/* paranoid */ verify(lane.lock);
-
-	// Unlock and return
-	__atomic_unlock(&lane.lock);
-
-	// Update statistics
-	#if !defined(__CFA_NO_STATISTICS__)
-		__tls_stats()->ready.pick.pop.success++;
-	#endif
-
-	// Update the thread bias
-	thrd->link.preferred = w / 4;
-
-	// return the popped thread
-	return thrd;
-}
-//-----------------------------------------------------------------------
-
-bool remove_head(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) {
-	for(i; lanes.count) {
-		__intrusive_lane_t & lane = lanes.data[i];
-
-		bool removed = false;
-
-		__atomic_acquire(&lane.lock);
-			if(head(lane)->link.next == thrd) {
-				$thread * pthrd;
-				pthrd = pop(lane);
-
-				/* paranoid */ verify( pthrd == thrd );
-
-				removed = true;
-			}
-		__atomic_unlock(&lane.lock);
-
-		if( removed ) return true;
-	}
-	return false;
-}
-
-//-----------------------------------------------------------------------
-
+// Check that all the intrusive queues in the data structure are still consistent
 static void check( __ready_queue_t & q ) with (q) {
 	#if defined(__CFA_WITH_VERIFY__) && !defined(USE_MPSC)
@@ -480,4 +452,20 @@
 }
 
+//-----------------------------------------------------------------------
+// Given 2 indexes, pick the list with the oldest push and try to pop from it
+static inline struct $thread * try_pop(struct cluster * cltr, unsigned i, unsigned j) with (cltr->ready_queue) {
+	#if !defined(__CFA_NO_STATISTICS__)
+		__tls_stats()->ready.pick.pop.attempt++;
+	#endif
+
+	// Pick the best list
+	int w = i;
+	if( __builtin_expect(!is_empty(lanes.data[j]), true) ) {
+		w = (ts(lanes.data[i]) < ts(lanes.data[j])) ? i : j;
+	}
+
+	return try_pop(cltr, w);
+}
+
 // Call this function of the intrusive list was moved using memcpy
 // fixes the list so that the pointers back to anchors aren't left dangling
@@ -499,18 +487,18 @@
 }
 
-static void assign_list(unsigned & value, const int inc, dlist(processor, processor) & list, unsigned count) {
+static void assign_list(unsigned & value, dlist(processor, processor) & list, unsigned count) {
 	processor * it = &list`first;
 	for(unsigned i = 0; i < count; i++) {
 		/* paranoid */ verifyf( it, "Unexpected null iterator, at index %u of %u\n", i, count);
 		it->cltr_id = value;
-		value += inc;
+		value += READYQ_SHARD_FACTOR;
 		it = &(*it)`next;
 	}
 }
 
-static void reassign_cltr_id(struct cluster * cltr, const int inc) {
+static void reassign_cltr_id(struct cluster * cltr) {
 	unsigned preferred = 0;
-	assign_list(preferred, inc, cltr->procs.actives, cltr->procs.total - cltr->procs.idle);
-	assign_list(preferred, inc, cltr->procs.idles  , cltr->procs.idle );
+	assign_list(preferred, cltr->procs.actives, cltr->procs.total - cltr->procs.idle);
+	assign_list(preferred, cltr->procs.idles  , cltr->procs.idle );
 }
 
@@ -531,5 +519,5 @@
 		// Make sure we always have atleast 1 list
 		if(target >= 2) {
-			ncount = target * 4;
+			ncount = target * READYQ_SHARD_FACTOR;
 		} else {
 			ncount = 1;
@@ -553,5 +541,7 @@
 	}
 
-	reassign_cltr_id(cltr, 4);
+	fix_times(cltr);
+
+	reassign_cltr_id(cltr);
 
 	// Make sure that everything is consistent
@@ -579,7 +569,7 @@
 		// Find new count
 		// Make sure we always have atleast 1 list
-		lanes.count = target >= 2 ? target * 4: 1;
+		lanes.count = target >= 2 ? target * READYQ_SHARD_FACTOR: 1;
 		/* paranoid */ verify( ocount >= lanes.count );
-		/* paranoid */ verify( lanes.count == target * 4 || target < 2 );
+		/* paranoid */ verify( lanes.count == target * READYQ_SHARD_FACTOR || target < 2 );
 
 		// for printing count the number of displaced threads
@@ -626,5 +616,7 @@
 	}
 
-	reassign_cltr_id(cltr, 4);
+	fix_times(cltr);
+
+	reassign_cltr_id(cltr);
 
 	// Make sure that everything is consistent
Index: libcfa/src/concurrency/ready_subqueue.hfa
===================================================================
--- libcfa/src/concurrency/ready_subqueue.hfa	(revision 57b36754852fad4ae8e509a553d6d510ed886b7b)
+++ libcfa/src/concurrency/ready_subqueue.hfa	(revision 9cc3a185c1994c15462c67f498788eaade4cc34c)
@@ -246,2 +246,10 @@
 	#endif
 }
+
+// Aligned timestamps which are used by the relaxed ready queue
+struct __attribute__((aligned(128))) __timestamp_t {
+	volatile unsigned long long tv;
+};
+
+void  ?{}(__timestamp_t & this) { this.tv = 0; }
+void ^?{}(__timestamp_t & this) {}
Index: libcfa/src/concurrency/thread.cfa
===================================================================
--- libcfa/src/concurrency/thread.cfa	(revision 57b36754852fad4ae8e509a553d6d510ed886b7b)
+++ libcfa/src/concurrency/thread.cfa	(revision 9cc3a185c1994c15462c67f498788eaade4cc34c)
@@ -39,5 +39,4 @@
 	link.next = 0p;
 	link.prev = 0p;
-	link.preferred = -1;
 	#if defined( __CFA_WITH_VERIFY__ )
 		canary = 0x0D15EA5E0D15EA5Ep;
