Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 9cac0daf2a429c77fd9c9d3eecdbbdb1683449b1)
+++ libcfa/src/concurrency/kernel.cfa	(revision 77f12656ccde8ee15eefbc2e91e2734d2a383736)
@@ -544,5 +544,5 @@
 	/* paranoid */ verify( 0x0D15EA5E0D15EA5Ep == thrd->canary );
 
-
+	const bool local = thrd->state != Start;
 	if (thrd->preempted == __NO_PREEMPTION) thrd->state = Ready;
 
@@ -552,5 +552,5 @@
 
 	// push the thread to the cluster ready-queue
-	push( cl, thrd );
+	push( cl, thrd, local );
 
 	// variable thrd is no longer safe to use
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 9cac0daf2a429c77fd9c9d3eecdbbdb1683449b1)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision 77f12656ccde8ee15eefbc2e91e2734d2a383736)
@@ -261,5 +261,5 @@
 // push thread onto a ready queue for a cluster
 // returns true if the list was previously empty, false otherwise
-__attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd);
+__attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd, bool local);
 
 //-----------------------------------------------------------------------
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision 9cac0daf2a429c77fd9c9d3eecdbbdb1683449b1)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision 77f12656ccde8ee15eefbc2e91e2734d2a383736)
@@ -249,8 +249,8 @@
 	}
 
-	__attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) {
+	__attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd, bool push_local) with (cltr->ready_queue) {
 		__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
 
-		const bool external = (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
+		const bool external = !push_local || (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
 		/* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );
 
@@ -279,6 +279,6 @@
 		push(lanes.data[i], thrd);
 
-			// Unlock and return
-			__atomic_unlock( &lanes.data[i].lock );
+		// Unlock and return
+		__atomic_unlock( &lanes.data[i].lock );
 
 		// Mark the current index in the tls rng instance as having an item
@@ -336,14 +336,14 @@
 #endif
 #if defined(USE_WORK_STEALING)
-	__attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) {
+	__attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd, bool push_local) with (cltr->ready_queue) {
 		__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
 
 		// #define USE_PREFERRED
 		#if !defined(USE_PREFERRED)
-		const bool external = (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
+		const bool external = !push_local || (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
 		/* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );
 		#else
 			unsigned preferred = thrd->preferred;
-			const bool external = (!kernelTLS().this_processor) || preferred == -1u || thrd->curr_cluster != cltr;
+			const bool external = !push_local || (!kernelTLS().this_processor) || preferred == -1u || thrd->curr_cluster != cltr;
 			/* paranoid */ verifyf(external || preferred < lanes.count, "Invalid preferred queue %u for %u lanes", preferred, lanes.count );
 
@@ -365,8 +365,8 @@
 			else {
 				#if !defined(USE_PREFERRED)
-				processor * proc = kernelTLS().this_processor;
-				unsigned r = proc->rdq.its++;
-				i =  proc->rdq.id + (r % READYQ_SHARD_FACTOR);
-		#else
+					processor * proc = kernelTLS().this_processor;
+					unsigned r = proc->rdq.its++;
+					i =  proc->rdq.id + (r % READYQ_SHARD_FACTOR);
+				#else
 					i = start + (r++ % READYQ_SHARD_FACTOR);
 				#endif
@@ -378,6 +378,6 @@
 		push(lanes.data[i], thrd);
 
-			// Unlock and return
-			__atomic_unlock( &lanes.data[i].lock );
+		// Unlock and return
+		__atomic_unlock( &lanes.data[i].lock );
 
 		#if !defined(__CFA_NO_STATISTICS__)
@@ -666,5 +666,5 @@
 				[thrd, _] = pop(lanes.data[idx]);
 
-				push(cltr, thrd);
+				push(cltr, thrd, true);
 
 				// for printing count the number of displaced threads
