Index: libcfa/src/concurrency/invoke.h
===================================================================
--- libcfa/src/concurrency/invoke.h	(revision fcd65ca7ff45aa05cca357c714e96e0fcb5f5642)
+++ libcfa/src/concurrency/invoke.h	(revision 24e321c53a4cf2985e9827928927e3b7d42e1791)
@@ -175,5 +175,5 @@
 		struct cluster * curr_cluster;
 
-		// preferred ready-queue
+		// preferred ready-queue or CPU
 		unsigned preferred;
 
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision fcd65ca7ff45aa05cca357c714e96e0fcb5f5642)
+++ libcfa/src/concurrency/io.cfa	(revision 24e321c53a4cf2985e9827928927e3b7d42e1791)
@@ -90,5 +90,5 @@
 	static inline unsigned __flush( struct $io_context & );
 	static inline __u32 __release_sqes( struct $io_context & );
-	extern void __kernel_unpark( thread$ * thrd );
+	extern void __kernel_unpark( thread$ * thrd, unpark_hint );
 
 	bool __cfa_io_drain( processor * proc ) {
@@ -118,5 +118,5 @@
 			__cfadbg_print_safe( io, "Kernel I/O : Syscall completed : cqe %p, result %d for %p\n", &cqe, cqe.res, future );
 
-			__kernel_unpark( fulfil( *future, cqe.res, false ) );
+			__kernel_unpark( fulfil( *future, cqe.res, false ), UNPARK_LOCAL );
 		}
 
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision fcd65ca7ff45aa05cca357c714e96e0fcb5f5642)
+++ libcfa/src/concurrency/kernel.cfa	(revision 24e321c53a4cf2985e9827928927e3b7d42e1791)
@@ -476,5 +476,5 @@
 		if(unlikely(thrd_dst->preempted != __NO_PREEMPTION)) {
 			// The thread was preempted, reschedule it and reset the flag
-			schedule_thread$( thrd_dst );
+			schedule_thread$( thrd_dst, UNPARK_LOCAL );
 			break RUNNING;
 		}
@@ -560,5 +560,5 @@
 // Scheduler routines
 // KERNEL ONLY
-static void __schedule_thread( thread$ * thrd ) {
+static void __schedule_thread( thread$ * thrd, unpark_hint hint ) {
 	/* paranoid */ verify( ! __preemption_enabled() );
 	/* paranoid */ verify( ready_schedule_islocked());
@@ -580,8 +580,8 @@
 	// Dereference the thread now because once we push it, there is not guaranteed it's still valid.
 	struct cluster * cl = thrd->curr_cluster;
-	__STATS(bool outside = thrd->last_proc && thrd->last_proc != kernelTLS().this_processor; )
+	__STATS(bool outside = hint == UNPARK_LOCAL && thrd->last_proc && thrd->last_proc != kernelTLS().this_processor; )
 
 	// push the thread to the cluster ready-queue
-	push( cl, thrd, local );
+	push( cl, thrd, hint );
 
 	// variable thrd is no longer safe to use
@@ -608,7 +608,7 @@
 }
 
-void schedule_thread$( thread$ * thrd ) {
+void schedule_thread$( thread$ * thrd, unpark_hint hint ) {
 	ready_schedule_lock();
-		__schedule_thread( thrd );
+		__schedule_thread( thrd, hint );
 	ready_schedule_unlock();
 }
@@ -661,5 +661,5 @@
 }
 
-void __kernel_unpark( thread$ * thrd ) {
+void __kernel_unpark( thread$ * thrd, unpark_hint hint ) {
 	/* paranoid */ verify( ! __preemption_enabled() );
 	/* paranoid */ verify( ready_schedule_islocked());
@@ -669,5 +669,5 @@
 	if(__must_unpark(thrd)) {
 		// Wake lost the race,
-		__schedule_thread( thrd );
+		__schedule_thread( thrd, hint );
 	}
 
@@ -676,5 +676,5 @@
 }
 
-void unpark( thread$ * thrd ) {
+void unpark( thread$ * thrd, unpark_hint hint ) {
 	if( !thrd ) return;
 
@@ -682,5 +682,5 @@
 		disable_interrupts();
 			// Wake lost the race,
-			schedule_thread$( thrd );
+			schedule_thread$( thrd, hint );
 		enable_interrupts(false);
 	}
Index: libcfa/src/concurrency/kernel/fwd.hfa
===================================================================
--- libcfa/src/concurrency/kernel/fwd.hfa	(revision fcd65ca7ff45aa05cca357c714e96e0fcb5f5642)
+++ libcfa/src/concurrency/kernel/fwd.hfa	(revision 24e321c53a4cf2985e9827928927e3b7d42e1791)
@@ -119,6 +119,9 @@
 
 	extern "Cforall" {
+		enum unpark_hint { UNPARK_LOCAL, UNPARK_REMOTE };
+
 		extern void park( void );
-		extern void unpark( struct thread$ * this );
+		extern void unpark( struct thread$ *, unpark_hint );
+		static inline void unpark( struct thread$ * thrd ) { unpark(thrd, UNPARK_LOCAL); }
 		static inline struct thread$ * active_thread () {
 			struct thread$ * t = publicTLS_get( this_thread );
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision fcd65ca7ff45aa05cca357c714e96e0fcb5f5642)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision 24e321c53a4cf2985e9827928927e3b7d42e1791)
@@ -200,4 +200,28 @@
 	__cfadbg_print_safe(runtime_core, "Kernel : Main cluster ready\n");
 
+	// Construct the processor context of the main processor
+	void ?{}(processorCtx_t & this, processor * proc) {
+		(this.__cor){ "Processor" };
+		this.__cor.starter = 0p;
+		this.proc = proc;
+	}
+
+	void ?{}(processor & this) with( this ) {
+		( this.terminated ){};
+		( this.runner ){};
+		init( this, "Main Processor", *mainCluster, 0p );
+		kernel_thread = pthread_self();
+
+		runner{ &this };
+		__cfadbg_print_safe(runtime_core, "Kernel : constructed main processor context %p\n", &runner);
+	}
+
+	// Initialize the main processor and the main processor ctx
+	// (the coroutine that contains the processing control flow)
+	mainProcessor = (processor *)&storage_mainProcessor;
+	(*mainProcessor){};
+
+	register_tls( mainProcessor );
+
 	// Start by initializing the main thread
 	// SKULLDUGGERY: the mainThread steals the process main thread
@@ -210,30 +234,4 @@
 	__cfadbg_print_safe(runtime_core, "Kernel : Main thread ready\n");
 
-
-
-	// Construct the processor context of the main processor
-	void ?{}(processorCtx_t & this, processor * proc) {
-		(this.__cor){ "Processor" };
-		this.__cor.starter = 0p;
-		this.proc = proc;
-	}
-
-	void ?{}(processor & this) with( this ) {
-		( this.terminated ){};
-		( this.runner ){};
-		init( this, "Main Processor", *mainCluster, 0p );
-		kernel_thread = pthread_self();
-
-		runner{ &this };
-		__cfadbg_print_safe(runtime_core, "Kernel : constructed main processor context %p\n", &runner);
-	}
-
-	// Initialize the main processor and the main processor ctx
-	// (the coroutine that contains the processing control flow)
-	mainProcessor = (processor *)&storage_mainProcessor;
-	(*mainProcessor){};
-
-	register_tls( mainProcessor );
-
 	//initialize the global state variables
 	__cfaabi_tls.this_processor = mainProcessor;
@@ -251,5 +249,5 @@
 	// Add the main thread to the ready queue
 	// once resume is called on mainProcessor->runner the mainThread needs to be scheduled like any normal thread
-	schedule_thread$(mainThread);
+	schedule_thread$(mainThread, UNPARK_LOCAL);
 
 	// SKULLDUGGERY: Force a context switch to the main processor to set the main thread's context to the current UNIX
@@ -485,5 +483,5 @@
 	link.next = 0p;
 	link.ts   = -1llu;
-	preferred = -1u;
+	preferred = ready_queue_new_preferred();
 	last_proc = 0p;
 	#if defined( __CFA_WITH_VERIFY__ )
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision fcd65ca7ff45aa05cca357c714e96e0fcb5f5642)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision 24e321c53a4cf2985e9827928927e3b7d42e1791)
@@ -46,5 +46,5 @@
 }
 
-void schedule_thread$( thread$ * ) __attribute__((nonnull (1)));
+void schedule_thread$( thread$ *, unpark_hint hint ) __attribute__((nonnull (1)));
 
 extern bool __preemption_enabled();
@@ -300,5 +300,5 @@
 // push thread onto a ready queue for a cluster
 // returns true if the list was previously empty, false otherwise
-__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, bool local);
+__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint);
 
 //-----------------------------------------------------------------------
@@ -321,4 +321,8 @@
 
 //-----------------------------------------------------------------------
+// get the preferred ready-queue (or CPU) for a new thread
+unsigned ready_queue_new_preferred();
+
+//-----------------------------------------------------------------------
 // Increase the width of the ready queue (number of lanes) by 4
 void ready_queue_grow  (struct cluster * cltr);
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision fcd65ca7ff45aa05cca357c714e96e0fcb5f5642)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision 24e321c53a4cf2985e9827928927e3b7d42e1791)
@@ -290,5 +290,5 @@
 //-----------------------------------------------------------------------
 #if defined(USE_CPU_WORK_STEALING)
-	__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, bool push_local) with (cltr->ready_queue) {
+	__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
 		__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
 
@@ -450,8 +450,8 @@
 	}
 
-	__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, bool push_local) with (cltr->ready_queue) {
+	__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
 		__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
 
-		const bool external = !push_local || (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
+		const bool external = (hint != UNPARK_LOCAL) || (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
 		/* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );
 
@@ -537,14 +537,14 @@
 #endif
 #if defined(USE_WORK_STEALING)
-	__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, bool push_local) with (cltr->ready_queue) {
+	__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
 		__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
 
 		// #define USE_PREFERRED
 		#if !defined(USE_PREFERRED)
-		const bool external = !push_local || (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
+		const bool external = (hint != UNPARK_LOCAL) || (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
 		/* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );
 		#else
 			unsigned preferred = thrd->preferred;
-			const bool external = push_local || (!kernelTLS().this_processor) || preferred == -1u || thrd->curr_cluster != cltr;
+			const bool external = (hint != UNPARK_LOCAL) || (!kernelTLS().this_processor) || preferred == -1u || thrd->curr_cluster != cltr;
 			/* paranoid */ verifyf(external || preferred < lanes.count, "Invalid preferred queue %u for %u lanes", preferred, lanes.count );
 
@@ -687,5 +687,9 @@
 	#endif
 
-	thrd->preferred = w;
+	#if defined(USE_CPU_WORK_STEALING)
+		thrd->preferred = w / READYQ_SHARD_FACTOR;
+	#else
+		thrd->preferred = w;
+	#endif
 
 	// return the popped thread
@@ -713,4 +717,25 @@
 
 //-----------------------------------------------------------------------
+// Pick the initial `preferred` value (ready-queue or CPU hint) for a newly created thread.
+unsigned ready_queue_new_preferred() {
+	unsigned pref = 0;	// fallback when there is no current thread and CPU work stealing is off
+	if(struct thread$ * thrd = publicTLS_get( this_thread )) {
+		pref = thrd->preferred;	// inherit the preference of the currently active thread
+	}
+	else {
+		#if defined(USE_CPU_WORK_STEALING)
+			pref = __kernel_getcpu();	// no current thread: presumably the CPU we are running on -- confirm
+		#endif
+	}
+
+	#if defined(USE_CPU_WORK_STEALING)
+		/* paranoid */ verify(pref >= 0);	// NOTE(review): pref is unsigned, so this check can never fail
+		/* paranoid */ verify(pref < cpu_info.hthrd_count);	// result must stay below cpu_info.hthrd_count
+	#endif
+
+	return pref;
+}
+
+//-----------------------------------------------------------------------
 // Check that all the intrusive queues in the data structure are still consistent
 static void check( __ready_queue_t & q ) with (q) {
Index: libcfa/src/concurrency/thread.cfa
===================================================================
--- libcfa/src/concurrency/thread.cfa	(revision fcd65ca7ff45aa05cca357c714e96e0fcb5f5642)
+++ libcfa/src/concurrency/thread.cfa	(revision 24e321c53a4cf2985e9827928927e3b7d42e1791)
@@ -43,5 +43,5 @@
 	link.next = 0p;
 	link.ts   = -1llu;
-	preferred = thread_rand() % cl.ready_queue.lanes.count;
+	preferred = ready_queue_new_preferred();
 	last_proc = 0p;
 	#if defined( __CFA_WITH_VERIFY__ )
@@ -140,5 +140,5 @@
 	/* paranoid */ verify( this_thrd->context.SP );
 
-	schedule_thread$( this_thrd );
+	schedule_thread$( this_thrd, UNPARK_LOCAL );
 	enable_interrupts();
 }
