Index: libcfa/src/concurrency/clib/cfathread.cfa
===================================================================
--- libcfa/src/concurrency/clib/cfathread.cfa	(revision 00e9be94eec3b41099f7decdab5a57d6edea62ed)
+++ libcfa/src/concurrency/clib/cfathread.cfa	(revision 3ec79f71ac1364f434c450785a53ca1807f141a7)
@@ -243,5 +243,5 @@
 	// Mutex
 	struct cfathread_mutex {
-		single_acquisition_lock impl;
+		fast_lock impl;
 	};
 	int cfathread_mutex_init(cfathread_mutex_t *restrict mut, const cfathread_mutexattr_t *restrict) __attribute__((nonnull (1))) { *mut = new(); return 0; }
@@ -258,5 +258,5 @@
 	// Condition
 	struct cfathread_condition {
-		condition_variable(single_acquisition_lock) impl;
+		condition_variable(fast_lock) impl;
 	};
 	int cfathread_cond_init(cfathread_cond_t *restrict cond, const cfathread_condattr_t *restrict) __attribute__((nonnull (1))) { *cond = new(); return 0; }
Index: libcfa/src/concurrency/invoke.h
===================================================================
--- libcfa/src/concurrency/invoke.h	(revision 00e9be94eec3b41099f7decdab5a57d6edea62ed)
+++ libcfa/src/concurrency/invoke.h	(revision 3ec79f71ac1364f434c450785a53ca1807f141a7)
@@ -148,4 +148,5 @@
 		struct $thread * prev;
 		volatile unsigned long long ts;
+		unsigned preferred;
 	};
 
@@ -199,4 +200,6 @@
 		} node;
 
+		struct processor * last_proc;
+
 		#if defined( __CFA_WITH_VERIFY__ )
 			void * canary;
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision 00e9be94eec3b41099f7decdab5a57d6edea62ed)
+++ libcfa/src/concurrency/io.cfa	(revision 3ec79f71ac1364f434c450785a53ca1807f141a7)
@@ -40,4 +40,5 @@
 	#include "kernel.hfa"
 	#include "kernel/fwd.hfa"
+	#include "kernel_private.hfa"
 	#include "io/types.hfa"
 
@@ -89,7 +90,9 @@
 	static inline unsigned __flush( struct $io_context & );
 	static inline __u32 __release_sqes( struct $io_context & );
+	extern void __kernel_unpark( $thread * thrd );
 
 	bool __cfa_io_drain( processor * proc ) {
 		/* paranoid */ verify( ! __preemption_enabled() );
+		/* paranoid */ verify( ready_schedule_islocked() );
 		/* paranoid */ verify( proc );
 		/* paranoid */ verify( proc->io.ctx );
@@ -115,5 +118,5 @@
 			__cfadbg_print_safe( io, "Kernel I/O : Syscall completed : cqe %p, result %d for %p\n", &cqe, cqe.res, future );
 
-			fulfil( *future, cqe.res );
+			__kernel_unpark( fulfil( *future, cqe.res, false ) );
 		}
 
@@ -124,4 +127,5 @@
 		__atomic_store_n( ctx->cq.head, head + count, __ATOMIC_SEQ_CST );
 
+		/* paranoid */ verify( ready_schedule_islocked() );
 		/* paranoid */ verify( ! __preemption_enabled() );
 
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 00e9be94eec3b41099f7decdab5a57d6edea62ed)
+++ libcfa/src/concurrency/kernel.cfa	(revision 3ec79f71ac1364f434c450785a53ca1807f141a7)
@@ -34,4 +34,9 @@
 #include "invoke.h"
 
+#if !defined(__CFA_NO_STATISTICS__)
+	#define __STATS( ...) __VA_ARGS__
+#else
+	#define __STATS( ...)
+#endif
 
 //-----------------------------------------------------------------------------
@@ -166,7 +171,5 @@
 		preemption_scope scope = { this };
 
-		#if !defined(__CFA_NO_STATISTICS__)
-			unsigned long long last_tally = rdtscl();
-		#endif
+		__STATS( unsigned long long last_tally = rdtscl(); )
 
 		// if we need to run some special setup, now is the time to do it.
@@ -266,4 +269,117 @@
 				__cfa_io_flush( this );
 			}
+
+		// 	SEARCH: {
+		// 		/* paranoid */ verify( ! __preemption_enabled() );
+		// 		/* paranoid */ verify( kernelTLS().this_proc_id );
+
+		// 		// First, lock the scheduler since we are searching for a thread
+
+		// 		// Try to get the next thread
+		// 		ready_schedule_lock();
+		// 		readyThread = pop_fast( this->cltr );
+		// 		ready_schedule_unlock();
+		// 		if(readyThread) {  break SEARCH; }
+
+		// 		// If we can't find a thread, might as well flush any outstanding I/O
+		// 		if(this->io.pending) { __cfa_io_flush( this ); }
+
+		// 		// Spin a little on I/O, just in case
+		// 		for(25) {
+		// 			__maybe_io_drain( this );
+		// 			ready_schedule_lock();
+		// 			readyThread = pop_fast( this->cltr );
+		// 			ready_schedule_unlock();
+		// 			if(readyThread) {  break SEARCH; }
+		// 		}
+
+		// 		// no luck, try stealing a few times
+		// 		for(25) {
+		// 			if( __maybe_io_drain( this ) ) {
+		// 				ready_schedule_lock();
+		// 				readyThread = pop_fast( this->cltr );
+		// 			} else {
+		// 				ready_schedule_lock();
+		// 				readyThread = pop_slow( this->cltr );
+		// 			}
+		// 			ready_schedule_unlock();
+		// 			if(readyThread) {  break SEARCH; }
+		// 		}
+
+		// 		// still no luck, search for a thread
+		// 		ready_schedule_lock();
+		// 		readyThread = pop_search( this->cltr );
+		// 		ready_schedule_unlock();
+		// 		if(readyThread) { break SEARCH; }
+
+		// 		// Don't block if we are done
+		// 		if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) break MAIN_LOOP;
+
+		// 		__STATS( __tls_stats()->ready.sleep.halts++; )
+
+		// 		// Push self to idle stack
+		// 		mark_idle(this->cltr->procs, * this);
+
+		// 		// Confirm the ready-queue is empty
+		// 		__maybe_io_drain( this );
+		// 		ready_schedule_lock();
+		// 		readyThread = pop_search( this->cltr );
+		// 		ready_schedule_unlock();
+
+		// 		if( readyThread ) {
+		// 			// A thread was found, cancel the halt
+		// 			mark_awake(this->cltr->procs, * this);
+
+		// 			__STATS( __tls_stats()->ready.sleep.cancels++; )
+
+		// 			// continue the main loop
+		// 			break SEARCH;
+		// 		}
+
+		// 		__STATS( if(this->print_halts) __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->id, rdtscl()); )
+		// 		__cfadbg_print_safe(runtime_core, "Kernel : core %p waiting on eventfd %d\n", this, this->idle);
+
+		// 		// __disable_interrupts_hard();
+		// 		eventfd_t val;
+		// 		eventfd_read( this->idle, &val );
+		// 		// __enable_interrupts_hard();
+
+		// 		__STATS( if(this->print_halts) __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->id, rdtscl()); )
+
+		// 		// We were woken up, remove self from idle
+		// 		mark_awake(this->cltr->procs, * this);
+
+		// 		// DON'T just proceed, start looking again
+		// 		continue MAIN_LOOP;
+		// 	}
+
+		// RUN_THREAD:
+		// 	/* paranoid */ verify( kernelTLS().this_proc_id );
+		// 	/* paranoid */ verify( ! __preemption_enabled() );
+		// 	/* paranoid */ verify( readyThread );
+
+		// 	// Reset io dirty bit
+		// 	this->io.dirty = false;
+
+		// 	// We found a thread run it
+		// 	__run_thread(this, readyThread);
+
+		// 	// Are we done?
+		// 	if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) break MAIN_LOOP;
+
+		// 	#if !defined(__CFA_NO_STATISTICS__)
+		// 		unsigned long long curr = rdtscl();
+		// 		if(curr > (last_tally + 500000000)) {
+		// 			__tally_stats(this->cltr->stats, __cfaabi_tls.this_stats);
+		// 			last_tally = curr;
+		// 		}
+		// 	#endif
+
+		// 	if(this->io.pending && !this->io.dirty) {
+		// 		__cfa_io_flush( this );
+		// 	}
+
+		// 	// Check if there is pending io
+		// 	__maybe_io_drain( this );
 		}
 
@@ -402,7 +518,5 @@
 	$thread * thrd_src = kernelTLS().this_thread;
 
-	#if !defined(__CFA_NO_STATISTICS__)
-		struct processor * last_proc = kernelTLS().this_processor;
-	#endif
+	__STATS( thrd_src->last_proc = kernelTLS().this_processor; )
 
 	// Run the thread on this processor
@@ -423,5 +537,6 @@
 
 	#if !defined(__CFA_NO_STATISTICS__)
-		if(last_proc != kernelTLS().this_processor) {
+		/* paranoid */ verify( thrd_src->last_proc != 0p );
+		if(thrd_src->last_proc != kernelTLS().this_processor) {
 			__tls_stats()->ready.threads.migration++;
 		}
@@ -436,5 +551,5 @@
 // Scheduler routines
 // KERNEL ONLY
-void __schedule_thread( $thread * thrd ) {
+static void __schedule_thread( $thread * thrd ) {
 	/* paranoid */ verify( ! __preemption_enabled() );
 	/* paranoid */ verify( kernelTLS().this_proc_id );
@@ -457,4 +572,5 @@
 	// Dereference the thread now because once we push it, there is not guaranteed it's still valid.
 	struct cluster * cl = thrd->curr_cluster;
+	__STATS(bool outside = thrd->last_proc && thrd->last_proc != kernelTLS().this_processor; )
 
 	// push the thread to the cluster ready-queue
@@ -470,8 +586,12 @@
 		if( kernelTLS().this_stats ) {
 			__tls_stats()->ready.threads.threads++;
+			if(outside) {
+				__tls_stats()->ready.threads.extunpark++;
+			}
 			__push_stat( __tls_stats(), __tls_stats()->ready.threads.threads, false, "Processor", kernelTLS().this_processor );
 		}
 		else {
 			__atomic_fetch_add(&cl->stats->ready.threads.threads, 1, __ATOMIC_RELAXED);
+			__atomic_fetch_add(&cl->stats->ready.threads.extunpark, 1, __ATOMIC_RELAXED);
 			__push_stat( cl->stats, cl->stats->ready.threads.threads, true, "Cluster", cl );
 		}
@@ -508,5 +628,12 @@
 
 	ready_schedule_lock();
-		$thread * thrd = pop_slow( this );
+		$thread * thrd;
+		for(25) {
+			thrd = pop_slow( this );
+			if(thrd) goto RET;
+		}
+		thrd = pop_search( this );
+
+		RET:
 	ready_schedule_unlock();
 
@@ -532,4 +659,19 @@
 }
 
+void __kernel_unpark( $thread * thrd ) {
+	/* paranoid */ verify( ! __preemption_enabled() );
+	/* paranoid */ verify( ready_schedule_islocked());
+
+	if( !thrd ) return;
+
+	if(__must_unpark(thrd)) {
+		// Wake lost the race, so this side must schedule the thread
+		__schedule_thread( thrd );
+	}
+
+	/* paranoid */ verify( ready_schedule_islocked());
+	/* paranoid */ verify( ! __preemption_enabled() );
+}
+
 void unpark( $thread * thrd ) {
 	if( !thrd ) return;
@@ -744,4 +886,5 @@
 
 static inline bool __maybe_io_drain( processor * proc ) {
+	bool ret = false;
 	#if defined(CFA_HAVE_LINUX_IO_URING_H)
 		__cfadbg_print_safe(runtime_core, "Kernel : core %p checking io for ring %d\n", proc, proc->io.ctx->fd);
@@ -752,6 +895,9 @@
 		unsigned tail = *ctx->cq.tail;
 		if(head == tail) return false;
-		return __cfa_io_drain( proc );
+		ready_schedule_lock();
+		ret = __cfa_io_drain( proc );
+		ready_schedule_unlock();
 	#endif
+	return ret;
 }
 
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision 00e9be94eec3b41099f7decdab5a57d6edea62ed)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision 3ec79f71ac1364f434c450785a53ca1807f141a7)
@@ -447,4 +447,6 @@
 	link.next = 0p;
 	link.prev = 0p;
+	link.preferred = -1u;
+	last_proc = 0p;
 	#if defined( __CFA_WITH_VERIFY__ )
 		canary = 0x0D15EA5E0D15EA5Ep;
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 00e9be94eec3b41099f7decdab5a57d6edea62ed)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision 3ec79f71ac1364f434c450785a53ca1807f141a7)
@@ -284,5 +284,5 @@
 
 //-----------------------------------------------------------------------
-// pop thread from the ready queue of a cluster
+// pop thread from the local queues of a cluster
 // returns 0p if empty
 // May return 0p spuriously
@@ -290,8 +290,14 @@
 
 //-----------------------------------------------------------------------
-// pop thread from the ready queue of a cluster
+// pop thread from any ready queue of a cluster
+// returns 0p if empty
+// May return 0p spuriously
+__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr);
+
+//-----------------------------------------------------------------------
+// search all ready queues of a cluster for any thread
 // returns 0p if empty
 // guaranteed to find any threads added before this call
-__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr);
+__attribute__((hot)) struct $thread * pop_search(struct cluster * cltr);
 
 //-----------------------------------------------------------------------
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision 00e9be94eec3b41099f7decdab5a57d6edea62ed)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision 3ec79f71ac1364f434c450785a53ca1807f141a7)
@@ -344,5 +344,6 @@
 	}
 
-	__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr) {
+	__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr) { return pop_fast(cltr); }
+	__attribute__((hot)) struct $thread * pop_search(struct cluster * cltr) {
 		return search(cltr);
 	}
@@ -436,10 +437,9 @@
 
 	__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
-		for(25) {
-			unsigned i = __tls_rand() % lanes.count;
-			$thread * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.steal));
-			if(t) return t;
-		}
-
+		unsigned i = __tls_rand() % lanes.count;
+		return try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.steal));
+	}
+
+	__attribute__((hot)) struct $thread * pop_search(struct cluster * cltr) with (cltr->ready_queue) {
 		return search(cltr);
 	}
Index: libcfa/src/concurrency/stats.cfa
===================================================================
--- libcfa/src/concurrency/stats.cfa	(revision 00e9be94eec3b41099f7decdab5a57d6edea62ed)
+++ libcfa/src/concurrency/stats.cfa	(revision 3ec79f71ac1364f434c450785a53ca1807f141a7)
@@ -38,4 +38,5 @@
 		stats->ready.pop.search.espec   = 0;
 		stats->ready.threads.migration = 0;
+		stats->ready.threads.extunpark = 0;
 		stats->ready.threads.threads   = 0;
 		stats->ready.sleep.halts   = 0;
@@ -95,4 +96,5 @@
 		__atomic_fetch_add( &cltr->ready.pop.search.espec  , proc->ready.pop.search.espec  , __ATOMIC_SEQ_CST ); proc->ready.pop.search.espec   = 0;
 		__atomic_fetch_add( &cltr->ready.threads.migration , proc->ready.threads.migration , __ATOMIC_SEQ_CST ); proc->ready.threads.migration  = 0;
+		__atomic_fetch_add( &cltr->ready.threads.extunpark , proc->ready.threads.extunpark , __ATOMIC_SEQ_CST ); proc->ready.threads.extunpark  = 0;
 		__atomic_fetch_add( &cltr->ready.threads.threads   , proc->ready.threads.threads   , __ATOMIC_SEQ_CST ); proc->ready.threads.threads    = 0;
 		__atomic_fetch_add( &cltr->ready.sleep.halts       , proc->ready.sleep.halts       , __ATOMIC_SEQ_CST ); proc->ready.sleep.halts        = 0;
@@ -132,5 +134,5 @@
 			uint64_t totalR = ready.pop.local.success + ready.pop.help.success + ready.pop.steal.success + ready.pop.search.success;
 			uint64_t totalS = ready.push.local.success + ready.push.share.success + ready.push.extrn.success;
-			sstr | "- totals   : " | eng3(totalR) | "run," | eng3(totalS) | "schd (" | eng3(ready.push.extrn.success) | "ext," | eng3(ready.threads.migration) | "mig)";
+			sstr | "- totals   : " | eng3(totalR) | "run," | eng3(totalS) | "schd (" | eng3(ready.push.extrn.success) | "ext," | eng3(ready.threads.migration) | "mig," | eng3(ready.threads.extunpark) | "eupk)";
 
 			double push_len = ((double)ready.push.local.attempt + ready.push.share.attempt + ready.push.extrn.attempt) / totalS;
Index: libcfa/src/concurrency/stats.hfa
===================================================================
--- libcfa/src/concurrency/stats.hfa	(revision 00e9be94eec3b41099f7decdab5a57d6edea62ed)
+++ libcfa/src/concurrency/stats.hfa	(revision 3ec79f71ac1364f434c450785a53ca1807f141a7)
@@ -70,4 +70,5 @@
 		struct {
 			volatile uint64_t migration;
+			volatile uint64_t extunpark;
 			volatile  int64_t threads; // number of threads in the system, includes only local change
 		} threads;
Index: libcfa/src/concurrency/thread.cfa
===================================================================
--- libcfa/src/concurrency/thread.cfa	(revision 00e9be94eec3b41099f7decdab5a57d6edea62ed)
+++ libcfa/src/concurrency/thread.cfa	(revision 3ec79f71ac1364f434c450785a53ca1807f141a7)
@@ -39,4 +39,6 @@
 	link.next = 0p;
 	link.prev = 0p;
+	link.preferred = -1u;
+	last_proc = 0p;
 	#if defined( __CFA_WITH_VERIFY__ )
 		canary = 0x0D15EA5E0D15EA5Ep;
