Index: libcfa/src/concurrency/invoke.h
===================================================================
--- libcfa/src/concurrency/invoke.h	(revision 4f7b418b1407ea6b3c3af7a736618baceddcbbc1)
+++ libcfa/src/concurrency/invoke.h	(revision 9f575eacc1e359b231c369fde8ace4f941121f26)
@@ -92,5 +92,5 @@
 	};
 
-	enum coroutine_state { Halted, Start, Inactive, Active, Primed };
+	enum coroutine_state { Halted, Start, Primed, Inactive, Active, Rerun, Reschedule };
 
 	struct coroutine_desc {
@@ -164,5 +164,6 @@
 
 		// current execution status for coroutine
-		enum coroutine_state state;
+		volatile int state;
+		int preempted;
 
 		//SKULLDUGGERY errno is not save in the thread data structure because returnToKernel appears to be the only function to require saving and restoring it
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 4f7b418b1407ea6b3c3af7a736618baceddcbbc1)
+++ libcfa/src/concurrency/kernel.cfa	(revision 9f575eacc1e359b231c369fde8ace4f941121f26)
@@ -327,15 +327,70 @@
 	kernelTLS.this_thread = thrd_dst;
 
-	// set state of processor coroutine to inactive and the thread to active
-	proc_cor->state = proc_cor->state == Halted ? Halted : Inactive;
-	thrd_dst->state = Active;
-
-	// set context switch to the thread that the processor is executing
-	verify( thrd_dst->context.SP );
-	CtxSwitch( &proc_cor->context, &thrd_dst->context );
-	// when CtxSwitch returns we are back in the processor coroutine
+	// set state of processor coroutine to inactive
+	verify(proc_cor->state == Active);
+	proc_cor->state = Inactive;
+
+	// Actually run the thread
+	RUN:
+	{
+		if(unlikely(thrd_dst->preempted)) {
+			thrd_dst->preempted = false;
+		} else {
+			thrd_dst->state = Active;
+		}
+
+		// set context switch to the thread that the processor is executing
+		verify( thrd_dst->context.SP );
+		CtxSwitch( &proc_cor->context, &thrd_dst->context );
+		// when CtxSwitch returns we are back in the processor coroutine
+	}
+
+	// We just finished running a thread, there are a few things that could have happened.
+	// 1 - Regular case : the thread has blocked and no one has scheduled it yet.
+	// 2 - Racy case    : the thread has blocked but someone has already tried to schedule it.
+	// 3 - Polite Racy case : the thread has blocked, someone has already tried to schedule it, but the thread is nice and wants to go through the ready-queue anyway
+	// 4 - Preempted
+	// In case 1, we may have won a race so we can't write to the state again.
+	// In case 2, we lost the race so we now own the thread.
+	// In case 3, we lost the race but can just reschedule the thread.
+
+	if(unlikely(thrd_dst->preempted)) {
+		// The thread was preempted, reschedule it and reset the flag
+		ScheduleThread( thrd_dst );
+
+		// Just before returning to the processor, set the processor coroutine to active
+		proc_cor->state = Active;
+		return;
+	}
 
 	// set state of processor coroutine to active and the thread to inactive
-	thrd_dst->state = thrd_dst->state == Halted ? Halted : Inactive;
+	enum coroutine_state old_state = __atomic_exchange_n(&thrd_dst->state, Inactive, __ATOMIC_SEQ_CST);
+	switch(old_state) {
+		case Halted:
+			// The thread has halted, it should never be scheduled/run again, set it back to Halted and move on
+			thrd_dst->state = Halted;
+			break;
+		case Active:
+			// This is case 1, the regular case, nothing more is needed
+			break;
+		case Rerun:
+			// This is case 2, the racy case, someone tried to run this thread before it finished blocking
+			// In this case, just run it again.
+			goto RUN;
+		case Reschedule:
+			// This is case 3, someone tried to run this before it finished blocking
+			// but it must go through the ready-queue
+			thrd_dst->state = Inactive;  /* restore invariant */
+			ScheduleThread( thrd_dst );
+			break;
+		case Inactive:
+		case Start:
+		case Primed:
+		default:
+			// This makes no sense, something is wrong — abort
+			abort("Finished running a thread that was Inactive/Start/Primed %d\n", old_state);
+	}
+
+	// Just before returning to the processor, set the processor coroutine to active
 	proc_cor->state = Active;
 }
@@ -343,28 +398,23 @@
 // KERNEL_ONLY
 static void returnToKernel() {
+	verify( ! kernelTLS.preemption_state.enabled );
 	coroutine_desc * proc_cor = get_coroutine(kernelTLS.this_processor->runner);
 	thread_desc * thrd_src = kernelTLS.this_thread;
 
-	// set state of current coroutine to inactive
-	thrd_src->state = thrd_src->state == Halted ? Halted : Inactive;
-	proc_cor->state = Active;
-	int local_errno = *__volatile_errno();
-	#if defined( __i386 ) || defined( __x86_64 )
-		__x87_store;
-	#endif
-
-	// set new coroutine that the processor is executing
-	// and context switch to it
-	verify( proc_cor->context.SP );
-	CtxSwitch( &thrd_src->context, &proc_cor->context );
-
-	// set state of new coroutine to active
-	proc_cor->state = proc_cor->state == Halted ? Halted : Inactive;
-	thrd_src->state = Active;
-
-	#if defined( __i386 ) || defined( __x86_64 )
-		__x87_load;
-	#endif
-	*__volatile_errno() = local_errno;
+	// Run the thread on this processor
+	{
+		int local_errno = *__volatile_errno();
+		#if defined( __i386 ) || defined( __x86_64 )
+			__x87_store;
+		#endif
+		verify( proc_cor->context.SP );
+		CtxSwitch( &thrd_src->context, &proc_cor->context );
+		#if defined( __i386 ) || defined( __x86_64 )
+			__x87_load;
+		#endif
+		*__volatile_errno() = local_errno;
+	}
+
+	verify( ! kernelTLS.preemption_state.enabled );
 }
 
@@ -374,4 +424,5 @@
 static void finishRunning(processor * this) with( this->finish ) {
 	verify( ! kernelTLS.preemption_state.enabled );
+	verify( action_code == No_Action );
 	choose( action_code ) {
 	case No_Action:
@@ -532,32 +583,26 @@
 
 // KERNEL ONLY
-void ScheduleThread( thread_desc * thrd ) {
-	verify( thrd );
-	verify( thrd->state != Halted );
-
-	verify( ! kernelTLS.preemption_state.enabled );
-
-	verifyf( thrd->next == 0p, "Expected null got %p", thrd->next );
-
-	with( *thrd->curr_cluster ) {
-		lock  ( ready_queue_lock __cfaabi_dbg_ctx2 );
-		bool was_empty = !(ready_queue != 0);
-		append( ready_queue, thrd );
-		unlock( ready_queue_lock );
-
-		if(was_empty) {
-			lock      (proc_list_lock __cfaabi_dbg_ctx2);
-			if(idles) {
-				wake_fast(idles.head);
-			}
-			unlock    (proc_list_lock);
+void ScheduleThread( thread_desc * thrd ) with( *thrd->curr_cluster ) {
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verifyf( thrd->state == Inactive || thrd->state == Start || thrd->preempted, "state : %d, preempted %d\n", thrd->state, thrd->preempted);
+	/* paranoid */ verifyf( thrd->next == 0p, "Expected null got %p", thrd->next );
+
+	lock  ( ready_queue_lock __cfaabi_dbg_ctx2 );
+	bool was_empty = !(ready_queue != 0);
+	append( ready_queue, thrd );
+	unlock( ready_queue_lock );
+
+	if(was_empty) {
+		lock      (proc_list_lock __cfaabi_dbg_ctx2);
+		if(idles) {
+			wake_fast(idles.head);
 		}
-		else if( struct processor * idle = idles.head ) {
-			wake_fast(idle);
-		}
-
-	}
-
-	verify( ! kernelTLS.preemption_state.enabled );
+		unlock    (proc_list_lock);
+	}
+	else if( struct processor * idle = idles.head ) {
+		wake_fast(idle);
+	}
+
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
 }
 
@@ -582,8 +627,5 @@
 void BlockInternal( __spinlock_t * lock ) {
 	disable_interrupts();
-	with( *kernelTLS.this_processor ) {
-		finish.action_code = Release;
-		finish.lock        = lock;
-	}
+	unlock( *lock );
 
 	verify( ! kernelTLS.preemption_state.enabled );
@@ -596,8 +638,5 @@
 void BlockInternal( thread_desc * thrd ) {
 	disable_interrupts();
-	with( * kernelTLS.this_processor ) {
-		finish.action_code = Schedule;
-		finish.thrd        = thrd;
-	}
+	WakeThread( thrd, false );
 
 	verify( ! kernelTLS.preemption_state.enabled );
@@ -609,11 +648,7 @@
 
 void BlockInternal( __spinlock_t * lock, thread_desc * thrd ) {
-	assert(thrd);
 	disable_interrupts();
-	with( * kernelTLS.this_processor ) {
-		finish.action_code = Release_Schedule;
-		finish.lock        = lock;
-		finish.thrd        = thrd;
-	}
+	unlock( *lock );
+	WakeThread( thrd, false );
 
 	verify( ! kernelTLS.preemption_state.enabled );
@@ -626,8 +661,6 @@
 void BlockInternal(__spinlock_t * locks [], unsigned short count) {
 	disable_interrupts();
-	with( * kernelTLS.this_processor ) {
-		finish.action_code = Release_Multi;
-		finish.locks       = locks;
-		finish.lock_count  = count;
+	for(int i = 0; i < count; i++) {
+		unlock( *locks[i] );
 	}
 
@@ -641,10 +674,9 @@
 void BlockInternal(__spinlock_t * locks [], unsigned short lock_count, thread_desc * thrds [], unsigned short thrd_count) {
 	disable_interrupts();
-	with( *kernelTLS.this_processor ) {
-		finish.action_code = Release_Multi_Schedule;
-		finish.locks       = locks;
-		finish.lock_count  = lock_count;
-		finish.thrds       = thrds;
-		finish.thrd_count  = thrd_count;
+	for(int i = 0; i < lock_count; i++) {
+		unlock( *locks[i] );
+	}
+	for(int i = 0; i < thrd_count; i++) {
+		WakeThread( thrds[i], false );
 	}
 
@@ -658,8 +690,5 @@
 void BlockInternal(__finish_callback_fptr_t callback) {
 	disable_interrupts();
-	with( *kernelTLS.this_processor ) {
-		finish.action_code = Callback;
-		finish.callback    = callback;
-	}
+	callback();
 
 	verify( ! kernelTLS.preemption_state.enabled );
@@ -673,9 +702,6 @@
 void LeaveThread(__spinlock_t * lock, thread_desc * thrd) {
 	verify( ! kernelTLS.preemption_state.enabled );
-	with( * kernelTLS.this_processor ) {
-		finish.action_code = thrd ? Release_Schedule : Release;
-		finish.lock        = lock;
-		finish.thrd        = thrd;
-	}
+	unlock( *lock );
+	WakeThread( thrd, false );
 
 	returnToKernel();
@@ -932,5 +958,5 @@
 
 	// make new owner
-	WakeThread( thrd );
+	WakeThread( thrd, false );
 }
 
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 4f7b418b1407ea6b3c3af7a736618baceddcbbc1)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision 9f575eacc1e359b231c369fde8ace4f941121f26)
@@ -31,12 +31,33 @@
 }
 
-void ScheduleThread( thread_desc * );
-static inline void WakeThread( thread_desc * thrd ) {
+void ScheduleThread( thread_desc * ) __attribute__((nonnull (1)));
+static inline void WakeThread( thread_desc * thrd, bool must_yield ) {
 	if( !thrd ) return;
 
-	verify(thrd->state == Inactive);
+	enum coroutine_state new_state = must_yield ? Reschedule : Rerun;
 
 	disable_interrupts();
-	ScheduleThread( thrd );
+	static_assert(sizeof(thrd->state) == sizeof(int));
+	enum coroutine_state old_state = (enum coroutine_state)__atomic_exchange_n((volatile int *)&thrd->state, (int)new_state, __ATOMIC_SEQ_CST);
+	switch(old_state) {
+		case Active:
+			// Wake won the race, the thread will reschedule/rerun itself
+			break;
+		case Inactive:
+			// Wake lost the race,
+			thrd->state = Inactive;
+			ScheduleThread( thrd );
+			break;
+		case Rerun:
+		case Reschedule:
+			abort("More than one thread attempted to schedule thread %p\n", thrd);
+			break;
+		case Halted:
+		case Start:
+		case Primed:
+		default:
+			// This makes no sense, something is wrong — abort
+			abort();
+	}
 	enable_interrupts( __cfaabi_dbg_ctx );
 }
Index: libcfa/src/concurrency/monitor.cfa
===================================================================
--- libcfa/src/concurrency/monitor.cfa	(revision 4f7b418b1407ea6b3c3af7a736618baceddcbbc1)
+++ libcfa/src/concurrency/monitor.cfa	(revision 9f575eacc1e359b231c369fde8ace4f941121f26)
@@ -225,5 +225,5 @@
 
 		//We need to wake-up the thread
-		WakeThread( new_owner );
+		WakeThread( new_owner, false );
 	}
 
Index: libcfa/src/concurrency/mutex.cfa
===================================================================
--- libcfa/src/concurrency/mutex.cfa	(revision 4f7b418b1407ea6b3c3af7a736618baceddcbbc1)
+++ libcfa/src/concurrency/mutex.cfa	(revision 9f575eacc1e359b231c369fde8ace4f941121f26)
@@ -63,5 +63,5 @@
 	this.is_locked = (this.blocked_threads != 0);
 	WakeThread(
-		pop_head( this.blocked_threads )
+		pop_head( this.blocked_threads ), false
 	);
 	unlock( this.lock );
@@ -121,5 +121,5 @@
 		owner = thrd;
 		recursion_count = (thrd ? 1 : 0);
-		WakeThread( thrd );
+		WakeThread( thrd, false );
 	}
 	unlock( lock );
@@ -139,5 +139,5 @@
 	lock( lock __cfaabi_dbg_ctx2 );
 	WakeThread(
-		pop_head( this.blocked_threads )
+		pop_head( this.blocked_threads ), false
 	);
 	unlock( lock );
@@ -148,5 +148,5 @@
 	while(this.blocked_threads) {
 		WakeThread(
-			pop_head( this.blocked_threads )
+			pop_head( this.blocked_threads ), false
 		);
 	}
Index: libcfa/src/concurrency/preemption.cfa
===================================================================
--- libcfa/src/concurrency/preemption.cfa	(revision 4f7b418b1407ea6b3c3af7a736618baceddcbbc1)
+++ libcfa/src/concurrency/preemption.cfa	(revision 9f575eacc1e359b231c369fde8ace4f941121f26)
@@ -394,5 +394,6 @@
 	// Preemption can occur here
 
-	BlockInternal( kernelTLS.this_thread ); // Do the actual CtxSwitch
+	kernelTLS.this_thread->preempted = true;
+	BlockInternal(); // Do the actual CtxSwitch
 }
 
Index: libcfa/src/concurrency/thread.cfa
===================================================================
--- libcfa/src/concurrency/thread.cfa	(revision 4f7b418b1407ea6b3c3af7a736618baceddcbbc1)
+++ libcfa/src/concurrency/thread.cfa	(revision 9f575eacc1e359b231c369fde8ace4f941121f26)
@@ -36,4 +36,5 @@
 	self_cor{ name, storage, storageSize };
 	state = Start;
+	preempted = false;
 	curr_cor = &self_cor;
 	self_mon.owner = &this;
