Index: src/libcfa/bits/locks.h
===================================================================
--- src/libcfa/bits/locks.h	(revision 214e8da33e6cb23acc53448d1c57eea90c6f03ae)
+++ src/libcfa/bits/locks.h	(revision ea8b2f7e16005c4ecdb2566b748c8eb02f60f32b)
@@ -18,4 +18,11 @@
 #include "bits/debug.h"
 #include "bits/defs.h"
+#include <assert.h>
+
+#ifdef __cforall
+	extern "C" {
+		#include <pthread.h>
+	}
+#endif
 
 // pause to prevent excess processor bus usage
@@ -112,3 +119,45 @@
 		__atomic_clear( &this.lock, __ATOMIC_RELEASE );
 	}
+
+
+	#ifdef __CFA_WITH_VERIFY__
+		extern bool __cfaabi_dbg_in_kernel();
+	#endif
+
+	struct __bin_sem_t {
+		int_fast8_t     counter;
+		pthread_mutex_t lock;
+		pthread_cond_t  cond;
+	};
+
+	static inline void ?{}(__bin_sem_t & this) with( this ) {
+		counter = 0;
+		pthread_mutex_init(&lock, NULL);
+		pthread_cond_init (&cond, NULL);
+	}
+
+	static inline void ^?{}(__bin_sem_t & this) with( this ) {
+		pthread_mutex_destroy(&lock);
+		pthread_cond_destroy (&cond);
+	}
+
+	static inline void wait(__bin_sem_t & this) with( this ) {
+		verify(__cfaabi_dbg_in_kernel());
+		pthread_mutex_lock(&lock);
+		if(counter != 0) {   // this must be a loop, not if!
+			pthread_cond_wait(&cond, &lock);
+		}
+		counter = 1;
+		pthread_mutex_unlock(&lock);
+	}
+
+	static inline void post(__bin_sem_t & this) with( this ) {
+		verify(__cfaabi_dbg_in_kernel());
+		pthread_mutex_lock(&lock);
+		bool needs_signal = counter == 0;
+		counter = 1;
+		pthread_mutex_unlock(&lock);
+		if (!needs_signal)
+			pthread_cond_signal(&cond);
+		}
 #endif
Index: src/libcfa/concurrency/kernel
===================================================================
--- src/libcfa/concurrency/kernel	(revision 214e8da33e6cb23acc53448d1c57eea90c6f03ae)
+++ src/libcfa/concurrency/kernel	(revision ea8b2f7e16005c4ecdb2566b748c8eb02f60f32b)
@@ -133,7 +133,8 @@
 	// Idle lock
 	sem_t idleLock;
+	// __bin_sem_t idleLock;
 
 	// Link lists fields
-	struct {
+	struct __dbg_node_proc {
 		struct processor * next;
 		struct processor * prev;
@@ -182,5 +183,5 @@
 
 	// Link lists fields
-	struct {
+	struct __dbg_node_cltr {
 		cluster * next;
 		cluster * prev;
Index: src/libcfa/concurrency/kernel.c
===================================================================
--- src/libcfa/concurrency/kernel.c	(revision 214e8da33e6cb23acc53448d1c57eea90c6f03ae)
+++ src/libcfa/concurrency/kernel.c	(revision ea8b2f7e16005c4ecdb2566b748c8eb02f60f32b)
@@ -17,4 +17,5 @@
 #include <stddef.h>
 #include <errno.h>
+#include <string.h>
 extern "C" {
 #include <stdio.h>
@@ -50,5 +51,7 @@
 thread_desc * mainThread;
 
-struct { __dllist_t(cluster) list; __spinlock_t lock; } global_clusters;
+extern "C" {
+struct { __dllist_t(cluster) list; __spinlock_t lock; } __cfa_dbg_global_clusters;
+}
 
 //-----------------------------------------------------------------------------
@@ -150,8 +153,8 @@
 
 void ^?{}(processor & this) with( this ){
-	if( ! do_terminate ) {
+	if( ! __atomic_load_n(&do_terminate, __ATOMIC_ACQUIRE) ) {
 		__cfaabi_dbg_print_safe("Kernel : core %p signaling termination\n", &this);
 		terminate(&this);
-		verify(this.do_terminate);
+		verify( __atomic_load_n(&do_terminate, __ATOMIC_SEQ_CST) );
 		verify( kernelTLS.this_processor != &this);
 		P( terminated );
@@ -199,5 +202,5 @@
 
 		thread_desc * readyThread = NULL;
-		for( unsigned int spin_count = 0; ! this->do_terminate; spin_count++ )
+		for( unsigned int spin_count = 0; ! __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST); spin_count++ )
 		{
 			readyThread = nextThread( this->cltr );
@@ -218,5 +221,6 @@
 			else
 			{
-				spin(this, &spin_count);
+				// spin(this, &spin_count);
+				halt(this);
 			}
 		}
@@ -545,6 +549,6 @@
 	__cfaabi_dbg_print_safe("Kernel : Starting\n");
 
-	global_clusters.list{ __get };
-	global_clusters.lock{};
+	__cfa_dbg_global_clusters.list{ __get };
+	__cfa_dbg_global_clusters.lock{};
 
 	// Initialize the main cluster
@@ -627,6 +631,7 @@
 	// When its coroutine terminates, it return control to the mainThread
 	// which is currently here
-	mainProcessor->do_terminate = true;
+	__atomic_store_n(&mainProcessor->do_terminate, true, __ATOMIC_RELEASE);
 	returnToKernel();
+	mainThread->self_cor.state = Halted;
 
 	// THE SYSTEM IS NOW COMPLETELY STOPPED
@@ -644,6 +649,6 @@
 	^(mainThread){};
 
-	^(global_clusters.list){};
-	^(global_clusters.lock){};
+	^(__cfa_dbg_global_clusters.list){};
+	^(__cfa_dbg_global_clusters.lock){};
 
 	__cfaabi_dbg_print_safe("Kernel : Shutdown complete\n");
@@ -655,4 +660,6 @@
 
 void halt(processor * this) with( *this ) {
+	verify( ! __atomic_load_n(&do_terminate, __ATOMIC_SEQ_CST) );
+
 	with( *cltr ) {
 		lock      (proc_list_lock __cfaabi_dbg_ctx2);
@@ -664,7 +671,15 @@
 	__cfaabi_dbg_print_safe("Kernel : Processor %p ready to sleep\n", this);
 
-	verify( ({int sval = 0; sem_getvalue(&this->idleLock, &sval); sval; }) < 200);
+	// #ifdef __CFA_WITH_VERIFY__
+	// 	int sval = 0;
+	// 	sem_getvalue(&this->idleLock, &sval);
+	// 	verifyf(sval < 200, "Binary semaphore reached value %d : \n", sval);
+	// #endif
+
+	verify( ! __atomic_load_n(&do_terminate, __ATOMIC_SEQ_CST) );
 	int __attribute__((unused)) ret = sem_wait(&idleLock);
-	verify(ret > 0 || errno == EINTR);
+	// verifyf(ret >= 0 || errno == EINTR, "Sem_wait returned %d (errno %d : %s\n", ret, errno, strerror(errno));
+
+	// wait( idleLock );
 
 	__cfaabi_dbg_print_safe("Kernel : Processor %p woke up and ready to run\n", this);
@@ -681,6 +696,13 @@
 	__cfaabi_dbg_print_safe("Kernel : Waking up processor %p\n", this);
 	int __attribute__((unused)) ret = sem_post(&this->idleLock);
-	verify(ret > 0 || errno == EINTR);
-	verify( ({int sval = 0; sem_getvalue(&this->idleLock, &sval); sval; }) < 200);
+	// verifyf(ret >= 0 || errno == EINTR, "Sem_post returned %d (errno %d : %s\n", ret, errno, strerror(errno));
+
+	// #ifdef __CFA_WITH_VERIFY__
+	// 	int sval = 0;
+	// 	sem_getvalue(&this->idleLock, &sval);
+	// 	verifyf(sval < 200, "Binary semaphore reached value %d\n", sval);
+	// #endif
+
+	// post( this->idleLock );
 }
 
@@ -798,13 +820,13 @@
 // Global Queues
 void doregister( cluster     & cltr ) {
-	lock      ( global_clusters.lock __cfaabi_dbg_ctx2);
-	push_front( global_clusters.list, cltr );
-	unlock    ( global_clusters.lock );
+	lock      ( __cfa_dbg_global_clusters.lock __cfaabi_dbg_ctx2);
+	push_front( __cfa_dbg_global_clusters.list, cltr );
+	unlock    ( __cfa_dbg_global_clusters.lock );
 }
 
 void unregister( cluster     & cltr ) {
-	lock  ( global_clusters.lock __cfaabi_dbg_ctx2);
-	remove( global_clusters.list, cltr );
-	unlock( global_clusters.lock );
+	lock  ( __cfa_dbg_global_clusters.lock __cfaabi_dbg_ctx2);
+	remove( __cfa_dbg_global_clusters.list, cltr );
+	unlock( __cfa_dbg_global_clusters.lock );
 }
 
Index: src/libcfa/concurrency/preemption.c
===================================================================
--- src/libcfa/concurrency/preemption.c	(revision 214e8da33e6cb23acc53448d1c57eea90c6f03ae)
+++ src/libcfa/concurrency/preemption.c	(revision ea8b2f7e16005c4ecdb2566b748c8eb02f60f32b)
@@ -265,7 +265,9 @@
 // kill wrapper : signal a processor
 void terminate(processor * this) {
-	this->do_terminate = true;
-	wake(this);
+	disable_interrupts();
+	__atomic_store_n(&this->do_terminate, true, __ATOMIC_SEQ_CST);
+	wake( this );
 	sigval_t value = { PREEMPT_TERMINATE };
+	enable_interrupts();
 	pthread_sigqueue( this->kernel_thread, SIGUSR1, value );
 }
@@ -369,5 +371,5 @@
 	choose(sfp->si_value.sival_int) {
 		case PREEMPT_NORMAL   : ;// Normal case, nothing to do here
-		case PREEMPT_TERMINATE: verify( kernelTLS.this_processor->do_terminate);
+		case PREEMPT_TERMINATE: verify( __atomic_load_n( &kernelTLS.this_processor->do_terminate, __ATOMIC_SEQ_CST ) );
 		default:
 			abort( "internal error, signal value is %d", sfp->si_value.sival_int );
@@ -488,4 +490,10 @@
 }
 
+#ifdef __CFA_WITH_VERIFY__
+bool __cfaabi_dbg_in_kernel() {
+	return !kernelTLS.preemption_state.enabled;
+}
+#endif
+
 // Local Variables: //
 // mode: c //
Index: src/libcfa/stdhdr/assert.h
===================================================================
--- src/libcfa/stdhdr/assert.h	(revision 214e8da33e6cb23acc53448d1c57eea90c6f03ae)
+++ src/libcfa/stdhdr/assert.h	(revision ea8b2f7e16005c4ecdb2566b748c8eb02f60f32b)
@@ -33,4 +33,5 @@
 	#define verify(x) assert(x)
 	#define verifyf(x, ...) assertf(x, __VA_ARGS__)
+	#define __CFA_WITH_VERIFY__
 #else
 	#define verify(x)
Index: src/prelude/sync-builtins.cf
===================================================================
--- src/prelude/sync-builtins.cf	(revision 214e8da33e6cb23acc53448d1c57eea90c6f03ae)
+++ src/prelude/sync-builtins.cf	(revision ea8b2f7e16005c4ecdb2566b748c8eb02f60f32b)
@@ -248,4 +248,6 @@
 #endif
 
+_Bool __atomic_load_n(const volatile _Bool *, int);
+void __atomic_load(const volatile _Bool *, volatile _Bool *, int);
 char __atomic_load_n(const volatile char *, int);
 char __atomic_load_1(const volatile char *, int);
@@ -285,5 +287,4 @@
 
 void __atomic_store_n(volatile _Bool *, _Bool, int);
-void __atomic_store_1(volatile _Bool *, _Bool, int);
 void __atomic_store(volatile _Bool *, _Bool *, int);
 void __atomic_store_n(volatile char *, char, int);
Index: src/tests/preempt_longrun/processor.c
===================================================================
--- src/tests/preempt_longrun/processor.c	(revision 214e8da33e6cb23acc53448d1c57eea90c6f03ae)
+++ src/tests/preempt_longrun/processor.c	(revision ea8b2f7e16005c4ecdb2566b748c8eb02f60f32b)
@@ -2,4 +2,6 @@
 #include <thread>
 #include <time>
+
+#include <unistd.h>
 
 #ifndef PREEMPTION_RATE
@@ -15,7 +17,9 @@
 int main(int argc, char* argv[]) {
 	processor * p[15];
+	write(STDERR_FILENO, "Preparing\n", sizeof("Preparing\n"));
 	for ( int pi = 0; pi < 15; pi++ ) {
 		p[pi] = new();
 	}
+	write(STDERR_FILENO, "Starting\n", sizeof("Starting\n"));
 	for ( int i = 0; i < N; i++) {
 		int pi = i % 15;
@@ -23,6 +27,8 @@
 		p[pi] = new();
 	}
+	write(STDERR_FILENO, "Stopping\n", sizeof("Stopping\n"));
 	for ( int pi = 0; pi < 15; pi++ ) {
 		delete( p[pi] );
 	}
+	write(STDERR_FILENO, "Done\n", sizeof("Done\n"));
 }
