Index: libcfa/prelude/defines.hfa.in
===================================================================
--- libcfa/prelude/defines.hfa.in	(revision f66605204cbdef9351e5c18bbb41c70d7643bbbb)
+++ libcfa/prelude/defines.hfa.in	(revision 4069faade93e68df42b8c7af24253b6a041b469b)
@@ -16,4 +16,4 @@
 #undef HAVE_LINUX_IO_URING_H
 
-#define __CFA_IO_POLLING_USER__
+// #define __CFA_IO_POLLING_USER__
 // #define __CFA_IO_POLLING_KERNEL__
Index: libcfa/src/bits/locks.hfa
===================================================================
--- libcfa/src/bits/locks.hfa	(revision f66605204cbdef9351e5c18bbb41c70d7643bbbb)
+++ libcfa/src/bits/locks.hfa	(revision 4069faade93e68df42b8c7af24253b6a041b469b)
@@ -113,13 +113,13 @@
 
 	struct __bin_sem_t {
-		bool     		signaled;
 		pthread_mutex_t 	lock;
 		pthread_cond_t  	cond;
+		int     		val;
 	};
 
 	static inline void ?{}(__bin_sem_t & this) with( this ) {
-		signaled = false;
 		pthread_mutex_init(&lock, NULL);
 		pthread_cond_init (&cond, NULL);
+		val = 0;
 	}
 
@@ -132,18 +132,21 @@
 		verify(__cfaabi_dbg_in_kernel());
 		pthread_mutex_lock(&lock);
-			if(!signaled) {   // this must be a loop, not if!
+			while(val < 1) {
 				pthread_cond_wait(&cond, &lock);
 			}
-			signaled = false;
+			val -= 1;
 		pthread_mutex_unlock(&lock);
 	}
 
 	static inline bool post(__bin_sem_t & this) with( this ) {
+		bool needs_signal = false;
+
 		pthread_mutex_lock(&lock);
-			bool needs_signal = !signaled;
-			signaled = true;
+			if(val < 1) {
+				val += 1;
+				pthread_cond_signal(&cond);
+				needs_signal = true;
+			}
 		pthread_mutex_unlock(&lock);
-
-		if (needs_signal) pthread_cond_signal(&cond);
 
 		return needs_signal;
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision f66605204cbdef9351e5c18bbb41c70d7643bbbb)
+++ libcfa/src/concurrency/io.cfa	(revision 4069faade93e68df42b8c7af24253b6a041b469b)
@@ -14,4 +14,6 @@
 //
 
+// #define __CFA_DEBUG_PRINT_IO__
+
 #include "kernel.hfa"
 
@@ -210,13 +212,16 @@
 	void __kernel_io_finish_start( cluster & this ) {
 		#if defined(__CFA_IO_POLLING_USER__)
-			(this.io.poller.fast){ this };
+			__cfadbg_print_safe(io, "Kernel I/O : Creating fast poller for cluster %p\n", &this);
+			(this.io.poller.fast){ "Fast IO Poller", this };
 			__thrd_start( this.io.poller.fast, main );
 		#endif
 
 		// Create the poller thread
+		__cfadbg_print_safe(io, "Kernel I/O : Creating slow poller for cluster %p\n", &this);
 		this.io.poller.slow.stack = __create_pthread( &this.io.poller.slow.kthrd, __io_poller_slow, &this );
 	}
 
 	void __kernel_io_prepare_stop( cluster & this ) {
+		__cfadbg_print_safe(io, "Kernel I/O : Stopping pollers for cluster %p\n", &this);
 		// Notify the poller thread of the shutdown
 		__atomic_store_n(&this.io.done, true, __ATOMIC_SEQ_CST);
@@ -233,4 +238,6 @@
 		free( this.io.poller.slow.stack );
 
+		__cfadbg_print_safe(io, "Kernel I/O : Slow poller stopped for cluster %p\n", &this);
+
 		#if defined(__CFA_IO_POLLING_USER__)
 			// unpark the fast io_poller
@@ -238,4 +245,6 @@
 
 			^(this.io.poller.fast){};
+
+			__cfadbg_print_safe(io, "Kernel I/O : Fast poller stopped for cluster %p\n", &this);
 		#endif
 	}
@@ -324,5 +333,5 @@
 
 			struct io_user_data * data = (struct io_user_data *)cqe.user_data;
-			// __cfaabi_bits_print_safe( STDERR_FILENO, "Performed reading io cqe %p, result %d for %p\n", data, cqe.res, data->thrd );
+			__cfadbg_print_safe( io, "Kernel I/O : Performed reading io cqe %p, result %d for %p\n", data, cqe.res, data->thrd );
 
 			data->result = cqe.res;
@@ -369,4 +378,5 @@
 				int count = __drain_io( ring, &mask, 1, true );
 				if(count > 0) {
+					__cfadbg_print_safe(io, "Kernel I/O : Moving ring %p to fast poller\n", &ring);
 					__unpark( &ring.poller.fast.thrd __cfaabi_dbg_ctx2 );
 					wait( ring.poller.sem );
@@ -398,4 +408,5 @@
 				else {
 					// We didn't get anything baton pass to the slow poller
+					__cfadbg_print_safe(io, "Kernel I/O : Moving ring %p to slow poller\n", &this.ring);
 					post( this.ring->poller.sem );
 					park( __cfaabi_dbg_ctx );
@@ -464,5 +475,4 @@
 		// Submit however, many entries need to be submitted
 		int ret = syscall( __NR_io_uring_enter, ring.fd, 1, 0, 0, 0p, 0);
-		// __cfaabi_bits_print_safe( STDERR_FILENO, "Performed io_submit, returned %d\n", ret );
 		if( ret < 0 ) {
 			switch((int)errno) {
@@ -481,4 +491,5 @@
 		// Make sure that idx was submitted
 		// Be careful to not get false positive if we cycled the entire list or that someone else submitted for us
+		__cfadbg_print_safe( io, "Kernel I/O : Performed io_submit for %p, returned %d\n", active_thread(), ret );
 	}
 
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision f66605204cbdef9351e5c18bbb41c70d7643bbbb)
+++ libcfa/src/concurrency/kernel.cfa	(revision 4069faade93e68df42b8c7af24253b6a041b469b)
@@ -15,4 +15,5 @@
 
 #define __cforall_thread__
+// #define __CFA_DEBUG_PRINT_RUNTIME_CORE__
 
 //C Includes
@@ -40,4 +41,5 @@
 #include "invoke.h"
 
+
 //-----------------------------------------------------------------------------
 // Some assembly required
@@ -230,14 +232,14 @@
 	idle{};
 
-	__cfaabi_dbg_print_safe("Kernel : Starting core %p\n", &this);
+	__cfadbg_print_safe(runtime_core, "Kernel : Starting core %p\n", &this);
 
 	this.stack = __create_pthread( &this.kernel_thread, __invoke_processor, (void *)&this );
 
-	__cfaabi_dbg_print_safe("Kernel : core %p started\n", &this);
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p created\n", &this);
 }
 
 void ^?{}(processor & this) with( this ){
 	if( ! __atomic_load_n(&do_terminate, __ATOMIC_ACQUIRE) ) {
-		__cfaabi_dbg_print_safe("Kernel : core %p signaling termination\n", &this);
+		__cfadbg_print_safe(runtime_core, "Kernel : core %p signaling termination\n", &this);
 
 		__atomic_store_n(&do_terminate, true, __ATOMIC_RELAXED);
@@ -289,5 +291,5 @@
 	verify(this);
 
-	__cfaabi_dbg_print_safe("Kernel : core %p starting\n", this);
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p starting\n", this);
 
 	doregister(this->cltr, this);
@@ -297,5 +299,5 @@
 		preemption_scope scope = { this };
 
-		__cfaabi_dbg_print_safe("Kernel : core %p started\n", this);
+		__cfadbg_print_safe(runtime_core, "Kernel : core %p started\n", this);
 
 		$thread * readyThread = 0p;
@@ -323,5 +325,5 @@
 		}
 
-		__cfaabi_dbg_print_safe("Kernel : core %p stopping\n", this);
+		__cfadbg_print_safe(runtime_core, "Kernel : core %p stopping\n", this);
 	}
 
@@ -330,5 +332,5 @@
 	V( this->terminated );
 
-	__cfaabi_dbg_print_safe("Kernel : core %p terminated\n", this);
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p terminated\n", this);
 
 	// HACK : the coroutine context switch expects this_thread to be set
@@ -475,5 +477,5 @@
 
 	//We now have a proper context from which to schedule threads
-	__cfaabi_dbg_print_safe("Kernel : core %p created (%p, %p)\n", proc, &proc->runner, &ctx);
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p created (%p, %p)\n", proc, &proc->runner, &ctx);
 
 	// SKULLDUGGERY: Since the coroutine doesn't have its own stack, we can't
@@ -486,5 +488,5 @@
 
 	// Main routine of the core returned, the core is now fully terminated
-	__cfaabi_dbg_print_safe("Kernel : core %p main ended (%p)\n", proc, &proc->runner);
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p main ended (%p)\n", proc, &proc->runner);
 
 	return 0p;
@@ -717,5 +719,5 @@
 static void __kernel_startup(void) {
 	verify( ! kernelTLS.preemption_state.enabled );
-	__cfaabi_dbg_print_safe("Kernel : Starting\n");
+	__cfadbg_print_safe(runtime_core, "Kernel : Starting\n");
 
 	__page_size = sysconf( _SC_PAGESIZE );
@@ -728,5 +730,5 @@
 	(*mainCluster){"Main Cluster"};
 
-	__cfaabi_dbg_print_safe("Kernel : Main cluster ready\n");
+	__cfadbg_print_safe(runtime_core, "Kernel : Main cluster ready\n");
 
 	// Start by initializing the main thread
@@ -738,5 +740,5 @@
 	(*mainThread){ &info };
 
-	__cfaabi_dbg_print_safe("Kernel : Main thread ready\n");
+	__cfadbg_print_safe(runtime_core, "Kernel : Main thread ready\n");
 
 
@@ -759,5 +761,5 @@
 
 		runner{ &this };
-		__cfaabi_dbg_print_safe("Kernel : constructed main processor context %p\n", &runner);
+		__cfadbg_print_safe(runtime_core, "Kernel : constructed main processor context %p\n", &runner);
 	}
 
@@ -834,5 +836,5 @@
 	^(__cfa_dbg_global_clusters.lock){};
 
-	__cfaabi_dbg_print_safe("Kernel : Shutdown complete\n");
+	__cfadbg_print_safe(runtime_core, "Kernel : Shutdown complete\n");
 }
 
@@ -859,9 +861,9 @@
 
 	// We are ready to sleep
-	__cfaabi_dbg_print_safe("Kernel : Processor %p ready to sleep\n", this);
+	__cfadbg_print_safe(runtime_core, "Kernel : Processor %p ready to sleep\n", this);
 	wait( idle );
 
 	// We have woken up
-	__cfaabi_dbg_print_safe("Kernel : Processor %p woke up and ready to run\n", this);
+	__cfadbg_print_safe(runtime_core, "Kernel : Processor %p woke up and ready to run\n", this);
 
 	// Get ourself off the idle list
@@ -879,5 +881,5 @@
 static bool __wake_one(cluster * this, __attribute__((unused)) bool force) {
 	// if we don't want to force check if we know it's false
-	if( !this->idles.head && !force ) return false;
+	// if( !this->idles.head && !force ) return false;
 
 	// First, lock the cluster idle
@@ -892,4 +894,5 @@
 
 	// Wake them up
+	__cfadbg_print_safe(runtime_core, "Kernel : waking Processor %p\n", this->idles.head);
 	post( this->idles.head->idle );
 
@@ -901,4 +904,5 @@
 // Unconditionnaly wake a thread
 static bool __wake_proc(processor * this) {
+	__cfadbg_print_safe(runtime_core, "Kernel : waking Processor %p\n", this);
 	return post( this->idle );
 }
