Index: benchmark/io/readv.cfa
===================================================================
--- benchmark/io/readv.cfa	(revision 9987d798e132e92c92b98630dd5f1a7de52adbd4)
+++ benchmark/io/readv.cfa	(revision 4069faade93e68df42b8c7af24253b6a041b469b)
@@ -30,5 +30,19 @@
 unsigned long int buflen = 50;
 
+cluster * the_cluster;
+
 thread Reader {};
+void ?{}( Reader & this ) {	// no-argument constructor: every Reader is built on *the_cluster
+	((thread&)this){ "Reader Thread", *the_cluster };	// the_cluster is dereferenced here, so it must point at a live cluster before any Reader is constructed
+}
+
+struct my_processor {	// wrapper whose only member is a processor; presumably exists so an array of them can be built with the no-argument constructor - confirm
+	processor p;
+};
+
+void ?{}( my_processor & this ) {	// no-argument constructor for the wrapper
+	(this.p){ "I/O Processor", *the_cluster };	// builds the wrapped processor, named "I/O Processor", on *the_cluster; the_cluster must already be set
+}
+
 void main( Reader & ) {
 	while(!__atomic_load_n(&run, __ATOMIC_RELAXED)) yield();
@@ -38,5 +52,7 @@
 
 	while(__atomic_load_n(&run, __ATOMIC_RELAXED)) {
-		cfa_preadv2(fd, &iov, 1, 0, 0);
+		int r = cfa_preadv2(fd, &iov, 1, 0, 0);	// a negative result is treated as a negated errno value
+		if(r < 0) abort("%s", strerror(-r));	// fixed "%s" format: the strerror() text must never be interpreted as a format string
+
 		__atomic_fetch_add( &count, 1, __ATOMIC_SEQ_CST );
 	}
@@ -44,8 +60,4 @@
 
 int main(int argc, char * argv[]) {
-	#if !defined(__CFA_NO_STATISTICS__)
-		print_stats_at_exit( *active_cluster() );
-	#endif
-
 	double duration   = 5.0;
 	unsigned long int nthreads = 2;
@@ -117,5 +129,5 @@
 	}
 
-	int fd = open(__FILE__, 0);
+	fd = open(__FILE__, 0);
 	if(fd < 0) {
 		fprintf(stderr, "Could not open source file\n");
@@ -125,26 +137,33 @@
 	printf("Running %lu threads over %lu processors for %lf seconds\n", nthreads, nprocs, duration);
 
-	Time start, end;
 	{
-		processor procs[nprocs - 1];
+		Time start, end;
+		cluster cl = { "IO Cluster" };
+		the_cluster = &cl;
+		#if !defined(__CFA_NO_STATISTICS__)
+			print_stats_at_exit( cl );
+		#endif
 		{
-			Reader threads[nthreads];
+			my_processor procs[nprocs];
+			{
+				Reader threads[nthreads];
 
-			printf("Starting\n");
-			start = getTime();
-			run = true;
-			do {
-				sleep(500`ms);
+				printf("Starting\n");
+				start = getTime();
+				run = true;
+				do {
+					sleep(500`ms);
+					end = getTime();
+				} while( (end - start) < duration`s );
+				run = false;
 				end = getTime();
-			} while( (end - start) < duration`s );
-			run = false;
-			end = getTime();
+				printf("Done\n");
+			}
 		}
+		printf("Took %ld ms\n", (end - start)`ms);
+		printf("Total reads:      %'zu\n", count);
+		printf("Reads per second: %'lf\n", ((double)count) / (end - start)`s);
 	}
-	printf("Took %ld ms\n", (end - start)`ms);
-	printf("Total reads:      %'zu\n", count);
-	printf("Reads per second: %'lf\n", ((double)count) / (end - start)`s);
 
 	close(fd);
-	printf("Done\n");
 }
Index: libcfa/prelude/defines.hfa.in
===================================================================
--- libcfa/prelude/defines.hfa.in	(revision 9987d798e132e92c92b98630dd5f1a7de52adbd4)
+++ libcfa/prelude/defines.hfa.in	(revision 4069faade93e68df42b8c7af24253b6a041b469b)
@@ -16,4 +16,4 @@
 #undef HAVE_LINUX_IO_URING_H
 
-#define __CFA_IO_POLLING_USER__
+// #define __CFA_IO_POLLING_USER__
 // #define __CFA_IO_POLLING_KERNEL__
Index: libcfa/src/bits/locks.hfa
===================================================================
--- libcfa/src/bits/locks.hfa	(revision 9987d798e132e92c92b98630dd5f1a7de52adbd4)
+++ libcfa/src/bits/locks.hfa	(revision 4069faade93e68df42b8c7af24253b6a041b469b)
@@ -113,13 +113,13 @@
 
 	struct __bin_sem_t {
-		bool     		signaled;
 		pthread_mutex_t 	lock;
 		pthread_cond_t  	cond;
+		int     		val;	// pending-post count, accessed under lock: post() raises it only while it is below 1, wait() blocks while it is below 1 and then decrements it
 	};
 
 	static inline void ?{}(__bin_sem_t & this) with( this ) {
-		signaled = false;
 		pthread_mutex_init(&lock, NULL);
 		pthread_cond_init (&cond, NULL);
+		val = 0;	// no post pending yet, so the first wait() blocks until a post() arrives
 	}
 
@@ -132,18 +132,21 @@
 		verify(__cfaabi_dbg_in_kernel());
 		pthread_mutex_lock(&lock);
-			if(!signaled) {   // this must be a loop, not if!
+			while(val < 1) {
 				pthread_cond_wait(&cond, &lock);
 			}
-			signaled = false;
+			val -= 1;
 		pthread_mutex_unlock(&lock);
 	}
 
 	static inline bool post(__bin_sem_t & this) with( this ) {
+		bool needs_signal = false;
+
 		pthread_mutex_lock(&lock);
-			bool needs_signal = !signaled;
-			signaled = true;
+			if(val < 1) {
+				val += 1;
+				pthread_cond_signal(&cond);
+				needs_signal = true;
+			}
 		pthread_mutex_unlock(&lock);
-
-		if (needs_signal) pthread_cond_signal(&cond);
 
 		return needs_signal;
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision 9987d798e132e92c92b98630dd5f1a7de52adbd4)
+++ libcfa/src/concurrency/io.cfa	(revision 4069faade93e68df42b8c7af24253b6a041b469b)
@@ -14,4 +14,6 @@
 //
 
+// #define __CFA_DEBUG_PRINT_IO__
+
 #include "kernel.hfa"
 
@@ -210,13 +212,16 @@
 	void __kernel_io_finish_start( cluster & this ) {
 		#if defined(__CFA_IO_POLLING_USER__)
-			(this.io.poller.fast){ this };
+			__cfadbg_print_safe(io, "Kernel I/O : Creating fast poller for cluster %p\n", &this);
+			(this.io.poller.fast){ "Fast IO Poller", this };	// only built when __CFA_IO_POLLING_USER__ is defined; started by __thrd_start just below
 			__thrd_start( this.io.poller.fast, main );
 		#endif
 
 		// Create the poller thread
+		__cfadbg_print_safe(io, "Kernel I/O : Creating slow poller for cluster %p\n", &this);
 		this.io.poller.slow.stack = __create_pthread( &this.io.poller.slow.kthrd, __io_poller_slow, &this );
 	}
 
 	void __kernel_io_prepare_stop( cluster & this ) {
+		__cfadbg_print_safe(io, "Kernel I/O : Stopping pollers for cluster %p\n", &this);	// %p added: &this was passed but never printed
 		// Notify the poller thread of the shutdown
 		__atomic_store_n(&this.io.done, true, __ATOMIC_SEQ_CST);
@@ -233,4 +238,6 @@
 		free( this.io.poller.slow.stack );
 
+		__cfadbg_print_safe(io, "Kernel I/O : Slow poller stopped for cluster %p\n", &this);	// %p added: &this was passed but never printed
+
 		#if defined(__CFA_IO_POLLING_USER__)
 			// unpark the fast io_poller
@@ -238,4 +245,6 @@
 
 			^(this.io.poller.fast){};
+
+			__cfadbg_print_safe(io, "Kernel I/O : Fast poller stopped for cluster %p\n", &this);	// %p added: &this was passed but never printed
 		#endif
 	}
@@ -324,5 +333,5 @@
 
 			struct io_user_data * data = (struct io_user_data *)cqe.user_data;
-			// __cfaabi_bits_print_safe( STDERR_FILENO, "Performed reading io cqe %p, result %d for %p\n", data, cqe.res, data->thrd );
+			__cfadbg_print_safe( io, "Kernel I/O : Performed reading io cqe %p, result %d for %p\n", data, cqe.res, data->thrd );
 
 			data->result = cqe.res;
@@ -369,4 +378,5 @@
 				int count = __drain_io( ring, &mask, 1, true );
 				if(count > 0) {
+					__cfadbg_print_safe(io, "Kernel I/O : Moving to ring %p to fast poller\n", &ring);
 					__unpark( &ring.poller.fast.thrd __cfaabi_dbg_ctx2 );
 					wait( ring.poller.sem );
@@ -398,4 +408,5 @@
 				else {
 					// We didn't get anything baton pass to the slow poller
+					__cfadbg_print_safe(io, "Kernel I/O : Moving to ring %p to slow poller\n", this.ring);	// this.ring is already a pointer (see the -> on the next line); &this.ring would print the address of the pointer field instead
 					post( this.ring->poller.sem );
 					park( __cfaabi_dbg_ctx );
@@ -464,5 +475,4 @@
 		// Submit however, many entries need to be submitted
 		int ret = syscall( __NR_io_uring_enter, ring.fd, 1, 0, 0, 0p, 0);
-		// __cfaabi_bits_print_safe( STDERR_FILENO, "Performed io_submit, returned %d\n", ret );
 		if( ret < 0 ) {
 			switch((int)errno) {
@@ -481,4 +491,5 @@
 		// Make sure that idx was submitted
 		// Be careful to not get false positive if we cycled the entire list or that someone else submitted for us
+		__cfadbg_print_safe( io, "Kernel I/O : Performed io_submit for %p, returned %d\n", active_thread(), ret );
 	}
 
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 9987d798e132e92c92b98630dd5f1a7de52adbd4)
+++ libcfa/src/concurrency/kernel.cfa	(revision 4069faade93e68df42b8c7af24253b6a041b469b)
@@ -15,4 +15,5 @@
 
 #define __cforall_thread__
+// #define __CFA_DEBUG_PRINT_RUNTIME_CORE__
 
 //C Includes
@@ -40,4 +41,5 @@
 #include "invoke.h"
 
+
 //-----------------------------------------------------------------------------
 // Some assembly required
@@ -230,14 +232,14 @@
 	idle{};
 
-	__cfaabi_dbg_print_safe("Kernel : Starting core %p\n", &this);
+	__cfadbg_print_safe(runtime_core, "Kernel : Starting core %p\n", &this);
 
 	this.stack = __create_pthread( &this.kernel_thread, __invoke_processor, (void *)&this );
 
-	__cfaabi_dbg_print_safe("Kernel : core %p started\n", &this);
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p created\n", &this);
 }
 
 void ^?{}(processor & this) with( this ){
 	if( ! __atomic_load_n(&do_terminate, __ATOMIC_ACQUIRE) ) {
-		__cfaabi_dbg_print_safe("Kernel : core %p signaling termination\n", &this);
+		__cfadbg_print_safe(runtime_core, "Kernel : core %p signaling termination\n", &this);
 
 		__atomic_store_n(&do_terminate, true, __ATOMIC_RELAXED);
@@ -289,5 +291,5 @@
 	verify(this);
 
-	__cfaabi_dbg_print_safe("Kernel : core %p starting\n", this);
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p starting\n", this);
 
 	doregister(this->cltr, this);
@@ -297,5 +299,5 @@
 		preemption_scope scope = { this };
 
-		__cfaabi_dbg_print_safe("Kernel : core %p started\n", this);
+		__cfadbg_print_safe(runtime_core, "Kernel : core %p started\n", this);
 
 		$thread * readyThread = 0p;
@@ -323,5 +325,5 @@
 		}
 
-		__cfaabi_dbg_print_safe("Kernel : core %p stopping\n", this);
+		__cfadbg_print_safe(runtime_core, "Kernel : core %p stopping\n", this);
 	}
 
@@ -330,5 +332,5 @@
 	V( this->terminated );
 
-	__cfaabi_dbg_print_safe("Kernel : core %p terminated\n", this);
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p terminated\n", this);
 
 	// HACK : the coroutine context switch expects this_thread to be set
@@ -475,5 +477,5 @@
 
 	//We now have a proper context from which to schedule threads
-	__cfaabi_dbg_print_safe("Kernel : core %p created (%p, %p)\n", proc, &proc->runner, &ctx);
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p created (%p, %p)\n", proc, &proc->runner, &ctx);
 
 	// SKULLDUGGERY: Since the coroutine doesn't have its own stack, we can't
@@ -486,5 +488,5 @@
 
 	// Main routine of the core returned, the core is now fully terminated
-	__cfaabi_dbg_print_safe("Kernel : core %p main ended (%p)\n", proc, &proc->runner);
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p main ended (%p)\n", proc, &proc->runner);
 
 	return 0p;
@@ -717,5 +719,5 @@
 static void __kernel_startup(void) {
 	verify( ! kernelTLS.preemption_state.enabled );
-	__cfaabi_dbg_print_safe("Kernel : Starting\n");
+	__cfadbg_print_safe(runtime_core, "Kernel : Starting\n");
 
 	__page_size = sysconf( _SC_PAGESIZE );
@@ -728,5 +730,5 @@
 	(*mainCluster){"Main Cluster"};
 
-	__cfaabi_dbg_print_safe("Kernel : Main cluster ready\n");
+	__cfadbg_print_safe(runtime_core, "Kernel : Main cluster ready\n");
 
 	// Start by initializing the main thread
@@ -738,5 +740,5 @@
 	(*mainThread){ &info };
 
-	__cfaabi_dbg_print_safe("Kernel : Main thread ready\n");
+	__cfadbg_print_safe(runtime_core, "Kernel : Main thread ready\n");
 
 
@@ -759,5 +761,5 @@
 
 		runner{ &this };
-		__cfaabi_dbg_print_safe("Kernel : constructed main processor context %p\n", &runner);
+		__cfadbg_print_safe(runtime_core, "Kernel : constructed main processor context %p\n", &runner);
 	}
 
@@ -834,5 +836,5 @@
 	^(__cfa_dbg_global_clusters.lock){};
 
-	__cfaabi_dbg_print_safe("Kernel : Shutdown complete\n");
+	__cfadbg_print_safe(runtime_core, "Kernel : Shutdown complete\n");
 }
 
@@ -859,9 +861,9 @@
 
 	// We are ready to sleep
-	__cfaabi_dbg_print_safe("Kernel : Processor %p ready to sleep\n", this);
+	__cfadbg_print_safe(runtime_core, "Kernel : Processor %p ready to sleep\n", this);
 	wait( idle );
 
 	// We have woken up
-	__cfaabi_dbg_print_safe("Kernel : Processor %p woke up and ready to run\n", this);
+	__cfadbg_print_safe(runtime_core, "Kernel : Processor %p woke up and ready to run\n", this);
 
 	// Get ourself off the idle list
@@ -879,5 +881,5 @@
 static bool __wake_one(cluster * this, __attribute__((unused)) bool force) {
 	// if we don't want to force check if we know it's false
-	if( !this->idles.head && !force ) return false;
+	// if( !this->idles.head && !force ) return false;
 
 	// First, lock the cluster idle
@@ -892,4 +894,5 @@
 
 	// Wake them up
+	__cfadbg_print_safe(runtime_core, "Kernel : waking Processor %p\n", this->idles.head);
 	post( this->idles.head->idle );
 
@@ -901,4 +904,5 @@
 // Unconditionnaly wake a thread
 static bool __wake_proc(processor * this) {
+	__cfadbg_print_safe(runtime_core, "Kernel : waking Processor %p\n", this);	// logs which processor is about to be posted; the wake-up itself is the post() below
 	return post( this->idle );
 }
