Index: libcfa/src/bits/defs.hfa
===================================================================
--- libcfa/src/bits/defs.hfa	(revision 6047b008bf08ee8a70f0827c3ab59cb8fa9f4d24)
+++ libcfa/src/bits/defs.hfa	(revision dddb3dd01058a6f0eb621887f1bc9675d616a840)
@@ -74,2 +74,4 @@
 	#error unsupported architecture
 #endif
+
+#define CFA_IO_LAZY (1_l64u << 32_l64u)
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision 6047b008bf08ee8a70f0827c3ab59cb8fa9f4d24)
+++ libcfa/src/concurrency/io.cfa	(revision dddb3dd01058a6f0eb621887f1bc9675d616a840)
@@ -32,4 +32,5 @@
 	extern "C" {
 		#include <sys/syscall.h>
+		#include <sys/eventfd.h>
 
 		#include <linux/io_uring.h>
@@ -79,45 +80,8 @@
 	};
 
-//=============================================================================================
-// I/O Syscall
-//=============================================================================================
-	static int __io_uring_enter( struct $io_context & ctx, unsigned to_submit, bool get ) {
-		__STATS__( false, io.calls.count++; )
-		bool need_sys_to_submit = false;
-		bool need_sys_to_complete = false;
-		unsigned flags = 0;
-
-		TO_SUBMIT:
-		if( to_submit > 0 ) {
-			if( !(ctx.ring_flags & IORING_SETUP_SQPOLL) ) {
-				need_sys_to_submit = true;
-				break TO_SUBMIT;
-			}
-			if( (*ctx.sq.flags) & IORING_SQ_NEED_WAKEUP ) {
-				need_sys_to_submit = true;
-				flags |= IORING_ENTER_SQ_WAKEUP;
-			}
-		}
-
-		if( get && !(ctx.ring_flags & IORING_SETUP_SQPOLL) ) {
-			flags |= IORING_ENTER_GETEVENTS;
-			if( (ctx.ring_flags & IORING_SETUP_IOPOLL) ) {
-				need_sys_to_complete = true;
-			}
-		}
-
-		int ret = 0;
-		if( need_sys_to_submit || need_sys_to_complete ) {
-			__cfadbg_print_safe(io_core, "Kernel I/O : IO_URING enter %d %u %u\n", ctx.fd, to_submit, flags);
-			__STATS__( false, io.calls.blocks++; )
-			ret = syscall( __NR_io_uring_enter, ctx.fd, to_submit, 0, flags, (sigset_t *)0p, _NSIG / 8);
-			__cfadbg_print_safe(io_core, "Kernel I/O : IO_URING %d returned %d\n", ctx.fd, ret);
-		}
-
-		// Memory barrier
-		__atomic_thread_fence( __ATOMIC_SEQ_CST );
-		return ret;
-	}
-
+	static $io_context * __ioarbiter_allocate( $io_arbiter & mutex this, processor *, __u32 idxs[], __u32 want );
+	static void __ioarbiter_submit( $io_arbiter & mutex this, $io_context * , __u32 idxs[], __u32 have, bool lazy );
+	static void __ioarbiter_flush ( $io_arbiter & mutex this, $io_context * );
+	static inline void __ioarbiter_notify( $io_context & ctx );
 //=============================================================================================
 // I/O Polling
@@ -126,7 +90,54 @@
 	static inline __u32 __release_sqes( struct $io_context & );
 
-	static bool __drain_io( struct  $io_context & ctx ) {
-		unsigned to_submit = __flush( ctx );
-		int ret = __io_uring_enter( ctx, to_submit, true );
+	void __cfa_io_drain( processor * proc ) {	// fulfil the future attached to every completion queued on proc's ring, then advance the completion-queue head
+		/* paranoid */ verify( ! __preemption_enabled() );
+		/* paranoid */ verify( proc );
+		/* paranoid */ verify( proc->io.ctx );
+
+		// Snapshot the completion queue indexes; everything in [head, tail) is processed below
+		$io_context * ctx = proc->io.ctx;
+		unsigned head = *ctx->cq.head;
+		unsigned tail = *ctx->cq.tail;
+		const __u32 mask = *ctx->cq.mask;
+
+		__u32 count = tail - head;	// number of completions between the two snapshots (may be zero)
+		__STATS__( false, io.calls.drain++; io.calls.completed += count; )
+
+		for(i; count) {
+			unsigned idx = (head + i) & mask;	// mask maps the running index onto a slot of the cqes array
+			volatile struct io_uring_cqe & cqe = ctx->cq.cqes[idx];
+
+			/* paranoid */ verify(&cqe);
+
+			struct io_future_t * future = (struct io_future_t *)(uintptr_t)cqe.user_data;	// user_data carries the pointer to the future waiting on this operation
+			__cfadbg_print_safe( io, "Kernel I/O : Syscall completed : cqe %p, result %d for %p\n", &cqe, cqe.res, future );
+
+			fulfil( *future, cqe.res );	// hand the operation's result to the future
+		}
+
+		__cfadbg_print_safe(io, "Kernel I/O : %u completed\n", count);
+
+		// Tell the kernel the CQEs have been consumed by advancing the head past them.
+		// Ensure that the kernel only sees the new value of the head index after the CQEs have been read.
+		__atomic_store_n( ctx->cq.head, head + count, __ATOMIC_SEQ_CST );
+
+		/* paranoid */ verify( ! __preemption_enabled() );
+
+		return;
+	}
+
+	void __cfa_io_flush( processor * proc ) {
+		/* paranoid */ verify( ! __preemption_enabled() );
+		/* paranoid */ verify( proc );
+		/* paranoid */ verify( proc->io.ctx );
+
+		$io_context & ctx = *proc->io.ctx;
+
+		if(!ctx.ext_sq.empty) {
+			__ioarbiter_flush( *ctx.arbiter, &ctx );
+		}
+
+		__STATS__( true, io.calls.flush++; )
+		int ret = syscall( __NR_io_uring_enter, ctx.fd, ctx.sq.to_submit, 0, 0, (sigset_t *)0p, _NSIG / 8);
 		if( ret < 0 ) {
 			switch((int)errno) {
@@ -136,6 +147,5 @@
 				// Update statistics
 				__STATS__( false, io.calls.errors.busy ++; )
-				return true;
-				break;
+				return;
 			default:
 				abort( "KERNEL ERROR: IO_URING SYSCALL - (%d) %s\n", (int)errno, strerror(errno) );
@@ -143,106 +153,19 @@
 		}
 
-		// update statistics
-		if (to_submit > 0) {
-			__STATS__( false, io.calls.submitted += ret; )
-			/* paranoid */ verify( ctx.sq.to_submit <= *ctx.sq.num );
-
-			/* paranoid */ verify( ctx.sq.to_submit >= ret );
-			ctx.sq.to_submit -= ret;
-
-			/* paranoid */ verify( ctx.sq.to_submit <= *ctx.sq.num );
-
-			if(ret) {
-				__cfadbg_print_safe(io, "Kernel I/O : %u submitted to io_uring\n", ret);
-			}
-		}
+		__cfadbg_print_safe(io, "Kernel I/O : %u submitted to io_uring %d\n", ret, ctx.fd);
+		__STATS__( true, io.calls.submitted += ret; )
+		/* paranoid */ verify( ctx.sq.to_submit <= *ctx.sq.num );
+		/* paranoid */ verify( ctx.sq.to_submit >= ret );
+
+		ctx.sq.to_submit -= ret;
+
+		/* paranoid */ verify( ctx.sq.to_submit <= *ctx.sq.num );
 
 		// Release the consumed SQEs
 		__release_sqes( ctx );
 
-		// Drain the queue
-		unsigned head = *ctx.cq.head;
-		unsigned tail = *ctx.cq.tail;
-		const __u32 mask = *ctx.cq.mask;
-
-		// Nothing was new return 0
-		if (head == tail) {
-			return ctx.sq.to_submit > 0;
-		}
-
-		__u32 count = tail - head;
-		/* paranoid */ verify( count != 0 );
-		__STATS__( false, io.calls.completed += count; )
-
-		for(i; count) {
-			unsigned idx = (head + i) & mask;
-			volatile struct io_uring_cqe & cqe = ctx.cq.cqes[idx];
-
-			/* paranoid */ verify(&cqe);
-
-			struct io_future_t * future = (struct io_future_t *)(uintptr_t)cqe.user_data;
-			__cfadbg_print_safe( io, "Kernel I/O : Syscall completed : cqe %p, result %d for %p\n", &cqe, cqe.res, future );
-
-			fulfil( *future, cqe.res );
-		}
-
-		if(count) {
-			__cfadbg_print_safe(io, "Kernel I/O : %u completed\n", count);
-		}
-
-		// Mark to the kernel that the cqe has been seen
-		// Ensure that the kernel only sees the new value of the head index after the CQEs have been read.
-		__atomic_store_n( ctx.cq.head, head + count, __ATOMIC_SEQ_CST );
-
-		return count > 0 || to_submit > 0;
-	}
-
-	void main( $io_context & this ) {
-		__cfadbg_print_safe(io_core, "Kernel I/O : IO poller %d (%p) ready\n", this.fd, &this);
-
-		const int reset_cnt = 5;
-		int reset = reset_cnt;
-		// Then loop until we need to start
-		LOOP:
-		while() {
-			waitfor( ^?{} : this) {
-				break LOOP;
-			}
-			or else {}
-
-			// Drain the io
-			bool again = __drain_io( this );
-
-			if(!again) reset--;
-
-			// If we got something, just yield and check again
-			if(reset > 1) {
-				yield();
-				continue LOOP;
-			}
-
-			// We alread failed to find completed entries a few time.
-			if(reset == 1) {
-				// Rearm the context so it can block
-				// but don't block right away
-				// we need to retry one last time in case
-				// something completed *just now*
-				__ioctx_prepare_block( this );
-				continue LOOP;
-			}
-
-			__STATS__( false,
-				io.poller.sleeps += 1;
-			)
-			__cfadbg_print_safe(io_core, "Kernel I/O : Parking io poller %d (%p)\n", this.fd, &this);
-
-			// block this thread
-			wait( this.sem );
-
-			// restore counter
-			reset = reset_cnt;
-		}
-
-		__cfadbg_print_safe(io_core, "Kernel I/O : Fast poller %d (%p) stopping\n", this.fd, &this);
+		/* paranoid */ verify( ! __preemption_enabled() );
+
+		ctx.proc->io.pending = false;
 	}
 
@@ -266,10 +189,4 @@
 //         head and tail must be fully filled and shouldn't ever be touched again.
 //
-
-	static $io_context * __ioarbiter_allocate( $io_arbiter & mutex this, processor *, __u32 idxs[], __u32 want );
-	static void __ioarbiter_submit  ( $io_arbiter & mutex this, $io_context * , __u32 idxs[], __u32 have );
-	static void __ioarbiter_flush   ( $io_arbiter & mutex this, $io_context * );
-	static inline void __ioarbiter_notify( $io_context & ctx );
-
 	//=============================================================================================
 	// Allocation
@@ -278,4 +195,5 @@
 		struct io_uring_sqe * sqes = ctx->sq.sqes;
 		for(i; want) {
+			__cfadbg_print_safe(io, "Kernel I/O : filling loop\n");
 			out_sqes[i] = &sqes[idxs[i]];
 		}
@@ -295,4 +213,5 @@
 		// copy all the indexes we want from the available list
 		for(i; want) {
+			__cfadbg_print_safe(io, "Kernel I/O : allocating loop\n");
 			idxs[i] = sq.free_ring.array[(fhead + i) & mask];
 		}
@@ -315,45 +234,36 @@
 		disable_interrupts();
 		processor * proc = __cfaabi_tls.this_processor;
+		$io_context * ctx = proc->io.ctx;
 		/* paranoid */ verify( __cfaabi_tls.this_processor );
-		/* paranoid */ verify( proc->io.lock == false );
-
-		__atomic_store_n( &proc->io.lock, true, __ATOMIC_SEQ_CST );
-		$io_context * ctx = proc->io.ctx;
+		/* paranoid */ verify( ctx );
+
+		__cfadbg_print_safe(io, "Kernel I/O : attempting to fast allocation\n");
+
+		// We can proceed to the fast path
+		if( __alloc(ctx, idxs, want) ) {
+			// Allocation was successful
+			__STATS__( true, io.alloc.fast += 1; )
+			enable_interrupts( __cfaabi_dbg_ctx );
+
+			__cfadbg_print_safe(io, "Kernel I/O : fast allocation successful from ring %d\n", ctx->fd);
+
+			__fill( sqes, want, idxs, ctx );
+			return ctx;
+		}
+		// The fast path failed, fallback
+		__STATS__( true, io.alloc.fail += 1; )
+
+		// Fast path failed, fallback on arbitration
+		__STATS__( true, io.alloc.slow += 1; )
+		enable_interrupts( __cfaabi_dbg_ctx );
+
 		$io_arbiter * ioarb = proc->cltr->io.arbiter;
 		/* paranoid */ verify( ioarb );
 
-		// Can we proceed to the fast path
-		if(  ctx				// We alreay have an instance?
-		&&  !ctx->revoked )		// Our instance is still valid?
-		{
-			__cfadbg_print_safe(io, "Kernel I/O : attempting to fast allocation\n");
-
-			// We can proceed to the fast path
-			if( __alloc(ctx, idxs, want) ) {
-				// Allocation was successful
-				// Mark the instance as no longer in-use and re-enable interrupts
-				__atomic_store_n( &proc->io.lock, false, __ATOMIC_RELEASE );
-				__STATS__( true, io.alloc.fast += 1; )
-				enable_interrupts( __cfaabi_dbg_ctx );
-
-				__cfadbg_print_safe(io, "Kernel I/O : fast allocation successful\n");
-
-				__fill( sqes, want, idxs, ctx );
-				return ctx;
-			}
-			// The fast path failed, fallback
-			__STATS__( true, io.alloc.fail += 1; )
-		}
-
-		// Fast path failed, fallback on arbitration
-		__atomic_store_n( &proc->io.lock, false, __ATOMIC_RELEASE );
-		__STATS__( true, io.alloc.slow += 1; )
-		enable_interrupts( __cfaabi_dbg_ctx );
-
 		__cfadbg_print_safe(io, "Kernel I/O : falling back on arbiter for allocation\n");
 
 		struct $io_context * ret = __ioarbiter_allocate(*ioarb, proc, idxs, want);
 
-		__cfadbg_print_safe(io, "Kernel I/O : slow allocation completed\n");
+		__cfadbg_print_safe(io, "Kernel I/O : slow allocation completed from ring %d\n", ret->fd);
 
 		__fill( sqes, want, idxs,ret );
@@ -364,44 +274,43 @@
 	//=============================================================================================
 	// submission
-	static inline void __submit( struct $io_context * ctx, __u32 idxs[], __u32 have) {
+	static inline void __submit( struct $io_context * ctx, __u32 idxs[], __u32 have, bool lazy) {
 		// We can proceed to the fast path
 		// Get the right objects
 		__sub_ring_t & sq = ctx->sq;
 		const __u32 mask  = *sq.mask;
-		__u32 tail = sq.kring.ready;
+		__u32 tail = *sq.kring.tail;	// append starting at the ring's current tail index
 
 		// Add the sqes to the array
 		for( i; have ) {
+			__cfadbg_print_safe(io, "Kernel I/O : __submit loop\n");
 			sq.kring.array[ (tail + i) & mask ] = idxs[i];
 		}
 
 		// Make the sqes visible to the submitter
-		__atomic_store_n(&sq.kring.ready, tail + have, __ATOMIC_RELEASE);
-
-		// Make sure the poller is awake
-		__cfadbg_print_safe(io, "Kernel I/O : waking the poller\n");
-		post( ctx->sem );
-	}
-
-	void cfa_io_submit( struct $io_context * inctx, __u32 idxs[], __u32 have ) __attribute__((nonnull (1))) {
-		__cfadbg_print_safe(io, "Kernel I/O : attempting to submit %u\n", have);
+		__atomic_store_n(sq.kring.tail, tail + have, __ATOMIC_RELEASE);	// release-store so the array writes above are ordered before the new tail
+		sq.to_submit += have;	// count every SQE just published (the tail moved by 'have'), not one per call
+
+		ctx->proc->io.pending = true;
+		ctx->proc->io.dirty   = true;
+		if(sq.to_submit > 30 || !lazy) {	// eager requests flush immediately; lazy ones are batched until more than 30 are pending
+			__cfa_io_flush( ctx->proc );
+		}
+	}
+
+	void cfa_io_submit( struct $io_context * inctx, __u32 idxs[], __u32 have, bool lazy ) __attribute__((nonnull (1))) {
+		__cfadbg_print_safe(io, "Kernel I/O : attempting to submit %u (%s)\n", have, lazy ? "lazy" : "eager");
 
 		disable_interrupts();
 		processor * proc = __cfaabi_tls.this_processor;
+		$io_context * ctx = proc->io.ctx;
 		/* paranoid */ verify( __cfaabi_tls.this_processor );
-		/* paranoid */ verify( proc->io.lock == false );
-
-		__atomic_store_n( &proc->io.lock, true, __ATOMIC_SEQ_CST );
-		$io_context * ctx = proc->io.ctx;
+		/* paranoid */ verify( ctx );
 
 		// Can we proceed to the fast path
-		if(  ctx				// We alreay have an instance?
-		&&  !ctx->revoked		// Our instance is still valid?
-		&&   ctx == inctx )		// We have the right instance?
+		if( ctx == inctx )		// We have the right instance?
 		{
-			__submit(ctx, idxs, have);
+			__submit(ctx, idxs, have, lazy);
 
 			// Mark the instance as no longer in-use, re-enable interrupts and return
-			__atomic_store_n( &proc->io.lock, false, __ATOMIC_RELEASE );
 			__STATS__( true, io.submit.fast += 1; )
 			enable_interrupts( __cfaabi_dbg_ctx );
@@ -412,5 +321,4 @@
 
 		// Fast path failed, fallback on arbitration
-		__atomic_store_n( &proc->io.lock, false, __ATOMIC_RELEASE );
 		__STATS__( true, io.submit.slow += 1; )
 		enable_interrupts( __cfaabi_dbg_ctx );
@@ -418,33 +326,9 @@
 		__cfadbg_print_safe(io, "Kernel I/O : falling back on arbiter for submission\n");
 
-		__ioarbiter_submit(*inctx->arbiter, inctx, idxs, have);
+		__ioarbiter_submit(*inctx->arbiter, inctx, idxs, have, lazy);
 	}
 
 	//=============================================================================================
 	// Flushing
-	static unsigned __flush( struct $io_context & ctx ) {
-		// First check for external
-		if( !__atomic_load_n(&ctx.ext_sq.empty, __ATOMIC_SEQ_CST) ) {
-			// We have external submissions, delegate to the arbiter
-			__ioarbiter_flush( *ctx.arbiter, &ctx );
-		}
-
-		__u32 tail  = *ctx.sq.kring.tail;
-		__u32 ready = ctx.sq.kring.ready;
-
-		/* paranoid */ verify( ctx.sq.to_submit <= *ctx.sq.num );
-		ctx.sq.to_submit += (ready - tail);
-		/* paranoid */ verify( ctx.sq.to_submit <= *ctx.sq.num );
-
-		if(ctx.sq.to_submit) {
-			__cfadbg_print_safe(io, "Kernel I/O : %u ready to submit\n", ctx.sq.to_submit);
-		}
-
-		__atomic_store_n(ctx.sq.kring.tail, ready, __ATOMIC_RELEASE);
-
-		return ctx.sq.to_submit;
-	}
-
-
 	// Go through the ring's submit queue and release everything that has already been consumed
 	// by io_uring
@@ -484,4 +368,5 @@
 		// go through the range and release the sqes
 		for( i; count ) {
+			__cfadbg_print_safe(io, "Kernel I/O : release loop\n");
 			__u32 idx = ctx.sq.kring.array[ (phead + i) & mask ];
 			ctx.sq.free_ring.array[ (ftail + i) & mask ] = idx;
@@ -499,68 +384,6 @@
 // I/O Arbiter
 //=============================================================================================
-	static inline void __revoke( $io_arbiter & this, $io_context * ctx ) {
-		if(ctx->revoked) return;
-
-		/* paranoid */ verify( ctx->proc );
-		remove( this.assigned, *ctx );
-
-		// Mark as revoked
-		__atomic_store_n(&ctx->revoked, true, __ATOMIC_SEQ_CST);
-
-		// Wait for the processor to no longer use it
-		while(ctx->proc->io.lock) Pause();
-
-		// Remove the coupling with the processor
-		ctx->proc->io.ctx = 0p;
-		ctx->proc = 0p;
-
-		// add to available contexts
-		addHead( this.available, *ctx );
-	}
-
-	static inline void __assign( $io_arbiter & this, $io_context * ctx, processor * proc ) {
-		remove( this.available, *ctx );
-
-		ctx->revoked = false;
-		ctx->proc = proc;
-		__atomic_store_n(&proc->io.ctx, ctx, __ATOMIC_SEQ_CST);
-
-		// add to assigned contexts
-		addTail( this.assigned, *ctx );
-	}
-
 	static $io_context * __ioarbiter_allocate( $io_arbiter & mutex this, processor * proc, __u32 idxs[], __u32 want ) {
 		__cfadbg_print_safe(io, "Kernel I/O : arbiter allocating\n");
-
-		SeqIter($io_context) iter;
-		$io_context & ci;
-		// Do we already have something available?
-		for( over( iter, this.available ); iter | ci;) {
-			__cfadbg_print_safe(io, "Kernel I/O : attempting available context\n");
-
-			$io_context * c = &ci;
-			if(__alloc(c, idxs, want)) {
-				__assign( this, c, proc);
-				return c;
-			}
-		}
-
-
-		// Otherwise, we have no choice but to revoke everyone to check if other instance have available data
-		for( over( iter, this.assigned ); iter | ci; ) {
-			__cfadbg_print_safe(io, "Kernel I/O : revoking context for allocation\n");
-
-			$io_context * c = &ci;
-			__revoke( this, c );
-
-			__STATS__( false, io.alloc.revoke += 1; )
-
-			if(__alloc(c, idxs, want)) {
-				__assign( this, c, proc);
-				return c;
-			}
-		}
-
-		__cfadbg_print_safe(io, "Kernel I/O : waiting for available resources\n");
 
 		__STATS__( false, io.alloc.block += 1; )
@@ -577,6 +400,6 @@
 		/* paranoid */ verify( ret );
 
-		__assign( this, this.pending.ctx, proc);
 		return this.pending.ctx;
+
 	}
 
@@ -586,4 +409,5 @@
 
 		while( !is_empty(this.pending.blocked) ) {
+			__cfadbg_print_safe(io, "Kernel I/O : notifying\n");
 			__u32 have = ctx->sq.free_ring.tail - ctx->sq.free_ring.head;
 			__u32 want = front( this.pending.blocked );
@@ -604,5 +428,5 @@
 
 	// Simply append to the pending
-	static void __ioarbiter_submit( $io_arbiter & mutex this, $io_context * ctx, __u32 idxs[], __u32 have ) {
+	static void __ioarbiter_submit( $io_arbiter & mutex this, $io_context * ctx, __u32 idxs[], __u32 have, bool lazy ) {
 		__cfadbg_print_safe(io, "Kernel I/O : submitting %u from the arbiter to context %u\n", have, ctx->fd);
 
@@ -612,7 +436,4 @@
 		__atomic_store_n( &ctx->ext_sq.empty, false, __ATOMIC_SEQ_CST );
 
-		// Wake-up the poller
-		post( ctx->sem );
-
 		__cfadbg_print_safe(io, "Kernel I/O : waiting to submit %u\n", have);
 
@@ -621,5 +442,5 @@
 
 		// Submit our indexes
-		__submit(ctx, idxs, have);
+		__submit(ctx, idxs, have, lazy);
 
 		__cfadbg_print_safe(io, "Kernel I/O : %u submitted from arbiter\n", have);
@@ -630,6 +451,4 @@
 
 		__STATS__( false, io.flush.external += 1; )
-
-		__revoke( this, ctx );
 
 		__cfadbg_print_safe(io, "Kernel I/O : arbiter flushing\n");
@@ -643,25 +462,3 @@
 		ctx->ext_sq.empty = true;
 	}
-
-	void __ioarbiter_register( $io_arbiter & mutex this, $io_context & ctx ) {
-		__cfadbg_print_safe(io, "Kernel I/O : registering new context\n");
-
-		ctx.arbiter = &this;
-
-		// add to available contexts
-		addHead( this.available, ctx );
-
-		// Check if this solves pending allocations
-		if(this.pending.flag) {
-			__ioarbiter_notify( ctx );
-		}
-	}
-
-	void __ioarbiter_unregister( $io_arbiter & mutex this, $io_context & ctx ) {
-		/* paranoid */ verify( &this == ctx.arbiter );
-
-		__revoke( this, &ctx );
-
-		remove( this.available, ctx );
-	}
 #endif
Index: libcfa/src/concurrency/io/call.cfa.in
===================================================================
--- libcfa/src/concurrency/io/call.cfa.in	(revision 6047b008bf08ee8a70f0827c3ab59cb8fa9f4d24)
+++ libcfa/src/concurrency/io/call.cfa.in	(revision dddb3dd01058a6f0eb621887f1bc9675d616a840)
@@ -75,5 +75,5 @@
 
 	extern struct $io_context * cfa_io_allocate(struct io_uring_sqe * out_sqes[], __u32 out_idxs[], __u32 want)  __attribute__((nonnull (1,2)));
-	extern void cfa_io_submit( struct $io_context * in_ctx, __u32 in_idxs[], __u32 have ) __attribute__((nonnull (1,2)));
+	extern void cfa_io_submit( struct $io_context * in_ctx, __u32 in_idxs[], __u32 have, bool lazy ) __attribute__((nonnull (1,2)));
 #endif
 
@@ -185,5 +185,5 @@
 		return ', '.join(args_a)
 
-AsyncTemplate = """inline void async_{name}(io_future_t & future, {params}, int submit_flags) {{
+AsyncTemplate = """inline void async_{name}(io_future_t & future, {params}, __u64 submit_flags) {{
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_{op})
 		ssize_t res = {name}({args});
@@ -216,9 +216,9 @@
 
 		verify( sqe->user_data == (__u64)(uintptr_t)&future );
-		cfa_io_submit( ctx, &idx, 1 );
+		cfa_io_submit( ctx, &idx, 1, 0 != (submit_flags & CFA_IO_LAZY) );
 	#endif
 }}"""
 
-SyncTemplate = """{ret} cfa_{name}({params}, int submit_flags) {{
+SyncTemplate = """{ret} cfa_{name}({params}, __u64 submit_flags) {{
 	io_future_t future;
 
@@ -388,8 +388,8 @@
 	if c.define:
 		print("""#if defined({define})
-	{ret} cfa_{name}({params}, int submit_flags);
+	{ret} cfa_{name}({params}, __u64 submit_flags);
 #endif""".format(define=c.define,ret=c.ret, name=c.name, params=c.params))
 	else:
-		print("{ret} cfa_{name}({params}, int submit_flags);"
+		print("{ret} cfa_{name}({params}, __u64 submit_flags);"
 		.format(ret=c.ret, name=c.name, params=c.params))
 
@@ -399,8 +399,8 @@
 	if c.define:
 		print("""#if defined({define})
-	void async_{name}(io_future_t & future, {params}, int submit_flags);
+	void async_{name}(io_future_t & future, {params}, __u64 submit_flags);
 #endif""".format(define=c.define,name=c.name, params=c.params))
 	else:
-		print("void async_{name}(io_future_t & future, {params}, int submit_flags);"
+		print("void async_{name}(io_future_t & future, {params}, __u64 submit_flags);"
 		.format(name=c.name, params=c.params))
 print("\n")
Index: libcfa/src/concurrency/io/setup.cfa
===================================================================
--- libcfa/src/concurrency/io/setup.cfa	(revision 6047b008bf08ee8a70f0827c3ab59cb8fa9f4d24)
+++ libcfa/src/concurrency/io/setup.cfa	(revision dddb3dd01058a6f0eb621887f1bc9675d616a840)
@@ -26,12 +26,4 @@
 
 #if !defined(CFA_HAVE_LINUX_IO_URING_H)
-	void __kernel_io_startup() {
-		// Nothing to do without io_uring
-	}
-
-	void __kernel_io_shutdown() {
-		// Nothing to do without io_uring
-	}
-
 	void ?{}(io_context_params & this) {}
 
@@ -97,118 +89,10 @@
 
 //=============================================================================================
-// I/O Startup / Shutdown logic + Master Poller
-//=============================================================================================
-
-	// IO Master poller loop forward
-	static void * iopoll_loop( __attribute__((unused)) void * args );
-
-	static struct {
-		      pthread_t  thrd;    // pthread handle to io poller thread
-		      void *     stack;   // pthread stack for io poller thread
-		      int        epollfd; // file descriptor to the epoll instance
-		volatile     bool run;     // Whether or not to continue
-		volatile     bool stopped; // Whether the poller has finished running
-		volatile uint64_t epoch;   // Epoch used for memory reclamation
-	} iopoll;
-
-	void __kernel_io_startup(void) {
-		__cfadbg_print_safe(io_core, "Kernel : Creating EPOLL instance\n" );
-
-		iopoll.epollfd = epoll_create1(0);
-		if (iopoll.epollfd == -1) {
-			abort( "internal error, epoll_create1\n");
-		}
-
-		__cfadbg_print_safe(io_core, "Kernel : Starting io poller thread\n" );
-
-		iopoll.stack   = __create_pthread( &iopoll.thrd, iopoll_loop, 0p );
-		iopoll.run     = true;
-		iopoll.stopped = false;
-		iopoll.epoch   = 0;
-	}
-
-	void __kernel_io_shutdown(void) {
-		// Notify the io poller thread of the shutdown
-		iopoll.run = false;
-		sigval val = { 1 };
-		pthread_sigqueue( iopoll.thrd, SIGUSR1, val );
-
-		// Wait for the io poller thread to finish
-
-		__destroy_pthread( iopoll.thrd, iopoll.stack, 0p );
-
-		int ret = close(iopoll.epollfd);
-		if (ret == -1) {
-			abort( "internal error, close epoll\n");
-		}
-
-		// Io polling is now fully stopped
-
-		__cfadbg_print_safe(io_core, "Kernel : IO poller stopped\n" );
-	}
-
-	static void * iopoll_loop( __attribute__((unused)) void * args ) {
-		__processor_id_t id;
-		id.full_proc = false;
-		id.id = doregister(&id);
-		__cfaabi_tls.this_proc_id = &id;
-		__cfadbg_print_safe(io_core, "Kernel : IO poller thread starting\n" );
-
-		// Block signals to control when they arrive
-		sigset_t mask;
-		sigfillset(&mask);
-		if ( pthread_sigmask( SIG_BLOCK, &mask, 0p ) == -1 ) {
-		abort( "internal error, pthread_sigmask" );
-		}
-
-		sigdelset( &mask, SIGUSR1 );
-
-		// Create sufficient events
-		struct epoll_event events[10];
-		// Main loop
-		while( iopoll.run ) {
-			__cfadbg_print_safe(io_core, "Kernel I/O - epoll : waiting on io_uring contexts\n");
-
-			// increment the epoch to notify any deleters we are starting a new cycle
-			__atomic_fetch_add(&iopoll.epoch, 1, __ATOMIC_SEQ_CST);
-
-			// Wait for events
-			int nfds = epoll_pwait( iopoll.epollfd, events, 10, -1, &mask );
-
-			__cfadbg_print_safe(io_core, "Kernel I/O - epoll : %d io contexts events, waking up\n", nfds);
-
-			// Check if an error occured
-			if (nfds == -1) {
-				if( errno == EINTR ) continue;
-				abort( "internal error, pthread_sigmask" );
-			}
-
-			for(i; nfds) {
-				$io_context * io_ctx = ($io_context *)(uintptr_t)events[i].data.u64;
-				/* paranoid */ verify( io_ctx );
-				__cfadbg_print_safe(io_core, "Kernel I/O - epoll : Unparking io poller %d (%p)\n", io_ctx->fd, io_ctx);
-				#if !defined( __CFA_NO_STATISTICS__ )
-					__cfaabi_tls.this_stats = io_ctx->self.curr_cluster->stats;
-				#endif
-
-				eventfd_t v;
-				eventfd_read(io_ctx->efd, &v);
-
-				post( io_ctx->sem );
-			}
-		}
-
-		__atomic_store_n(&iopoll.stopped, true, __ATOMIC_SEQ_CST);
-
-		__cfadbg_print_safe(io_core, "Kernel : IO poller thread stopping\n" );
-		unregister(&id);
-		return 0p;
-	}
-
-//=============================================================================================
 // I/O Context Constrution/Destruction
 //=============================================================================================
 
-	static void __io_uring_setup ( $io_context & this, const io_context_params & params_in );
+
+
+	static void __io_uring_setup ( $io_context & this, const io_context_params & params_in, int procfd );
 	static void __io_uring_teardown( $io_context & this );
 	static void __epoll_register($io_context & ctx);
@@ -217,28 +101,16 @@
 	void __ioarbiter_unregister( $io_arbiter & mutex, $io_context & ctx );
 
-	void ?{}($io_context & this, struct cluster & cl) {
-		(this.self){ "IO Poller", cl };
+	void ?{}($io_context & this, processor * proc, struct cluster & cl) {
+		/* paranoid */ verify( cl.io.arbiter );
+		this.proc = proc;
+		this.arbiter = cl.io.arbiter;
 		this.ext_sq.empty = true;
-		this.revoked = true;
-		__io_uring_setup( this, cl.io.params );
+		(this.ext_sq.blocked){};
+		__io_uring_setup( this, cl.io.params, proc->idle );
 		__cfadbg_print_safe(io_core, "Kernel I/O : Created ring for io_context %u (%p)\n", this.fd, &this);
-
-		__epoll_register(this);
-
-		__ioarbiter_register(*cl.io.arbiter, this);
-
-		__thrd_start( this, main );
-		__cfadbg_print_safe(io_core, "Kernel I/O : Started poller thread for io_context %u\n", this.fd);
-	}
-
-	void ^?{}($io_context & mutex this) {
+	}
+
+	void ^?{}($io_context & this) {
 		__cfadbg_print_safe(io_core, "Kernel I/O : tearing down io_context %u\n", this.fd);
-
-		^(this.self){};
-		__cfadbg_print_safe(io_core, "Kernel I/O : Stopped poller thread for io_context %u\n", this.fd);
-
-		__ioarbiter_unregister(*this.arbiter, this);
-
-		__epoll_unregister(this);
 
 		__io_uring_teardown( this );
@@ -246,22 +118,8 @@
 	}
 
-	void ?{}(io_context & this, struct cluster & cl) {
-		// this.ctx = new(cl);
-		this.ctx = alloc();
-		(*this.ctx){ cl };
-
-		__cfadbg_print_safe(io_core, "Kernel I/O : io_context %u ready\n", this.ctx->fd);
-	}
-
-	void ^?{}(io_context & this) {
-		post( this.ctx->sem );
-
-		delete(this.ctx);
-	}
-
 	extern void __disable_interrupts_hard();
 	extern void __enable_interrupts_hard();
 
-	static void __io_uring_setup( $io_context & this, const io_context_params & params_in ) {
+	static void __io_uring_setup( $io_context & this, const io_context_params & params_in, int procfd ) {
 		// Step 1 : call to setup
 		struct io_uring_params params;
@@ -339,5 +197,4 @@
 		sq.dropped     = (         __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.dropped);
 
-		sq.kring.ready = 0;
 		sq.kring.released = 0;
 
@@ -362,12 +219,9 @@
 		// io_uring_register is so f*cking slow on some machine that it
 		// will never succeed if preemption isn't hard blocked
+		__cfadbg_print_safe(io_core, "Kernel I/O : registering %d for completion with ring %d\n", procfd, fd);
+
 		__disable_interrupts_hard();
 
-		int efd = eventfd(0, 0);
-		if (efd < 0) {
-			abort("KERNEL ERROR: IO_URING EVENTFD - %s\n", strerror(errno));
-		}
-
-		int ret = syscall( __NR_io_uring_register, fd, IORING_REGISTER_EVENTFD, &efd, 1);
+		int ret = syscall( __NR_io_uring_register, fd, IORING_REGISTER_EVENTFD, &procfd, 1);
 		if (ret < 0) {
 			abort("KERNEL ERROR: IO_URING EVENTFD REGISTER - %s\n", strerror(errno));
@@ -375,4 +229,6 @@
 
 		__enable_interrupts_hard();
+
+		__cfadbg_print_safe(io_core, "Kernel I/O : registered %d for completion with ring %d\n", procfd, fd);
 
 		// some paranoid checks
@@ -390,5 +246,4 @@
 		this.ring_flags = 0;
 		this.fd         = fd;
-		this.efd        = efd;
 	}
 
@@ -411,49 +266,57 @@
 		// close the file descriptor
 		close(this.fd);
-		close(this.efd);
 
 		free( this.sq.free_ring.array ); // Maybe null, doesn't matter
 	}
 
+	void __cfa_io_start( processor * proc ) {
+		proc->io.ctx = alloc();
+		(*proc->io.ctx){proc, *proc->cltr};
+	}
+	void __cfa_io_stop ( processor * proc ) {
+		^(*proc->io.ctx){};
+		free(proc->io.ctx);
+	}
+
 //=============================================================================================
 // I/O Context Sleep
 //=============================================================================================
-	static inline void __epoll_ctl($io_context & ctx, int op, const char * error) {
-		struct epoll_event ev;
-		ev.events = EPOLLIN | EPOLLONESHOT;
-		ev.data.u64 = (__u64)&ctx;
-		int ret = epoll_ctl(iopoll.epollfd, op, ctx.efd, &ev);
-		if (ret < 0) {
-			abort( "KERNEL ERROR: EPOLL %s - (%d) %s\n", error, (int)errno, strerror(errno) );
-		}
-	}
-
-	static void __epoll_register($io_context & ctx) {
-		__epoll_ctl(ctx, EPOLL_CTL_ADD, "ADD");
-	}
-
-	static void __epoll_unregister($io_context & ctx) {
-		// Read the current epoch so we know when to stop
-		size_t curr = __atomic_load_n(&iopoll.epoch, __ATOMIC_SEQ_CST);
-
-		// Remove the fd from the iopoller
-		__epoll_ctl(ctx, EPOLL_CTL_DEL, "REMOVE");
-
-		// Notify the io poller thread of the shutdown
-		iopoll.run = false;
-		sigval val = { 1 };
-		pthread_sigqueue( iopoll.thrd, SIGUSR1, val );
-
-		// Make sure all this is done
-		__atomic_thread_fence(__ATOMIC_SEQ_CST);
-
-		// Wait for the next epoch
-		while(curr == iopoll.epoch && !iopoll.stopped) Pause();
-	}
-
-	void __ioctx_prepare_block($io_context & ctx) {
-		__cfadbg_print_safe(io_core, "Kernel I/O - epoll : Re-arming io poller %d (%p)\n", ctx.fd, &ctx);
-		__epoll_ctl(ctx, EPOLL_CTL_MOD, "REARM");
-	}
+	// static inline void __epoll_ctl($io_context & ctx, int op, const char * error) {
+	// 	struct epoll_event ev;
+	// 	ev.events = EPOLLIN | EPOLLONESHOT;
+	// 	ev.data.u64 = (__u64)&ctx;
+	// 	int ret = epoll_ctl(iopoll.epollfd, op, ctx.efd, &ev);
+	// 	if (ret < 0) {
+	// 		abort( "KERNEL ERROR: EPOLL %s - (%d) %s\n", error, (int)errno, strerror(errno) );
+	// 	}
+	// }
+
+	// static void __epoll_register($io_context & ctx) {
+	// 	__epoll_ctl(ctx, EPOLL_CTL_ADD, "ADD");
+	// }
+
+	// static void __epoll_unregister($io_context & ctx) {
+	// 	// Read the current epoch so we know when to stop
+	// 	size_t curr = __atomic_load_n(&iopoll.epoch, __ATOMIC_SEQ_CST);
+
+	// 	// Remove the fd from the iopoller
+	// 	__epoll_ctl(ctx, EPOLL_CTL_DEL, "REMOVE");
+
+	// 	// Notify the io poller thread of the shutdown
+	// 	iopoll.run = false;
+	// 	sigval val = { 1 };
+	// 	pthread_sigqueue( iopoll.thrd, SIGUSR1, val );
+
+	// 	// Make sure all this is done
+	// 	__atomic_thread_fence(__ATOMIC_SEQ_CST);
+
+	// 	// Wait for the next epoch
+	// 	while(curr == iopoll.epoch && !iopoll.stopped) Pause();
+	// }
+
+	// void __ioctx_prepare_block($io_context & ctx) {
+	// 	__cfadbg_print_safe(io_core, "Kernel I/O - epoll : Re-arming io poller %d (%p)\n", ctx.fd, &ctx);
+	// 	__epoll_ctl(ctx, EPOLL_CTL_MOD, "REARM");
+	// }
 
 
@@ -466,6 +329,6 @@
 
 	void ^?{}( $io_arbiter & mutex this ) {
-		/* paranoid */ verify( empty(this.assigned) );
-		/* paranoid */ verify( empty(this.available) );
+		// /* paranoid */ verify( empty(this.assigned) );
+		// /* paranoid */ verify( empty(this.available) );
 		/* paranoid */ verify( is_empty(this.pending.blocked) );
 	}
Index: libcfa/src/concurrency/io/types.hfa
===================================================================
--- libcfa/src/concurrency/io/types.hfa	(revision 6047b008bf08ee8a70f0827c3ab59cb8fa9f4d24)
+++ libcfa/src/concurrency/io/types.hfa	(revision dddb3dd01058a6f0eb621887f1bc9675d616a840)
@@ -38,5 +38,4 @@
 			volatile __u32 * head;	 // one passed last index consumed by the kernel
 			volatile __u32 * tail;   // one passed last index visible to the kernel
-			volatile __u32 ready;    // one passed last index added to array ()
 			volatile __u32 released; // one passed last index released back to the free list
 
@@ -97,10 +96,6 @@
 
 	struct __attribute__((aligned(128))) $io_context {
-		inline Seqable;
-
-		volatile bool revoked;
+		$io_arbiter * arbiter;
 		processor * proc;
-
-		$io_arbiter * arbiter;
 
 		struct {
@@ -113,16 +108,5 @@
 		__u32 ring_flags;
 		int fd;
-		int efd;
-
-		single_sem sem;
-		$thread self;
 	};
-
-	void main( $io_context & this );
-	static inline $thread  * get_thread ( $io_context & this ) __attribute__((const)) { return &this.self; }
-	static inline $monitor * get_monitor( $io_context & this ) __attribute__((const)) { return &this.self.self_mon; }
-	static inline $io_context *& Back( $io_context * n ) { return ($io_context *)Back( (Seqable *)n ); }
-	static inline $io_context *& Next( $io_context * n ) { return ($io_context *)Next( (Colable *)n ); }
-	void ^?{}( $io_context & mutex this );
 
 	monitor __attribute__((aligned(128))) $io_arbiter {
@@ -132,8 +116,4 @@
 			volatile bool flag;
 		} pending;
-
-		Sequence($io_context) assigned;
-
-		Sequence($io_context) available;
 	};
 
@@ -167,5 +147,5 @@
 	#endif
 
-	void __ioctx_prepare_block($io_context & ctx);
+	// void __ioctx_prepare_block($io_context & ctx);
 #endif
 
Index: libcfa/src/concurrency/iofwd.hfa
===================================================================
--- libcfa/src/concurrency/iofwd.hfa	(revision 6047b008bf08ee8a70f0827c3ab59cb8fa9f4d24)
+++ libcfa/src/concurrency/iofwd.hfa	(revision dddb3dd01058a6f0eb621887f1bc9675d616a840)
@@ -59,71 +59,71 @@
 // underlying calls
 extern struct $io_context * cfa_io_allocate(struct io_uring_sqe * out_sqes[], __u32 out_idxs[], __u32 want)  __attribute__((nonnull (1,2)));
-extern void cfa_io_submit( struct $io_context * in_ctx, __u32 in_idxs[], __u32 have ) __attribute__((nonnull (1,2)));
+extern void cfa_io_submit( struct $io_context * in_ctx, __u32 in_idxs[], __u32 have, bool lazy ) __attribute__((nonnull (1,2)));
 
 //----------
 // synchronous calls
 #if defined(CFA_HAVE_PREADV2)
-	extern ssize_t cfa_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags);
+	extern ssize_t cfa_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);
 #endif
 #if defined(CFA_HAVE_PWRITEV2)
-	extern ssize_t cfa_pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags);
+	extern ssize_t cfa_pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);
 #endif
-extern int cfa_fsync(int fd, int submit_flags);
-extern int cfa_epoll_ctl(int epfd, int op, int fd, struct epoll_event *event, int submit_flags);
-extern int cfa_sync_file_range(int fd, off64_t offset, off64_t nbytes, unsigned int flags, int submit_flags);
-extern  ssize_t cfa_sendmsg(int sockfd, const struct msghdr *msg, int flags, int submit_flags);
-extern ssize_t cfa_recvmsg(int sockfd, struct msghdr *msg, int flags, int submit_flags);
-extern ssize_t cfa_send(int sockfd, const void *buf, size_t len, int flags, int submit_flags);
-extern ssize_t cfa_recv(int sockfd, void *buf, size_t len, int flags, int submit_flags);
-extern int cfa_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags, int submit_flags);
-extern int cfa_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen, int submit_flags);
-extern int cfa_fallocate(int fd, int mode, off_t offset, off_t len, int submit_flags);
-extern int cfa_posix_fadvise(int fd, off_t offset, off_t len, int advice, int submit_flags);
-extern int cfa_madvise(void *addr, size_t length, int advice, int submit_flags);
-extern int cfa_openat(int dirfd, const char *pathname, int flags, mode_t mode, int submit_flags);
+extern int cfa_fsync(int fd, __u64 submit_flags);
+extern int cfa_epoll_ctl(int epfd, int op, int fd, struct epoll_event *event, __u64 submit_flags);
+extern int cfa_sync_file_range(int fd, off64_t offset, off64_t nbytes, unsigned int flags, __u64 submit_flags);
+extern  ssize_t cfa_sendmsg(int sockfd, const struct msghdr *msg, int flags, __u64 submit_flags);
+extern ssize_t cfa_recvmsg(int sockfd, struct msghdr *msg, int flags, __u64 submit_flags);
+extern ssize_t cfa_send(int sockfd, const void *buf, size_t len, int flags, __u64 submit_flags);
+extern ssize_t cfa_recv(int sockfd, void *buf, size_t len, int flags, __u64 submit_flags);
+extern int cfa_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags, __u64 submit_flags);
+extern int cfa_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen, __u64 submit_flags);
+extern int cfa_fallocate(int fd, int mode, off_t offset, off_t len, __u64 submit_flags);
+extern int cfa_posix_fadvise(int fd, off_t offset, off_t len, int advice, __u64 submit_flags);
+extern int cfa_madvise(void *addr, size_t length, int advice, __u64 submit_flags);
+extern int cfa_openat(int dirfd, const char *pathname, int flags, mode_t mode, __u64 submit_flags);
 #if defined(CFA_HAVE_OPENAT2)
-	extern int cfa_openat2(int dirfd, const char *pathname, struct open_how * how, size_t size, int submit_flags);
+	extern int cfa_openat2(int dirfd, const char *pathname, struct open_how * how, size_t size, __u64 submit_flags);
 #endif
-extern int cfa_close(int fd, int submit_flags);
+extern int cfa_close(int fd, __u64 submit_flags);
 #if defined(CFA_HAVE_STATX)
-	extern int cfa_statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf, int submit_flags);
+	extern int cfa_statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf, __u64 submit_flags);
 #endif
-extern ssize_t cfa_read(int fd, void * buf, size_t count, int submit_flags);
-extern ssize_t cfa_write(int fd, void * buf, size_t count, int submit_flags);
-extern ssize_t cfa_splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags, int submit_flags);
-extern ssize_t cfa_tee(int fd_in, int fd_out, size_t len, unsigned int flags, int submit_flags);
+extern ssize_t cfa_read(int fd, void * buf, size_t count, __u64 submit_flags);
+extern ssize_t cfa_write(int fd, void * buf, size_t count, __u64 submit_flags);
+extern ssize_t cfa_splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags, __u64 submit_flags);
+extern ssize_t cfa_tee(int fd_in, int fd_out, size_t len, unsigned int flags, __u64 submit_flags);
 
 //----------
 // asynchronous calls
 #if defined(CFA_HAVE_PREADV2)
-	extern void async_preadv2(io_future_t & future, int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags);
+	extern void async_preadv2(io_future_t & future, int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);
 #endif
 #if defined(CFA_HAVE_PWRITEV2)
-	extern void async_pwritev2(io_future_t & future, int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags);
+	extern void async_pwritev2(io_future_t & future, int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);
 #endif
-extern void async_fsync(io_future_t & future, int fd, int submit_flags);
-extern void async_epoll_ctl(io_future_t & future, int epfd, int op, int fd, struct epoll_event *event, int submit_flags);
-extern void async_sync_file_range(io_future_t & future, int fd, off64_t offset, off64_t nbytes, unsigned int flags, int submit_flags);
-extern void async_sendmsg(io_future_t & future, int sockfd, const struct msghdr *msg, int flags, int submit_flags);
-extern void async_recvmsg(io_future_t & future, int sockfd, struct msghdr *msg, int flags, int submit_flags);
-extern void async_send(io_future_t & future, int sockfd, const void *buf, size_t len, int flags, int submit_flags);
-extern void async_recv(io_future_t & future, int sockfd, void *buf, size_t len, int flags, int submit_flags);
-extern void async_accept4(io_future_t & future, int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags, int submit_flags);
-extern void async_connect(io_future_t & future, int sockfd, const struct sockaddr *addr, socklen_t addrlen, int submit_flags);
-extern void async_fallocate(io_future_t & future, int fd, int mode, off_t offset, off_t len, int submit_flags);
-extern void async_posix_fadvise(io_future_t & future, int fd, off_t offset, off_t len, int advice, int submit_flags);
-extern void async_madvise(io_future_t & future, void *addr, size_t length, int advice, int submit_flags);
-extern void async_openat(io_future_t & future, int dirfd, const char *pathname, int flags, mode_t mode, int submit_flags);
+extern void async_fsync(io_future_t & future, int fd, __u64 submit_flags);
+extern void async_epoll_ctl(io_future_t & future, int epfd, int op, int fd, struct epoll_event *event, __u64 submit_flags);
+extern void async_sync_file_range(io_future_t & future, int fd, off64_t offset, off64_t nbytes, unsigned int flags, __u64 submit_flags);
+extern void async_sendmsg(io_future_t & future, int sockfd, const struct msghdr *msg, int flags, __u64 submit_flags);
+extern void async_recvmsg(io_future_t & future, int sockfd, struct msghdr *msg, int flags, __u64 submit_flags);
+extern void async_send(io_future_t & future, int sockfd, const void *buf, size_t len, int flags, __u64 submit_flags);
+extern void async_recv(io_future_t & future, int sockfd, void *buf, size_t len, int flags, __u64 submit_flags);
+extern void async_accept4(io_future_t & future, int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags, __u64 submit_flags);
+extern void async_connect(io_future_t & future, int sockfd, const struct sockaddr *addr, socklen_t addrlen, __u64 submit_flags);
+extern void async_fallocate(io_future_t & future, int fd, int mode, off_t offset, off_t len, __u64 submit_flags);
+extern void async_posix_fadvise(io_future_t & future, int fd, off_t offset, off_t len, int advice, __u64 submit_flags);
+extern void async_madvise(io_future_t & future, void *addr, size_t length, int advice, __u64 submit_flags);
+extern void async_openat(io_future_t & future, int dirfd, const char *pathname, int flags, mode_t mode, __u64 submit_flags);
 #if defined(CFA_HAVE_OPENAT2)
-	extern void async_openat2(io_future_t & future, int dirfd, const char *pathname, struct open_how * how, size_t size, int submit_flags);
+	extern void async_openat2(io_future_t & future, int dirfd, const char *pathname, struct open_how * how, size_t size, __u64 submit_flags);
 #endif
-extern void async_close(io_future_t & future, int fd, int submit_flags);
+extern void async_close(io_future_t & future, int fd, __u64 submit_flags);
 #if defined(CFA_HAVE_STATX)
-	extern void async_statx(io_future_t & future, int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf, int submit_flags);
+	extern void async_statx(io_future_t & future, int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf, __u64 submit_flags);
 #endif
-void async_read(io_future_t & future, int fd, void * buf, size_t count, int submit_flags);
-extern void async_write(io_future_t & future, int fd, void * buf, size_t count, int submit_flags);
-extern void async_splice(io_future_t & future, int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags, int submit_flags);
-extern void async_tee(io_future_t & future, int fd_in, int fd_out, size_t len, unsigned int flags, int submit_flags);
+void async_read(io_future_t & future, int fd, void * buf, size_t count, __u64 submit_flags);
+extern void async_write(io_future_t & future, int fd, void * buf, size_t count, __u64 submit_flags);
+extern void async_splice(io_future_t & future, int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags, __u64 submit_flags);
+extern void async_tee(io_future_t & future, int fd_in, int fd_out, size_t len, unsigned int flags, __u64 submit_flags);
 
 
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 6047b008bf08ee8a70f0827c3ab59cb8fa9f4d24)
+++ libcfa/src/concurrency/kernel.cfa	(revision dddb3dd01058a6f0eb621887f1bc9675d616a840)
@@ -22,4 +22,7 @@
 #include <signal.h>
 #include <unistd.h>
+extern "C" {
+	#include <sys/eventfd.h>
+}
 
 //CFA Includes
@@ -109,5 +112,4 @@
 static void __run_thread(processor * this, $thread * dst);
 static void __wake_one(cluster * cltr);
-static void wait(__bin_sem_t & this);
 
 static void push  (__cluster_idles & idles, processor & proc);
@@ -115,4 +117,12 @@
 static [unsigned idle, unsigned total, * processor] query( & __cluster_idles idles );
 
+extern void __cfa_io_start( processor * );
+extern void __cfa_io_drain( processor * );
+extern void __cfa_io_flush( processor * );
+extern void __cfa_io_stop ( processor * );
+static inline void __maybe_io_drain( processor * );
+
+extern void __disable_interrupts_hard();
+extern void __enable_interrupts_hard();
 
 //=============================================================================================
@@ -130,4 +140,6 @@
 	verify(this);
 
+	__cfa_io_start( this );
+
 	__cfadbg_print_safe(runtime_core, "Kernel : core %p starting\n", this);
 	#if !defined(__CFA_NO_STATISTICS__)
@@ -151,8 +163,12 @@
 		MAIN_LOOP:
 		for() {
+			// Check if there is pending io
+			__maybe_io_drain( this );
+
 			// Try to get the next thread
 			readyThread = __next_thread( this->cltr );
 
 			if( !readyThread ) {
+				__cfa_io_flush( this );
 				readyThread = __next_thread_slow( this->cltr );
 			}
@@ -190,5 +206,10 @@
 				#endif
 
-				wait( this->idle );
+				__cfadbg_print_safe(runtime_core, "Kernel : core %p waiting on eventfd %d\n", this, this->idle);
+
+				__disable_interrupts_hard();
+				eventfd_t val;
+				eventfd_read( this->idle, &val );
+				__enable_interrupts_hard();
 
 				#if !defined(__CFA_NO_STATISTICS__)
@@ -206,4 +227,7 @@
 
 			/* paranoid */ verify( readyThread );
+
+			// Reset io dirty bit
+			this->io.dirty = false;
 
 			// We found a thread run it
@@ -220,4 +244,8 @@
 				}
 			#endif
+
+			if(this->io.pending && !this->io.dirty) {
+				__cfa_io_flush( this );
+			}
 		}
 
@@ -225,5 +253,8 @@
 	}
 
+	__cfa_io_stop( this );
+
 	post( this->terminated );
+
 
 	if(this == mainProcessor) {
@@ -248,4 +279,6 @@
 	/* paranoid */ verifyf( thrd_dst->link.next == 0p, "Expected null got %p", thrd_dst->link.next );
 	__builtin_prefetch( thrd_dst->context.SP );
+
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p running thread %p (%s)\n", this, thrd_dst, thrd_dst->self_cor.name);
 
 	$coroutine * proc_cor = get_coroutine(this->runner);
@@ -330,4 +363,6 @@
 	// Just before returning to the processor, set the processor coroutine to active
 	proc_cor->state = Active;
+
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p finished running thread %p\n", this, thrd_dst);
 
 	/* paranoid */ verify( ! __preemption_enabled() );
@@ -549,35 +584,4 @@
 // Kernel Idle Sleep
 //=============================================================================================
-extern "C" {
-	char * strerror(int);
-}
-#define CHECKED(x) { int err = x; if( err != 0 ) abort("KERNEL ERROR: Operation \"" #x "\" return error %d - %s\n", err, strerror(err)); }
-
-static void wait(__bin_sem_t & this) with( this ) {
-	verify(__cfaabi_dbg_in_kernel());
-	CHECKED( pthread_mutex_lock(&lock) );
-		while(val < 1) {
-			pthread_cond_wait(&cond, &lock);
-		}
-		val -= 1;
-	CHECKED( pthread_mutex_unlock(&lock) );
-}
-
-static bool post(__bin_sem_t & this) with( this ) {
-	bool needs_signal = false;
-
-	CHECKED( pthread_mutex_lock(&lock) );
-		if(val < 1) {
-			val += 1;
-			pthread_cond_signal(&cond);
-			needs_signal = true;
-		}
-	CHECKED( pthread_mutex_unlock(&lock) );
-
-	return needs_signal;
-}
-
-#undef CHECKED
-
 // Wake a thread from the front if there are any
 static void __wake_one(cluster * this) {
@@ -595,5 +599,7 @@
 
 	// We found a processor, wake it up
-	post( p->idle );
+	eventfd_t val;
+	val = 1;
+	eventfd_write( p->idle, val );
 
 	#if !defined(__CFA_NO_STATISTICS__)
@@ -613,5 +619,7 @@
 	disable_interrupts();
 		/* paranoid */ verify( ! __preemption_enabled() );
-		post( this->idle );
+		eventfd_t val;
+		val = 1; // counter increment posted to the processor's idle eventfd
+		eventfd_write( this->idle, val ); // wake-up must write: the idle processor blocks in eventfd_read on this fd
 	enable_interrupts( __cfaabi_dbg_ctx );
 }
@@ -696,4 +704,20 @@
 // Kernel Utilities
 //=============================================================================================
+#if defined(CFA_HAVE_LINUX_IO_URING_H)
+#include "io/types.hfa"
+#endif
+
+static inline void __maybe_io_drain( processor * proc ) {
+	#if defined(CFA_HAVE_LINUX_IO_URING_H)
+		__cfadbg_print_safe(runtime_core, "Kernel : core %p checking io for ring %d\n", proc, proc->io.ctx->fd);
+
+		// Check if we should drain the queue
+		$io_context * ctx = proc->io.ctx;
+		unsigned head = *ctx->cq.head;
+		unsigned tail = *ctx->cq.tail;
+		if(head != tail) __cfa_io_drain( proc );
+	#endif
+}
+
 //-----------------------------------------------------------------------------
 // Debug
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 6047b008bf08ee8a70f0827c3ab59cb8fa9f4d24)
+++ libcfa/src/concurrency/kernel.hfa	(revision dddb3dd01058a6f0eb621887f1bc9675d616a840)
@@ -28,15 +28,7 @@
 }
 
-//-----------------------------------------------------------------------------
-// Underlying Locks
 #ifdef __CFA_WITH_VERIFY__
 	extern bool __cfaabi_dbg_in_kernel();
 #endif
-
-struct __bin_sem_t {
-	pthread_mutex_t 	lock;
-	pthread_cond_t  	cond;
-	int     		val;
-};
 
 //-----------------------------------------------------------------------------
@@ -52,11 +44,4 @@
 void  ?{}(io_context_params & this);
 
-struct io_context {
-	$io_context * ctx;
-	cluster * cltr;
-};
-void  ?{}(io_context & this, struct cluster & cl);
-void ^?{}(io_context & this);
-
 //-----------------------------------------------------------------------------
 // Processor
@@ -98,6 +83,7 @@
 
 	struct {
-		$io_context * volatile ctx;
-		volatile bool lock;
+		$io_context * ctx;
+		bool pending;
+		bool dirty;
 	} io;
 
@@ -110,5 +96,5 @@
 
 	// Idle lock (kernel semaphore)
-	__bin_sem_t idle;
+	int idle;
 
 	// Termination synchronisation (user semaphore)
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision 6047b008bf08ee8a70f0827c3ab59cb8fa9f4d24)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision dddb3dd01058a6f0eb621887f1bc9675d616a840)
@@ -22,4 +22,5 @@
 extern "C" {
       #include <limits.h>       // PTHREAD_STACK_MIN
+	#include <sys/eventfd.h>  // eventfd
       #include <sys/mman.h>     // mprotect
       #include <sys/resource.h> // getrlimit
@@ -80,6 +81,4 @@
 static void ?{}(processorCtx_t & this) {}
 static void ?{}(processorCtx_t & this, processor * proc, current_stack_info_t * info);
-static void ?{}(__bin_sem_t & this);
-static void ^?{}(__bin_sem_t & this);
 
 #if defined(__CFA_WITH_VERIFY__)
@@ -91,6 +90,4 @@
 extern void __kernel_alarm_startup(void);
 extern void __kernel_alarm_shutdown(void);
-extern void __kernel_io_startup (void);
-extern void __kernel_io_shutdown(void);
 
 //-----------------------------------------------------------------------------
@@ -104,5 +101,4 @@
 KERNEL_STORAGE($thread,	             mainThread);
 KERNEL_STORAGE(__stack_t,            mainThreadCtx);
-KERNEL_STORAGE(io_context,           mainIoContext);
 KERNEL_STORAGE(__scheduler_RWLock_t, __scheduler_lock);
 #if !defined(__CFA_NO_STATISTICS__)
@@ -200,5 +196,4 @@
 
 	void ?{}(processor & this) with( this ) {
-		( this.idle ){};
 		( this.terminated ){};
 		( this.runner ){};
@@ -228,10 +223,4 @@
 	__kernel_alarm_startup();
 
-	// Start IO
-	__kernel_io_startup();
-
-	io_context * mainio = (io_context *)&storage_mainIoContext;
-	(*mainio){ *mainCluster };
-
 	// Add the main thread to the ready queue
 	// once resume is called on mainProcessor->runner the mainThread needs to be scheduled like any normal thread
@@ -255,8 +244,4 @@
 
 static void __kernel_shutdown(void) {
-	//Before we start shutting things down, wait for systems that need threading to shutdown
-	io_context * mainio = (io_context *)&storage_mainIoContext;
-	^(*mainio){};
-
 	/* paranoid */ verify( __preemption_enabled() );
 	disable_interrupts();
@@ -276,7 +261,4 @@
 	// Disable preemption
 	__kernel_alarm_shutdown();
-
-	// Stop IO
-	__kernel_io_shutdown();
 
 	// Destroy the main processor and its context in reverse order of construction
@@ -479,5 +461,11 @@
 
 	this.io.ctx = 0p;
-	this.io.lock = false;
+	this.io.pending = false;
+	this.io.dirty   = false;
+
+	this.idle = eventfd(0, 0);
+	if (idle < 0) {
+		abort("KERNEL ERROR: PROCESSOR EVENTFD - %s\n", strerror(errno));
+	}
 
 	#if !defined(__CFA_NO_STATISTICS__)
@@ -521,8 +509,9 @@
 	// Finally we don't need the read_lock any more
 	unregister((__processor_id_t*)&this);
+
+	close(this.idle);
 }
 
 void ?{}(processor & this, const char name[], cluster & _cltr) {
-	( this.idle ){};
 	( this.terminated ){};
 	( this.runner ){};
@@ -726,27 +715,4 @@
 }
 
-extern "C" {
-	char * strerror(int);
-}
-#define CHECKED(x) { int err = x; if( err != 0 ) abort("KERNEL ERROR: Operation \"" #x "\" return error %d - %s\n", err, strerror(err)); }
-
-static void ?{}(__bin_sem_t & this) with( this ) {
-	// Create the mutex with error checking
-	pthread_mutexattr_t mattr;
-	pthread_mutexattr_init( &mattr );
-	pthread_mutexattr_settype( &mattr, PTHREAD_MUTEX_ERRORCHECK_NP);
-	pthread_mutex_init(&lock, &mattr);
-
-	pthread_cond_init (&cond, (const pthread_condattr_t *)0p);  // workaround trac#208: cast should not be required
-	val = 0;
-}
-
-static void ^?{}(__bin_sem_t & this) with( this ) {
-	CHECKED( pthread_mutex_destroy(&lock) );
-	CHECKED( pthread_cond_destroy (&cond) );
-}
-
-#undef CHECKED
-
 #if defined(__CFA_WITH_VERIFY__)
 static bool verify_fwd_bck_rng(void) {
Index: libcfa/src/concurrency/stats.cfa
===================================================================
--- libcfa/src/concurrency/stats.cfa	(revision 6047b008bf08ee8a70f0827c3ab59cb8fa9f4d24)
+++ libcfa/src/concurrency/stats.cfa	(revision dddb3dd01058a6f0eb621887f1bc9675d616a840)
@@ -33,8 +33,8 @@
 			stats->io.submit.slow       = 0;
 			stats->io.flush.external    = 0;
-			stats->io.calls.count       = 0;
+			stats->io.calls.flush       = 0;
 			stats->io.calls.submitted   = 0;
+			stats->io.calls.drain       = 0;
 			stats->io.calls.completed   = 0;
-			stats->io.calls.blocks      = 0;
 			stats->io.calls.errors.busy = 0;
 			stats->io.poller.sleeps     = 0;
@@ -67,8 +67,8 @@
 			__atomic_fetch_add( &cltr->io.submit.slow      , proc->io.submit.slow      , __ATOMIC_SEQ_CST ); proc->io.submit.slow       = 0;
 			__atomic_fetch_add( &cltr->io.flush.external   , proc->io.flush.external   , __ATOMIC_SEQ_CST ); proc->io.flush.external    = 0;
-			__atomic_fetch_add( &cltr->io.calls.count      , proc->io.calls.count      , __ATOMIC_SEQ_CST ); proc->io.calls.count       = 0;
+			__atomic_fetch_add( &cltr->io.calls.flush      , proc->io.calls.flush      , __ATOMIC_SEQ_CST ); proc->io.calls.flush       = 0;
 			__atomic_fetch_add( &cltr->io.calls.submitted  , proc->io.calls.submitted  , __ATOMIC_SEQ_CST ); proc->io.calls.submitted   = 0;
+			__atomic_fetch_add( &cltr->io.calls.drain      , proc->io.calls.drain      , __ATOMIC_SEQ_CST ); proc->io.calls.drain       = 0;
 			__atomic_fetch_add( &cltr->io.calls.completed  , proc->io.calls.completed  , __ATOMIC_SEQ_CST ); proc->io.calls.completed   = 0;
-			__atomic_fetch_add( &cltr->io.calls.blocks     , proc->io.calls.blocks     , __ATOMIC_SEQ_CST ); proc->io.calls.blocks      = 0;
 			__atomic_fetch_add( &cltr->io.calls.errors.busy, proc->io.calls.errors.busy, __ATOMIC_SEQ_CST ); proc->io.calls.errors.busy = 0;
 			__atomic_fetch_add( &cltr->io.poller.sleeps    , proc->io.poller.sleeps    , __ATOMIC_SEQ_CST ); proc->io.poller.sleeps     = 0;
@@ -110,6 +110,6 @@
 				double avgfasts = ((double)io.submit.fast) / total_submits;
 
-				double avgsubs = ((double)io.calls.submitted) / io.calls.count;
-				double avgcomp = ((double)io.calls.completed) / io.calls.count;
+				double avgsubs = ((double)io.calls.submitted) / io.calls.flush;
+				double avgcomp = ((double)io.calls.completed) / io.calls.drain;
 
 				__cfaabi_bits_print_safe( STDOUT_FILENO,
@@ -129,5 +129,5 @@
 					, io.submit.fast, io.submit.slow, avgfasts
 					, io.flush.external
-					, io.calls.count, io.calls.blocks, io.calls.errors.busy
+					, io.calls.flush, io.calls.drain, io.calls.errors.busy
 					, io.calls.submitted, avgsubs
 					, io.calls.completed, avgcomp
Index: libcfa/src/concurrency/stats.hfa
===================================================================
--- libcfa/src/concurrency/stats.hfa	(revision 6047b008bf08ee8a70f0827c3ab59cb8fa9f4d24)
+++ libcfa/src/concurrency/stats.hfa	(revision dddb3dd01058a6f0eb621887f1bc9675d616a840)
@@ -80,8 +80,8 @@
 			} flush;
 			struct {
-				volatile uint64_t count;
+				volatile uint64_t drain;
+				volatile uint64_t completed;
+				volatile uint64_t flush;
 				volatile uint64_t submitted;
-				volatile uint64_t completed;
-				volatile uint64_t blocks;
 				struct {
 					volatile uint64_t busy;
