Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision 730f4f153cdc265f2cb9a7c7ff64a8e3c629cb76)
+++ libcfa/src/concurrency/io.cfa	(revision 70ac8d0fbe40bd7d5685450882da3c93ffb7a479)
@@ -18,7 +18,8 @@
 
 #include "kernel.hfa"
+#include "bitmanip.hfa"
 
 #if !defined(HAVE_LINUX_IO_URING_H)
-	void __kernel_io_startup( cluster &, int, bool ) {
+	void __kernel_io_startup( cluster &, unsigned, bool ) {
 		// Nothing to do without io_uring
 	}
@@ -91,5 +92,4 @@
 	struct __io_poller_fast {
 		struct __io_data * ring;
-		bool waiting;
 		$thread thrd;
 	};
@@ -97,5 +97,4 @@
 	void ?{}( __io_poller_fast & this, struct cluster & cltr ) {
 		this.ring = cltr.io;
-		this.waiting = true;
 		(this.thrd){ "Fast I/O Poller", cltr };
 	}
@@ -126,5 +125,6 @@
 		// Like head/tail but not seen by the kernel
 		volatile uint32_t alloc;
-		volatile uint32_t ready;
+		volatile uint32_t * ready;
+		uint32_t ready_cnt;
 
 		__spinlock_t lock;
@@ -145,4 +145,9 @@
 					volatile unsigned long long int block;
 				} submit_avg;
+				struct {
+					volatile unsigned long long int val;
+					volatile unsigned long long int cnt;
+					volatile unsigned long long int block;
+				} look_avg;
 			} stats;
 		#endif
@@ -192,4 +197,5 @@
 				void * stack;
 				pthread_t kthrd;
+				volatile bool blocked;
 			} slow;
 			__io_poller_fast fast;
@@ -201,5 +207,5 @@
 // I/O Startup / Shutdown logic
 //=============================================================================================
-	void __kernel_io_startup( cluster & this, int io_flags, bool main_cluster ) {
+	void __kernel_io_startup( cluster & this, unsigned io_flags, bool main_cluster ) {
 		this.io = malloc();
 
@@ -274,5 +280,17 @@
 		sq.array   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);
 		sq.alloc = *sq.tail;
-		sq.ready = *sq.tail;
+
+		if( io_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
+			/* paranoid */ verify( is_pow2( io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET ) || ((io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET) < 8)  );
+			sq.ready_cnt = max(io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET, 8);
+			sq.ready = alloc_align( 64, sq.ready_cnt );
+			for(i; sq.ready_cnt) {
+				sq.ready[i] = -1ul32;
+			}
+		}
+		else {
+			sq.ready_cnt = 0;
+			sq.ready = 0p;
+		}
 
 		// completion queue
@@ -307,4 +325,7 @@
 			this.io->submit_q.stats.submit_avg.cnt   = 0;
 			this.io->submit_q.stats.submit_avg.block = 0;
+			this.io->submit_q.stats.look_avg.val   = 0;
+			this.io->submit_q.stats.look_avg.cnt   = 0;
+			this.io->submit_q.stats.look_avg.block = 0;
 			this.io->completion_q.stats.completed_avg.val = 0;
 			this.io->completion_q.stats.completed_avg.slow_cnt = 0;
@@ -326,4 +347,5 @@
 		// Create the poller thread
 		__cfadbg_print_safe(io_core, "Kernel I/O : Creating slow poller for cluter %p\n", &this);
+		this.io->poller.slow.blocked = false;
 		this.io->poller.slow.stack = __create_pthread( &this.io->poller.slow.kthrd, __io_poller_slow, &this );
 	}
@@ -347,14 +369,12 @@
 		if( this.io->cltr_flags & CFA_CLUSTER_IO_POLLER_USER_THREAD ) {
 			with( this.io->poller.fast ) {
-				/* paranoid */ verify( waiting ); // The thread shouldn't be in a system call
 				/* paranoid */ verify( this.procs.head == 0p || &this == mainCluster );
 				/* paranoid */ verify( this.idles.head == 0p || &this == mainCluster );
 
 				// We need to adjust the clean-up based on where the thread is
-				if( thrd.preempted != __NO_PREEMPTION ) {
+				if( thrd.state == Ready || thrd.preempted != __NO_PREEMPTION ) {
 
 					// This is the tricky case
 					// The thread was preempted and now it is on the ready queue
-					/* paranoid */ verify( thrd.state == Active );           // The thread better be in this state
 					/* paranoid */ verify( thrd.next == 1p );                // The thread should be the last on the list
 					/* paranoid */ verify( this.ready_queue.head == &thrd ); // The thread should be the only thing on the list
@@ -405,14 +425,27 @@
 			if(this.print_stats) {
 				with(this.io->submit_q.stats, this.io->completion_q.stats) {
+					double lavgv = 0;
+					double lavgb = 0;
+					if(look_avg.cnt != 0) {
+						lavgv = ((double)look_avg.val  ) / look_avg.cnt;
+						lavgb = ((double)look_avg.block) / look_avg.cnt;
+					}
+
 					__cfaabi_bits_print_safe( STDERR_FILENO,
 						"----- I/O uRing Stats -----\n"
-						"- total submit calls  : %'15llu\n"
-						"- avg submit          : %'18.2lf\n"
-						"- pre-submit block %%  : %'18.2lf\n"
-						"- total wait calls    : %'15llu   (%'llu slow, %'llu fast)\n"
-						"- avg completion/wait : %'18.2lf\n",
+						"- total submit calls     : %'15llu\n"
+						"- avg submit             : %'18.2lf\n"
+						"- pre-submit block %%     : %'18.2lf\n"
+						"- total ready search     : %'15llu\n"
+						"- avg ready search len   : %'18.2lf\n"
+						"- avg ready search block : %'18.2lf\n"
+						"- total wait calls       : %'15llu   (%'llu slow, %'llu fast)\n"
+						"- avg completion/wait    : %'18.2lf\n",
 						submit_avg.cnt,
 						((double)submit_avg.val) / submit_avg.cnt,
 						(100.0 * submit_avg.block) / submit_avg.cnt,
+						look_avg.cnt,
+						lavgv,
+						lavgb,
 						completed_avg.slow_cnt + completed_avg.fast_cnt,
 						completed_avg.slow_cnt,  completed_avg.fast_cnt,
@@ -441,4 +474,5 @@
 		close(this.io->fd);
 
+		free( this.io->submit_q.ready ); // Maybe null, doesn't matter
 		free( this.io );
 	}
@@ -454,6 +488,36 @@
 	// Process a single completion message from the io_uring
 	// This is NOT thread-safe
-	static int __drain_io( struct __io_data & ring, sigset_t * mask, int waitcnt, bool in_kernel ) {
-		int ret = syscall( __NR_io_uring_enter, ring.fd, 0, waitcnt, IORING_ENTER_GETEVENTS, mask, _NSIG / 8);
+	static [int, bool] __drain_io( & struct __io_data ring, * sigset_t mask, int waitcnt, bool in_kernel ) {
+		unsigned to_submit = 0;
+		if( ring.cltr_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
+
+			// If the poller thread also submits, then we need to aggregate the submissions which are ready
+			uint32_t * tail = ring.submit_q.tail;
+			const uint32_t mask = *ring.submit_q.mask;
+
+			// Go through the list of ready submissions
+			for( i; ring.submit_q.ready_cnt ) {
+				// replace any submission with the sentinel, to consume it.
+				uint32_t idx = __atomic_exchange_n( &ring.submit_q.ready[i], -1ul32, __ATOMIC_RELAXED);
+
+				// If it was already the sentinel, then we are done
+				if( idx == -1ul32 ) continue;
+
+				// If we got a real submission, append it to the list
+				ring.submit_q.array[ ((*tail) + to_submit) & mask ] = idx & mask;
+				to_submit++;
+			}
+
+			// Increment the tail based on how many we are ready to submit
+			__atomic_fetch_add(tail, to_submit, __ATOMIC_SEQ_CST);
+
+			// update statistics
+			#if !defined(__CFA_NO_STATISTICS__)
+				ring.submit_q.stats.submit_avg.val += to_submit;
+				ring.submit_q.stats.submit_avg.cnt += 1;
+			#endif
+		}
+
+		int ret = syscall( __NR_io_uring_enter, ring.fd, to_submit, waitcnt, IORING_ENTER_GETEVENTS, mask, _NSIG / 8);
 		if( ret < 0 ) {
 			switch((int)errno) {
@@ -497,5 +561,5 @@
 		__atomic_fetch_add( ring.completion_q.head, count, __ATOMIC_RELAXED );
 
-		return count;
+		return [count, count > 0 || to_submit > 0];
 	}
 
@@ -519,7 +583,14 @@
 		if( ring.cltr_flags & CFA_CLUSTER_IO_POLLER_USER_THREAD ) {
 			while(!__atomic_load_n(&ring.done, __ATOMIC_SEQ_CST)) {
+
+				__atomic_store_n( &ring.poller.slow.blocked, true, __ATOMIC_SEQ_CST );
+
 				// In the user-thread approach drain and if anything was drained,
 				// batton pass to the user-thread
-				int count = __drain_io( ring, &mask, 1, true );
+				int count;
+				bool again;
+				[count, again] = __drain_io( ring, &mask, 1, true );
+
+				__atomic_store_n( &ring.poller.slow.blocked, false, __ATOMIC_SEQ_CST );
 
 				// Update statistics
@@ -529,5 +600,5 @@
 				#endif
 
-				if(count > 0) {
+				if(again) {
 					__cfadbg_print_safe(io_core, "Kernel I/O : Moving to ring %p to fast poller\n", &ring);
 					__unpark( &ring.poller.fast.thrd __cfaabi_dbg_ctx2 );
@@ -539,5 +610,7 @@
 			while(!__atomic_load_n(&ring.done, __ATOMIC_SEQ_CST)) {
 				//In the naive approach, just poll the io completion queue directly
-				int count = __drain_io( ring, &mask, 1, true );
+				int count;
+				bool again;
+				[count, again] = __drain_io( ring, &mask, 1, true );
 
 				// Update statistics
@@ -566,8 +639,11 @@
 		// Then loop until we need to start
 		while(!__atomic_load_n(&this.ring->done, __ATOMIC_SEQ_CST)) {
+
 			// Drain the io
-			this.waiting = false;
-			int count = __drain_io( *this.ring, 0p, 0, false );
-			reset += count > 0 ? 1 : 0;
+			int count;
+			bool again;
+			[count, again] = __drain_io( *this.ring, 0p, 0, false );
+
+			if(!again) reset++;
 
 			// Update statistics
@@ -577,19 +653,30 @@
 			#endif
 
-			this.waiting = true;
+			// If we got something, just yield and check again
 			if(reset < 5) {
-				// If we got something, just yield and check again
 				yield();
 			}
+			// We didn't get anything; baton-pass to the slow poller
 			else {
-				// We didn't get anything baton pass to the slow poller
 				__cfadbg_print_safe(io_core, "Kernel I/O : Moving to ring %p to slow poller\n", &this.ring);
+				reset = 0;
+
+				// wake up the slow poller
 				post( this.ring->poller.sem );
+
+				// park this thread
 				park( __cfaabi_dbg_ctx );
-				reset = 0;
 			}
 		}
 
 		__cfadbg_print_safe(io_core, "Kernel I/O : Fast poller for ring %p stopping\n", &this.ring);
+	}
+
+	static inline void __wake_poller( struct __io_data & ring ) __attribute__((artificial));
+	static inline void __wake_poller( struct __io_data & ring ) {
+		if(!__atomic_load_n( &ring.poller.slow.blocked, __ATOMIC_SEQ_CST)) return;
+
+		sigval val = { 1 };
+		pthread_sigqueue( ring.poller.slow.kthrd, SIGUSR1, val );
 	}
 
@@ -632,45 +719,84 @@
 		uint32_t idx = __atomic_fetch_add(&ring.submit_q.alloc, 1ul32, __ATOMIC_SEQ_CST);
 
-		// Validate that we didn't overflow anything
-		// Check that nothing overflowed
-		/* paranoid */ verify( true );
-
-		// Check that it goes head -> tail -> alloc and never head -> alloc -> tail
-		/* paranoid */ verify( true );
+		// Mask the idx now to make everything easier to check
+		idx &= *ring.submit_q.mask;
 
 		// Return the sqe
-		return [&ring.submit_q.sqes[ idx & (*ring.submit_q.mask)], idx];
+		return [&ring.submit_q.sqes[ idx ], idx];
 	}
 
 	static inline void __submit( struct __io_data & ring, uint32_t idx ) {
-		// get mutual exclusion
-		lock(ring.submit_q.lock __cfaabi_dbg_ctx2);
-
-		// Append to the list of ready entries
-		uint32_t * tail = ring.submit_q.tail;
+		// Get the data we definitely need now
+		uint32_t * const tail = ring.submit_q.tail;
 		const uint32_t mask = *ring.submit_q.mask;
 
-		ring.submit_q.array[ (*tail) & mask ] = idx & mask;
-		__atomic_fetch_add(tail, 1ul32, __ATOMIC_SEQ_CST);
-
-		// Submit however, many entries need to be submitted
-		int ret = syscall( __NR_io_uring_enter, ring.fd, 1, 0, 0, 0p, 0);
-		if( ret < 0 ) {
-			switch((int)errno) {
-			default:
-				abort( "KERNEL ERROR: IO_URING SUBMIT - %s\n", strerror(errno) );
-			}
-		}
-
-		// update statistics
-		#if !defined(__CFA_NO_STATISTICS__)
-			ring.submit_q.stats.submit_avg.val += 1;
-			ring.submit_q.stats.submit_avg.cnt += 1;
-		#endif
-
-		unlock(ring.submit_q.lock);
-		// Make sure that idx was submitted
-		// Be careful to not get false positive if we cycled the entire list or that someone else submitted for us
-		__cfadbg_print_safe( io, "Kernel I/O : Performed io_submit for %p, returned %d\n", active_thread(), ret );
+		// There are 2 submission schemes, check which one we are using
+		if( ring.cltr_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
+			// If the poller thread submits, then we just need to add this to the ready array
+
+			/* paranoid */ verify( idx <= mask   );
+			/* paranoid */ verify( idx != -1ul32 );
+
+			// We need to find a spot in the ready array
+			__attribute((unused)) int len   = 0;
+			__attribute((unused)) int block = 0;
+			uint32_t expected = -1ul32;
+			uint32_t ready_mask = ring.submit_q.ready_cnt - 1;
+			uint32_t off = __tls_rand();
+			LOOKING: for() {
+				for(i; ring.submit_q.ready_cnt) {
+					uint32_t ii = (i + off) & ready_mask;
+					if( __atomic_compare_exchange_n( &ring.submit_q.ready[ii], &expected, idx, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED ) ) {
+						break LOOKING;
+					}
+
+					len ++;
+				}
+
+				block++;
+				yield();
+			}
+
+			__wake_poller( ring );
+
+			// update statistics
+			#if !defined(__CFA_NO_STATISTICS__)
+				__atomic_fetch_add( &ring.submit_q.stats.look_avg.val,   len,   __ATOMIC_RELAXED );
+				__atomic_fetch_add( &ring.submit_q.stats.look_avg.block, block, __ATOMIC_RELAXED );
+				__atomic_fetch_add( &ring.submit_q.stats.look_avg.cnt,   1,     __ATOMIC_RELAXED );
+			#endif
+
+			__cfadbg_print_safe( io, "Kernel I/O : Added %u to ready for %p\n", idx, active_thread() );
+		}
+		else {
+			// get mutual exclusion
+			lock(ring.submit_q.lock __cfaabi_dbg_ctx2);
+
+			// Append to the list of ready entries
+
+			/* paranoid */ verify( idx <= mask );
+
+			ring.submit_q.array[ (*tail) & mask ] = idx & mask;
+			__atomic_fetch_add(tail, 1ul32, __ATOMIC_SEQ_CST);
+
+			// Submit however, many entries need to be submitted
+			int ret = syscall( __NR_io_uring_enter, ring.fd, 1, 0, 0, 0p, 0);
+			if( ret < 0 ) {
+				switch((int)errno) {
+				default:
+					abort( "KERNEL ERROR: IO_URING SUBMIT - %s\n", strerror(errno) );
+				}
+			}
+
+			// update statistics
+			#if !defined(__CFA_NO_STATISTICS__)
+				ring.submit_q.stats.submit_avg.val += 1;
+				ring.submit_q.stats.submit_avg.cnt += 1;
+			#endif
+
+			unlock(ring.submit_q.lock);
+
+			__cfadbg_print_safe( io, "Kernel I/O : Performed io_submit for %p, returned %d\n", active_thread(), ret );
+		}
 	}
 
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 730f4f153cdc265f2cb9a7c7ff64a8e3c629cb76)
+++ libcfa/src/concurrency/kernel.cfa	(revision 70ac8d0fbe40bd7d5685450882da3c93ffb7a479)
@@ -256,5 +256,5 @@
 }
 
-void ?{}(cluster & this, const char name[], Duration preemption_rate, int io_flags) with( this ) {
+void ?{}(cluster & this, const char name[], Duration preemption_rate, unsigned io_flags) with( this ) {
 	this.name = name;
 	this.preemption_rate = preemption_rate;
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 730f4f153cdc265f2cb9a7c7ff64a8e3c629cb76)
+++ libcfa/src/concurrency/kernel.hfa	(revision 70ac8d0fbe40bd7d5685450882da3c93ffb7a479)
@@ -116,6 +116,8 @@
 struct __io_data;
 
-#define CFA_CLUSTER_IO_POLLER_USER_THREAD 1 << 0
-// #define CFA_CLUSTER_IO_POLLER_KERNEL_SIDE 1 << 1
+#define CFA_CLUSTER_IO_POLLER_USER_THREAD    1 << 0 // 0x1
+#define CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS 1 << 1 // 0x2
+// #define CFA_CLUSTER_IO_POLLER_KERNEL_SIDE 1 << 2 // 0x4
+#define CFA_CLUSTER_IO_BUFFLEN_OFFSET        16
 
 //-----------------------------------------------------------------------------
@@ -159,13 +161,13 @@
 extern Duration default_preemption();
 
-void ?{} (cluster & this, const char name[], Duration preemption_rate, int flags);
+void ?{} (cluster & this, const char name[], Duration preemption_rate, unsigned flags);
 void ^?{}(cluster & this);
 
-static inline void ?{} (cluster & this)                                      { this{"Anonymous Cluster", default_preemption(), 0}; }
-static inline void ?{} (cluster & this, Duration preemption_rate)            { this{"Anonymous Cluster", preemption_rate, 0}; }
-static inline void ?{} (cluster & this, const char name[])                   { this{name, default_preemption(), 0}; }
-static inline void ?{} (cluster & this, int flags)                           { this{"Anonymous Cluster", default_preemption(), flags}; }
-static inline void ?{} (cluster & this, Duration preemption_rate, int flags) { this{"Anonymous Cluster", preemption_rate, flags}; }
-static inline void ?{} (cluster & this, const char name[], int flags)        { this{name, default_preemption(), flags}; }
+static inline void ?{} (cluster & this)                                           { this{"Anonymous Cluster", default_preemption(), 0}; }
+static inline void ?{} (cluster & this, Duration preemption_rate)                 { this{"Anonymous Cluster", preemption_rate, 0}; }
+static inline void ?{} (cluster & this, const char name[])                        { this{name, default_preemption(), 0}; }
+static inline void ?{} (cluster & this, unsigned flags)                           { this{"Anonymous Cluster", default_preemption(), flags}; }
+static inline void ?{} (cluster & this, Duration preemption_rate, unsigned flags) { this{"Anonymous Cluster", preemption_rate, flags}; }
+static inline void ?{} (cluster & this, const char name[], unsigned flags)        { this{name, default_preemption(), flags}; }
 
 static inline [cluster *&, cluster *& ] __get( cluster & this ) __attribute__((const)) { return this.node.[next, prev]; }
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 730f4f153cdc265f2cb9a7c7ff64a8e3c629cb76)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision 70ac8d0fbe40bd7d5685450882da3c93ffb7a479)
@@ -77,5 +77,5 @@
 //-----------------------------------------------------------------------------
 // I/O
-void __kernel_io_startup     ( cluster &, int, bool );
+void __kernel_io_startup     ( cluster &, unsigned, bool );
 void __kernel_io_finish_start( cluster & );
 void __kernel_io_prepare_stop( cluster & );
