Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision 8e9e9a2d7a14bd29799dcae8c81b6a96b111f6b3)
+++ libcfa/src/concurrency/io.cfa	(revision e46c753cfba891c12d977b0252b3c997702db5a5)
@@ -182,4 +182,8 @@
 //=============================================================================================
 	void __kernel_io_startup( cluster & this, unsigned io_flags, bool main_cluster ) {
+		if( (io_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS) && (io_flags & CFA_CLUSTER_IO_EAGER_SUBMITS) ) {
+			abort("CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS and CFA_CLUSTER_IO_EAGER_SUBMITS cannot be mixed\n");
+		}
+
 		this.io = malloc();
 
@@ -261,5 +265,5 @@
 		}
 
-		if( io_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
+		if( io_flags & ( CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS | CFA_CLUSTER_IO_EAGER_SUBMITS ) ) {
 			/* paranoid */ verify( is_pow2( io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET ) || ((io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET) < 8)  );
 			sq.ready_cnt = max(io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET, 8);
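
For reference, the io_flags word packs both the submission policy and the requested ready-array length. Below is a minimal sketch in plain C of the check added above, using the constants from kernel.hfa; validate_io_flags itself is a hypothetical helper, not part of the library.

#include <stdbool.h>
#include <stdint.h>

#define CFA_CLUSTER_IO_POLLER_USER_THREAD    (1u << 0) // 0x1
#define CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS (1u << 1) // 0x2
#define CFA_CLUSTER_IO_EAGER_SUBMITS         (1u << 2) // 0x4
#define CFA_CLUSTER_IO_BUFFLEN_OFFSET        16

// Hypothetical helper: reject mixed submission policies and derive the
// ready-array length the same way the startup code above does.
static bool validate_io_flags( unsigned io_flags, uint32_t * ready_cnt ) {
	// The poller-submits and eager-submits policies are mutually exclusive.
	if( (io_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS)
	 && (io_flags & CFA_CLUSTER_IO_EAGER_SUBMITS) ) return false;

	// The bits above CFA_CLUSTER_IO_BUFFLEN_OFFSET request the ready-array
	// length, floored at 8 slots; the verify above expects the requested
	// length to be a power of two or small enough to be floored anyway.
	uint32_t requested = io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET;
	*ready_cnt = requested > 8 ? requested : 8;
	return true;
}

Under this encoding, a caller wanting eager submits with a 32-slot ready array would pass CFA_CLUSTER_IO_EAGER_SUBMITS | (32 << CFA_CLUSTER_IO_BUFFLEN_OFFSET) as io_flags.
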
@@ -423,27 +427,12 @@
 	// Process a single completion message from the io_uring
 	// This is NOT thread-safe
+	static unsigned __collect_submitions( struct __io_data & ring );
 	static [int, bool] __drain_io( & struct __io_data ring, * sigset_t mask, int waitcnt, bool in_kernel ) {
+		/* paranoid */ verify( !kernelTLS.preemption_state.enabled );
+
 		unsigned to_submit = 0;
 		if( ring.cltr_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
-
 			// If the poller thread also submits, then we need to aggregate the submissions which are ready
-			uint32_t tail = *ring.submit_q.tail;
-			const uint32_t mask = *ring.submit_q.mask;
-
-			// Go through the list of ready submissions
-			for( i; ring.submit_q.ready_cnt ) {
-				// replace any submission with the sentinel, to consume it.
-				uint32_t idx = __atomic_exchange_n( &ring.submit_q.ready[i], -1ul32, __ATOMIC_RELAXED);
-
-				// If it was already the sentinel, then we are done
-				if( idx == -1ul32 ) continue;
-
-				// If we got a real submission, append it to the list
-				ring.submit_q.array[ (tail + to_submit) & mask ] = idx & mask;
-				to_submit++;
-			}
-
-			// Increment the tail based on how many we are ready to submit
-			__atomic_fetch_add(ring.submit_q.tail, to_submit, __ATOMIC_SEQ_CST);
+			to_submit = __collect_submitions( ring );
 		}
 
@@ -455,5 +444,5 @@
 			case EAGAIN:
 			case EINTR:
-				return -EAGAIN;
+				return [0, true];
 			default:
 				abort( "KERNEL ERROR: IO_URING WAIT - %s\n", strerror(errno) );
@@ -467,16 +456,11 @@
 		}
 
-		uint32_t avail = 0;
-		uint32_t sqe_num = *ring.submit_q.num;
-		for(i; sqe_num) {
-			if( ring.submit_q.sqes[ i ].user_data == 0 ) avail++;
-		}
-
 		// update statistics
 		#if !defined(__CFA_NO_STATISTICS__)
-			__tls_stats()->io.submit_q.submit_avg.rdy += to_submit;
-			__tls_stats()->io.submit_q.submit_avg.csm += ret;
-			__tls_stats()->io.submit_q.submit_avg.avl += avail;
-			__tls_stats()->io.submit_q.submit_avg.cnt += 1;
+			if( to_submit > 0 ) {
+				__tls_stats()->io.submit_q.submit_avg.rdy += to_submit;
+				__tls_stats()->io.submit_q.submit_avg.csm += ret;
+				__tls_stats()->io.submit_q.submit_avg.cnt += 1;
+			}
 		#endif
 
@@ -491,5 +475,5 @@
 		// Nothing was new, return
 		if (head == tail) {
-			return 0;
+			return [0, to_submit > 0];
 		}
 
@@ -576,5 +560,5 @@
 				int count;
 				bool again;
-				[count, again] = __drain_io( ring, &mask, 0, true );
+				[count, again] = __drain_io( ring, &mask, 1, true );
 
 				// Update statistics
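
The waitcnt argument is presumably forwarded to io_uring_enter as min_complete, so switching it from 0 to 1 lets the slow poller block until at least one completion is available instead of returning immediately. A sketch of the raw syscall as the kernel expects it is below; the wrapper name is illustrative, the actual call inside __drain_io is outside this hunk, and the sketch assumes IORING_ENTER_GETEVENTS is set, which is required for min_complete to be honoured.

#include <linux/io_uring.h>
#include <signal.h>
#include <sys/syscall.h>
#include <unistd.h>

// min_complete corresponds to the 'waitcnt' argument of __drain_io.
static long raw_io_uring_enter( int fd, unsigned to_submit, unsigned min_complete,
                                const sigset_t * mask ) {
	return syscall( __NR_io_uring_enter, fd, to_submit, min_complete,
	                IORING_ENTER_GETEVENTS, mask, _NSIG / 8 );
}
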
@@ -658,20 +642,12 @@
 
 // Submission steps :
-// 1 - We need to make sure we don't overflow any of the buffer, P(ring.submit) to make sure
-//     entries are available. The semaphore make sure that there is no more operations in
-//     progress then the number of entries in the buffer. This probably limits concurrency
-//     more than necessary since submitted but not completed operations don't need any
-//     entries in user space. However, I don't know what happens if we overflow the buffers
-//     because too many requests completed at once. This is a safe approach in all cases.
-//     Furthermore, with hundreds of entries, this may be okay.
-//
-// 2 - Allocate a queue entry. The ring already has memory for all entries but only the ones
+// 1 - Allocate a queue entry. The ring already has memory for all entries but only the ones
 //     listed in sq.array are visible to the kernel. For those not listed, the kernel does not
 //     offer any assurance that an entry is not being filled by multiple threads. Therefore, we
 //     need to write an allocator that allows allocating concurrently.
 //
-// 3 - Actually fill the submit entry, this is the only simple and straightforward step.
+// 2 - Actually fill the submit entry; this is the only simple and straightforward step.
 //
-// 4 - Append the entry index to the array and adjust the tail accordingly. This operation
+// 3 - Append the entry index to the array and adjust the tail accordingly. This operation
 //     needs to reach two points of consensus at the same time:
 //     A - The order in which entries are listed in the array: no two threads must pick the
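
As a companion to the numbered steps above, here is a toy, single-publisher version of step 3 in plain C11; the toy_ring type is illustrative, not the real __io_data layout. The index is written into the array first and only then made visible by advancing the tail, which is essentially what __collect_submitions does once it has gathered the ready submissions. The hard part the comment alludes to is doing this with many concurrent publishers, which is why the poller-submits and eager-submits paths funnel publication through the ready array instead.

#include <stdatomic.h>
#include <stdint.h>

// Toy ring, illustrative only.
struct toy_ring {
	uint32_t         array[64];   // indices visible to the kernel
	_Atomic uint32_t tail;        // number of published entries
	uint32_t         mask;        // 63
};

// Step 3 with a single publisher: store the index, then publish it by
// advancing the tail with release semantics so a consumer never sees the
// tail move before the slot is written.
static void toy_publish( struct toy_ring * ring, uint32_t idx ) {
	uint32_t tail = atomic_load_explicit( &ring->tail, memory_order_relaxed );
	ring->array[ tail & ring->mask ] = idx;
	atomic_fetch_add_explicit( &ring->tail, 1, memory_order_release );
}
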
@@ -682,6 +658,5 @@
 
 	[* struct io_uring_sqe, uint32_t] __submit_alloc( struct __io_data & ring, uint64_t data ) {
-		verify( data != 0 );
-
+		/* paranoid */ verify( data != 0 );
 
 		// Prepare the data we need
@@ -762,9 +737,9 @@
 		// update statistics
 		#if !defined(__CFA_NO_STATISTICS__)
-		disable_interrupts();
-			__tls_stats()->io.submit_q.look_avg.val   += len;
-			__tls_stats()->io.submit_q.look_avg.block += block;
-			__tls_stats()->io.submit_q.look_avg.cnt   += 1;
-		enable_interrupts( __cfaabi_dbg_ctx );
+			disable_interrupts();
+				__tls_stats()->io.submit_q.look_avg.val   += len;
+				__tls_stats()->io.submit_q.look_avg.block += block;
+				__tls_stats()->io.submit_q.look_avg.cnt   += 1;
+			enable_interrupts( __cfaabi_dbg_ctx );
 		#endif
 
@@ -780,5 +755,4 @@
 		if( ring.cltr_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
 			// If the poller thread submits, then we just need to add this to the ready array
-
 			__submit_to_ready_array( ring, idx, mask );
 
@@ -786,4 +760,64 @@
 
 			__cfadbg_print_safe( io, "Kernel I/O : Added %u to ready for %p\n", idx, active_thread() );
+		}
+		else if( ring.cltr_flags & CFA_CLUSTER_IO_EAGER_SUBMITS ) {
+			uint32_t picked = __submit_to_ready_array( ring, idx, mask );
+
+			for() {
+				yield();
+
+				// If someone else collected our index, we are done
+				if( ring.submit_q.ready[picked] != idx ) {
+					#if !defined(__CFA_NO_STATISTICS__)
+						disable_interrupts();
+							__tls_stats()->io.submit_q.helped += 1;
+						enable_interrupts( __cfaabi_dbg_ctx );
+					#endif
+					return;
+				}
+
+				if( try_lock(ring.submit_q.lock __cfaabi_dbg_ctx2) ) {
+					#if !defined(__CFA_NO_STATISTICS__)
+						__tls_stats()->io.submit_q.leader += 1;
+					#endif
+					break;
+				}
+			}
+
+			// We got the lock
+			unsigned to_submit = __collect_submitions( ring );
+			// /* paranoid */ verify( to_submit > 0 );
+			if( to_submit == 0 ) abort();
+
+			const uint32_t smask = *ring.submit_q.mask;
+			uint32_t shead = *ring.submit_q.head;
+			int ret = syscall( __NR_io_uring_enter, ring.fd, to_submit, 0, 0, 0p, _NSIG / 8);
+			if( ret < 0 ) {
+				switch((int)errno) {
+				case EAGAIN:
+				case EINTR:
+					unlock(ring.submit_q.lock);
+					return;
+				default:
+					abort( "KERNEL ERROR: IO_URING WAIT - %s\n", strerror(errno) );
+				}
+			}
+
+			/* paranoid */ verify( ret > 0 );
+
+			// Release the consumed SQEs
+			for( i; ret ) {
+				uint32_t idx = ring.submit_q.array[ (i + shead) & smask ];
+				ring.submit_q.sqes[ idx ].user_data = 0;
+			}
+
+			// update statistics
+			#if !defined(__CFA_NO_STATISTICS__)
+				__tls_stats()->io.submit_q.submit_avg.rdy += to_submit;
+				__tls_stats()->io.submit_q.submit_avg.csm += ret;
+				__tls_stats()->io.submit_q.submit_avg.cnt += 1;
+			#endif
+
+			unlock(ring.submit_q.lock);
 		}
 		else {
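
The eager-submit branch above is a helping pattern: every submitter publishes its index to the ready array, then loops (yielding) until either another thread flushes its entry for it ("helped") or it wins the submit lock and flushes everyone's entries itself ("leader"). Below is a minimal sketch of that shape in plain C, with a pthread mutex and C11 atomics standing in for the CFA lock and ready array; flush_all, eager_submit and NSLOTS are illustrative names, and the real leader also calls io_uring_enter and clears the consumed SQEs' user_data.

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdint.h>

#define NSLOTS   8
#define SENTINEL UINT32_MAX   // mirrors the -1ul32 sentinel used above

static _Atomic uint32_t slots[NSLOTS];   // stand-in for submit_q.ready, assumed initialised to SENTINEL
static pthread_mutex_t  submit_lock = PTHREAD_MUTEX_INITIALIZER;

// Drain every occupied slot; the real code appends each index to the SQ
// array and advances the tail instead of just counting (see __collect_submitions).
static unsigned flush_all( void ) {
	unsigned n = 0;
	for( unsigned i = 0; i < NSLOTS; i++ ) {
		if( atomic_exchange( &slots[i], SENTINEL ) != SENTINEL ) n++;
	}
	return n;
}

// Called after this thread's index 'my_idx' was published into slots[my_slot]
// (the analogue of __submit_to_ready_array returning 'picked').
void eager_submit( uint32_t my_idx, uint32_t my_slot ) {
	for(;;) {
		sched_yield();   // the CFA code yields the user-level thread here

		// Another leader already consumed our slot: we were helped, nothing to do.
		if( atomic_load( &slots[my_slot] ) != my_idx ) return;

		// Otherwise try to become the leader and flush on everyone's behalf.
		if( pthread_mutex_trylock( &submit_lock ) == 0 ) break;
	}

	flush_all();   // includes our own entry; the real leader now calls io_uring_enter
	pthread_mutex_unlock( &submit_lock );
}
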
@@ -809,6 +843,9 @@
 			// update statistics
 			#if !defined(__CFA_NO_STATISTICS__)
-				__tls_stats()->io.submit_q.submit_avg.csm += 1;
-				__tls_stats()->io.submit_q.submit_avg.cnt += 1;
+				disable_interrupts();
+					abort();
+					__tls_stats()->io.submit_q.submit_avg.csm += 1;
+					__tls_stats()->io.submit_q.submit_avg.cnt += 1;
+				enable_interrupts( __cfaabi_dbg_ctx );
 			#endif
 
@@ -820,3 +857,30 @@
 		}
 	}
+
+	static unsigned __collect_submitions( struct __io_data & ring ) {
+		/* paranoid */ verify( ring.submit_q.ready != 0p );
+		/* paranoid */ verify( ring.submit_q.ready_cnt > 0 );
+
+		unsigned to_submit = 0;
+		uint32_t tail = *ring.submit_q.tail;
+		const uint32_t mask = *ring.submit_q.mask;
+
+		// Go through the list of ready submissions
+		for( i; ring.submit_q.ready_cnt ) {
+			// replace any submission with the sentinel, to consume it.
+			uint32_t idx = __atomic_exchange_n( &ring.submit_q.ready[i], -1ul32, __ATOMIC_RELAXED);
+
+			// If it was already the sentinel, then we are done
+			if( idx == -1ul32 ) continue;
+
+			// If we got a real submission, append it to the list
+			ring.submit_q.array[ (tail + to_submit) & mask ] = idx & mask;
+			to_submit++;
+		}
+
+		// Increment the tail based on how many we are ready to submit
+		__atomic_fetch_add(ring.submit_q.tail, to_submit, __ATOMIC_SEQ_CST);
+
+		return to_submit;
+	}
 #endif
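
__collect_submitions above is the consumer side of the ready array: it swaps the sentinel into every slot and appends whatever real indices it finds. The producer side, __submit_to_ready_array, is not part of this diff; the following is only a plausible sketch of such a producer in plain C11, claiming the first sentinel slot with a compare-and-swap. The body is an assumption; only the function's role is taken from the call sites above.

#include <sched.h>
#include <stdatomic.h>
#include <stdint.h>

#define SENTINEL UINT32_MAX   // mirrors -1ul32

// Hypothetical producer for the ready array: claim the first free (sentinel)
// slot, return which slot was used, and retry if the array is currently full.
static uint32_t toy_submit_to_ready_array( _Atomic uint32_t * ready, uint32_t ready_cnt,
                                           uint32_t idx ) {
	for(;;) {
		for( uint32_t i = 0; i < ready_cnt; i++ ) {
			uint32_t expected = SENTINEL;
			if( atomic_compare_exchange_strong( &ready[i], &expected, idx ) ) return i;
		}
		sched_yield();   // all slots taken: wait for a collector to drain them
	}
}
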
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 8e9e9a2d7a14bd29799dcae8c81b6a96b111f6b3)
+++ libcfa/src/concurrency/kernel.hfa	(revision e46c753cfba891c12d977b0252b3c997702db5a5)
@@ -131,5 +131,5 @@
 #define CFA_CLUSTER_IO_POLLER_USER_THREAD    1 << 0 // 0x1
 #define CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS 1 << 1 // 0x2
-// #define CFA_CLUSTER_IO_POLLER_KERNEL_SIDE 1 << 2 // 0x4
+#define CFA_CLUSTER_IO_EAGER_SUBMITS         1 << 2 // 0x4
 #define CFA_CLUSTER_IO_BUFFLEN_OFFSET        16
 
Index: libcfa/src/concurrency/stats.cfa
===================================================================
--- libcfa/src/concurrency/stats.cfa	(revision 8e9e9a2d7a14bd29799dcae8c81b6a96b111f6b3)
+++ libcfa/src/concurrency/stats.cfa	(revision e46c753cfba891c12d977b0252b3c997702db5a5)
@@ -27,5 +27,4 @@
 			stats->io.submit_q.submit_avg.rdy = 0;
 			stats->io.submit_q.submit_avg.csm = 0;
-			stats->io.submit_q.submit_avg.avl = 0;
 			stats->io.submit_q.submit_avg.cnt = 0;
 			stats->io.submit_q.look_avg.val   = 0;
@@ -35,4 +34,6 @@
 			stats->io.submit_q.alloc_avg.cnt   = 0;
 			stats->io.submit_q.alloc_avg.block = 0;
+			stats->io.submit_q.helped = 0;
+			stats->io.submit_q.leader = 0;
 			stats->io.complete_q.completed_avg.val = 0;
 			stats->io.complete_q.completed_avg.slow_cnt = 0;
@@ -68,4 +69,6 @@
 			__atomic_fetch_add( &cltr->io.submit_q.alloc_avg.cnt           , proc->io.submit_q.alloc_avg.cnt           , __ATOMIC_SEQ_CST );
 			__atomic_fetch_add( &cltr->io.submit_q.alloc_avg.block         , proc->io.submit_q.alloc_avg.block         , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.helped                  , proc->io.submit_q.helped                  , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.leader                  , proc->io.submit_q.leader                  , __ATOMIC_SEQ_CST );
 			__atomic_fetch_add( &cltr->io.complete_q.completed_avg.val     , proc->io.complete_q.completed_avg.val     , __ATOMIC_SEQ_CST );
 			__atomic_fetch_add( &cltr->io.complete_q.completed_avg.slow_cnt, proc->io.complete_q.completed_avg.slow_cnt, __ATOMIC_SEQ_CST );
@@ -120,5 +123,4 @@
 				double avgrdy = ((double)io.submit_q.submit_avg.rdy) / io.submit_q.submit_avg.cnt;
 				double avgcsm = ((double)io.submit_q.submit_avg.csm) / io.submit_q.submit_avg.cnt;
-				double avgavl = ((double)io.submit_q.submit_avg.avl) / io.submit_q.submit_avg.cnt;
 
 				double lavgv = 0;
@@ -141,5 +143,6 @@
 					"- avg ready entries      : %'18.2lf\n"
 					"- avg submitted entries  : %'18.2lf\n"
-					"- avg available entries  : %'18.2lf\n"
+					"- total helped entries   : %'15" PRIu64 "\n"
+					"- total leader entries   : %'15" PRIu64 "\n"
 					"- total ready search     : %'15" PRIu64 "\n"
 					"- avg ready search len   : %'18.2lf\n"
@@ -153,5 +156,6 @@
 					, cluster ? "Cluster" : "Processor",  name, id
 					, io.submit_q.submit_avg.cnt
-					, avgrdy, avgcsm, avgavl
+					, avgrdy, avgcsm
+					, io.submit_q.helped, io.submit_q.leader
 					, io.submit_q.look_avg.cnt
 					, lavgv, lavgb
Index: libcfa/src/concurrency/stats.hfa
===================================================================
--- libcfa/src/concurrency/stats.hfa	(revision 8e9e9a2d7a14bd29799dcae8c81b6a96b111f6b3)
+++ libcfa/src/concurrency/stats.hfa	(revision e46c753cfba891c12d977b0252b3c997702db5a5)
@@ -83,4 +83,6 @@
 					volatile uint64_t block;
 				} alloc_avg;
+				volatile uint64_t helped;
+				volatile uint64_t leader;
 			} submit_q;
 			struct {
