Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision d47349b1ab1eec2bf6e908188329e1cc8dcfbb60)
+++ libcfa/src/concurrency/io.cfa	(revision 6f121b8319d9f9ac95e57b58eb4e9e0faddb194e)
@@ -124,5 +124,4 @@
 
 		// Like head/tail but not seen by the kernel
-		volatile uint32_t alloc;
 		volatile uint32_t * ready;
 		uint32_t ready_cnt;
@@ -141,7 +140,8 @@
 			struct {
 				struct {
-					volatile unsigned long long int val;
+					volatile unsigned long long int rdy;
+					volatile unsigned long long int csm;
+					volatile unsigned long long int avl;
 					volatile unsigned long long int cnt;
-					volatile unsigned long long int block;
 				} submit_avg;
 				struct {
@@ -150,4 +150,9 @@
 					volatile unsigned long long int block;
 				} look_avg;
+				struct {
+					volatile unsigned long long int val;
+					volatile unsigned long long int cnt;
+					volatile unsigned long long int block;
+				} alloc_avg;
 			} stats;
 		#endif
@@ -279,5 +284,11 @@
 		sq.dropped = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.dropped);
 		sq.array   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);
-		sq.alloc = *sq.tail;
+
+		{
+			const uint32_t num = *sq.num;
+			for( i; num ) {
+				sq.sqes[i].user_data = 0ul64;
+			}
+		}
 
 		if( io_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
@@ -322,10 +333,14 @@
 		// Initialize statistics
 		#if !defined(__CFA_NO_STATISTICS__)
-			this.io->submit_q.stats.submit_avg.val   = 0;
-			this.io->submit_q.stats.submit_avg.cnt   = 0;
-			this.io->submit_q.stats.submit_avg.block = 0;
+			this.io->submit_q.stats.submit_avg.rdy = 0;
+			this.io->submit_q.stats.submit_avg.csm = 0;
+			this.io->submit_q.stats.submit_avg.avl = 0;
+			this.io->submit_q.stats.submit_avg.cnt = 0;
 			this.io->submit_q.stats.look_avg.val   = 0;
 			this.io->submit_q.stats.look_avg.cnt   = 0;
 			this.io->submit_q.stats.look_avg.block = 0;
+			this.io->submit_q.stats.alloc_avg.val   = 0;
+			this.io->submit_q.stats.alloc_avg.cnt   = 0;
+			this.io->submit_q.stats.alloc_avg.block = 0;
 			this.io->completion_q.stats.completed_avg.val = 0;
 			this.io->completion_q.stats.completed_avg.slow_cnt = 0;
@@ -383,4 +398,5 @@
 					this.ready_queue.head = 1p;
 					thrd.next = 0p;
+					__cfaabi_dbg_debug_do( thrd.unpark_stale = true );
 
 					// Fixup the thread state
@@ -425,4 +441,8 @@
 			if(this.print_stats) {
 				with(this.io->submit_q.stats, this.io->completion_q.stats) {
+					double avgrdy = ((double)submit_avg.rdy) / submit_avg.cnt;
+					double avgcsm = ((double)submit_avg.csm) / submit_avg.cnt;
+					double avgavl = ((double)submit_avg.avl) / submit_avg.cnt;
+
 					double lavgv = 0;
 					double lavgb = 0;
@@ -432,20 +452,35 @@
 					}
 
+					double aavgv = 0;
+					double aavgb = 0;
+					if(alloc_avg.cnt != 0) {
+						aavgv = ((double)alloc_avg.val  ) / alloc_avg.cnt;
+						aavgb = ((double)alloc_avg.block) / alloc_avg.cnt;
+					}
+
 					__cfaabi_bits_print_safe( STDOUT_FILENO,
 						"----- I/O uRing Stats -----\n"
 						"- total submit calls     : %'15llu\n"
-						"- avg submit             : %'18.2lf\n"
-						"- pre-submit block %%     : %'18.2lf\n"
+						"- avg ready entries      : %'18.2lf\n"
+						"- avg submitted entries  : %'18.2lf\n"
+						"- avg available entries  : %'18.2lf\n"
 						"- total ready search     : %'15llu\n"
 						"- avg ready search len   : %'18.2lf\n"
 						"- avg ready search block : %'18.2lf\n"
+						"- total alloc search     : %'15llu\n"
+						"- avg alloc search len   : %'18.2lf\n"
+						"- avg alloc search block : %'18.2lf\n"
 						"- total wait calls       : %'15llu   (%'llu slow, %'llu fast)\n"
 						"- avg completion/wait    : %'18.2lf\n",
 						submit_avg.cnt,
-						((double)submit_avg.val) / submit_avg.cnt,
-						(100.0 * submit_avg.block) / submit_avg.cnt,
+						avgrdy,
+						avgcsm,
+						avgavl,
 						look_avg.cnt,
 						lavgv,
 						lavgb,
+						alloc_avg.cnt,
+						aavgv,
+						aavgb,
 						completed_avg.slow_cnt + completed_avg.fast_cnt,
 						completed_avg.slow_cnt,  completed_avg.fast_cnt,
@@ -493,5 +528,5 @@
 
 			// If the poller thread also submits, then we need to aggregate the submissions which are ready
-			uint32_t * tail = ring.submit_q.tail;
+			uint32_t tail = *ring.submit_q.tail;
 			const uint32_t mask = *ring.submit_q.mask;
 
@@ -505,18 +540,14 @@
 
 				// If we got a real submission, append it to the list
-				ring.submit_q.array[ ((*tail) + to_submit) & mask ] = idx & mask;
+				ring.submit_q.array[ (tail + to_submit) & mask ] = idx & mask;
 				to_submit++;
 			}
 
 			// Increment the tail based on how many we are ready to submit
-			__atomic_fetch_add(tail, to_submit, __ATOMIC_SEQ_CST);
-
-			// update statistics
-			#if !defined(__CFA_NO_STATISTICS__)
-				ring.submit_q.stats.submit_avg.val += to_submit;
-				ring.submit_q.stats.submit_avg.cnt += 1;
-			#endif
-		}
-
+			__atomic_fetch_add(ring.submit_q.tail, to_submit, __ATOMIC_SEQ_CST);
+		}
+
+		const uint32_t smask = *ring.submit_q.mask;
+		uint32_t shead = *ring.submit_q.head;
 		int ret = syscall( __NR_io_uring_enter, ring.fd, to_submit, waitcnt, IORING_ENTER_GETEVENTS, mask, _NSIG / 8);
 		if( ret < 0 ) {
@@ -530,7 +561,33 @@
 		}
 
+		verify( (shead + ret) == *ring.submit_q.head );
+
+		// Release the consumed SQEs
+		for( i; ret ) {
+			uint32_t idx = ring.submit_q.array[ (i + shead) & smask ];
+			ring.submit_q.sqes[ idx ].user_data = 0;
+		}
+
+		uint32_t avail = 0;
+		uint32_t sqe_num = *ring.submit_q.num;
+		for(i; sqe_num) {
+			if( ring.submit_q.sqes[ i ].user_data == 0 ) avail++;
+		}
+
+		// update statistics
+		#if !defined(__CFA_NO_STATISTICS__)
+			ring.submit_q.stats.submit_avg.rdy += to_submit;
+			ring.submit_q.stats.submit_avg.csm += ret;
+			ring.submit_q.stats.submit_avg.avl += avail;
+			ring.submit_q.stats.submit_avg.cnt += 1;
+		#endif
+
 		// Drain the queue
 		unsigned head = *ring.completion_q.head;
-		unsigned tail = __atomic_load_n(ring.completion_q.tail, __ATOMIC_ACQUIRE);
+		unsigned tail = *ring.completion_q.tail;
+		const uint32_t mask = *ring.completion_q.mask;
+
+		// Memory barrier
+		__atomic_thread_fence( __ATOMIC_SEQ_CST );
 
 		// Nothing was new return 0
@@ -541,5 +598,5 @@
 		uint32_t count = tail - head;
 		for(i; count) {
-			unsigned idx = (head + i) & (*ring.completion_q.mask);
+			unsigned idx = (head + i) & mask;
 			struct io_uring_cqe & cqe = ring.completion_q.cqes[idx];
 
@@ -555,8 +612,9 @@
 
 		// Allow new submissions to happen
-		V(ring.submit, count);
+		// V(ring.submit, count);
 
 		// Mark to the kernel that the cqe has been seen
 		// Ensure that the kernel only sees the new value of the head index after the CQEs have been read.
+		__atomic_thread_fence( __ATOMIC_SEQ_CST );
 		__atomic_fetch_add( ring.completion_q.head, count, __ATOMIC_RELAXED );
 
@@ -709,19 +767,44 @@
 //
 
-	static inline [* struct io_uring_sqe, uint32_t] __submit_alloc( struct __io_data & ring ) {
-		// Wait for a spot to be available
-		__attribute__((unused)) bool blocked = P(ring.submit);
-		#if !defined(__CFA_NO_STATISTICS__)
-			__atomic_fetch_add( &ring.submit_q.stats.submit_avg.block, blocked ? 1ul64 : 0ul64, __ATOMIC_RELAXED );
-		#endif
-
-		// Allocate the sqe
-		uint32_t idx = __atomic_fetch_add(&ring.submit_q.alloc, 1ul32, __ATOMIC_SEQ_CST);
-
-		// Mask the idx now to allow make everything easier to check
-		idx &= *ring.submit_q.mask;
-
-		// Return the sqe
-		return [&ring.submit_q.sqes[ idx ], idx];
+	static inline [* struct io_uring_sqe, uint32_t] __submit_alloc( struct __io_data & ring, uint64_t data ) {
+		verify( data != 0 );
+
+		// Prepare the data we need
+		__attribute((unused)) int len   = 0;
+		__attribute((unused)) int block = 0;
+		uint32_t cnt = *ring.submit_q.num;
+		uint32_t mask = *ring.submit_q.mask;
+		uint32_t off = __tls_rand();
+
+		// Loop around looking for an available spot
+		LOOKING: for() {
+			// Look through the list starting at some offset
+			for(i; cnt) {
+				uint64_t expected = 0;
+				uint32_t idx = (i + off) & mask;
+				struct io_uring_sqe * sqe = &ring.submit_q.sqes[idx];
+				volatile uint64_t * udata = &sqe->user_data;
+
+				if( *udata == expected &&
+					__atomic_compare_exchange_n( udata, &expected, data, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED ) )
+				{
+					// update statistics
+					#if !defined(__CFA_NO_STATISTICS__)
+						__atomic_fetch_add( &ring.submit_q.stats.alloc_avg.val,   len,   __ATOMIC_RELAXED );
+						__atomic_fetch_add( &ring.submit_q.stats.alloc_avg.block, block, __ATOMIC_RELAXED );
+						__atomic_fetch_add( &ring.submit_q.stats.alloc_avg.cnt,   1,     __ATOMIC_RELAXED );
+					#endif
+
+					// Success, return the data
+					return [sqe, idx];
+				}
+				verify(expected != data);
+
+				len ++;
+			}
+
+			block++;
+			yield();
+		}
 	}
 
@@ -741,5 +824,4 @@
 			__attribute((unused)) int len   = 0;
 			__attribute((unused)) int block = 0;
-			uint32_t expected = -1ul32;
 			uint32_t ready_mask = ring.submit_q.ready_cnt - 1;
 			uint32_t off = __tls_rand();
@@ -747,7 +829,9 @@
 				for(i; ring.submit_q.ready_cnt) {
 					uint32_t ii = (i + off) & ready_mask;
+					uint32_t expected = -1ul32;
 					if( __atomic_compare_exchange_n( &ring.submit_q.ready[ii], &expected, idx, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED ) ) {
 						break LOOKING;
 					}
+					verify(expected != idx);
 
 					len ++;
@@ -791,5 +875,5 @@
 			// update statistics
 			#if !defined(__CFA_NO_STATISTICS__)
-				ring.submit_q.stats.submit_avg.val += 1;
+				ring.submit_q.stats.submit_avg.csm += 1;
 				ring.submit_q.stats.submit_avg.cnt += 1;
 			#endif
@@ -830,13 +914,13 @@
 
 	#define __submit_prelude \
-		struct __io_data & ring = *active_cluster()->io; \
+		io_user_data data = { 0, active_thread() }; \
+		struct __io_data & ring = *data.thrd->curr_cluster->io; \
 		struct io_uring_sqe * sqe; \
 		uint32_t idx; \
-		[sqe, idx] = __submit_alloc( ring );
+		[sqe, idx] = __submit_alloc( ring, (uint64_t)&data );
 
 	#define __submit_wait \
-		io_user_data data = { 0, active_thread() }; \
 		/*__cfaabi_bits_print_safe( STDERR_FILENO, "Preparing user data %p for %p\n", &data, data.thrd );*/ \
-		sqe->user_data = (uint64_t)&data; \
+		verify( sqe->user_data == (uint64_t)&data ); \
 		__submit( ring, idx ); \
 		park( __cfaabi_dbg_ctx ); \
