Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision bb83b478f3b4399d8d5165aec5f3d29c2ca6514e)
+++ libcfa/src/concurrency/io.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -14,6 +14,8 @@
 //
 
-// #define __CFA_DEBUG_PRINT_IO__
-// #define __CFA_DEBUG_PRINT_IO_CORE__
+#if defined(__CFA_DEBUG__)
+	// #define __CFA_DEBUG_PRINT_IO__
+	#define __CFA_DEBUG_PRINT_IO_CORE__
+#endif
 
 #include "kernel.hfa"
@@ -109,4 +111,5 @@
 		volatile uint32_t * head;
 		volatile uint32_t * tail;
+		volatile uint32_t prev_head;
 
 		// The actual kernel ring which uses head/tail
@@ -129,4 +132,5 @@
 
 		__spinlock_t lock;
+		__spinlock_t release_lock;
 
 		// A buffer of sqes (not the actual ring)
@@ -182,4 +186,8 @@
 //=============================================================================================
 	void __kernel_io_startup( cluster & this, unsigned io_flags, bool main_cluster ) {
+		if( (io_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS) && (io_flags & CFA_CLUSTER_IO_EAGER_SUBMITS) ) {
+			abort("CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS and CFA_CLUSTER_IO_EAGER_SUBMITS cannot be mixed\n");
+		}
+
 		this.io = malloc();
 
@@ -187,4 +195,6 @@
 		struct io_uring_params params;
 		memset(&params, 0, sizeof(params));
+		if( io_flags & CFA_CLUSTER_IO_KERNEL_POLL_SUBMITS   ) params.flags |= IORING_SETUP_SQPOLL;
+		if( io_flags & CFA_CLUSTER_IO_KERNEL_POLL_COMPLETES ) params.flags |= IORING_SETUP_IOPOLL;
 
 		uint32_t nentries = entries_per_cluster();
@@ -208,5 +218,5 @@
 			// adjust the size according to the parameters
 			if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
-				cq->ring_sz = sq->ring_sz = max(cq->ring_sz, sq->ring_sz);
+				cq.ring_sz = sq.ring_sz = max(cq.ring_sz, sq.ring_sz);
 			}
 		#endif
@@ -222,5 +232,5 @@
 			// mmap the Completion Queue into existence (may or may not be needed)
 			if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
-				cq->ring_ptr = sq->ring_ptr;
+				cq.ring_ptr = sq.ring_ptr;
 			}
 			else
@@ -253,4 +263,5 @@
 		sq.dropped = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.dropped);
 		sq.array   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);
+		sq.prev_head = *sq.head;
 
 		{
@@ -261,5 +272,8 @@
 		}
 
-		if( io_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
+		(sq.lock){};
+		(sq.release_lock){};
+
+		if( io_flags & ( CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS | CFA_CLUSTER_IO_EAGER_SUBMITS ) ) {
 			/* paranoid */ verify( is_pow2( io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET ) || ((io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET) < 8)  );
 			sq.ready_cnt = max(io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET, 8);
@@ -313,5 +327,5 @@
 
 		// Create the poller thread
-		__cfadbg_print_safe(io_core, "Kernel I/O : Creating slow poller for cluter %p\n", &this);
+		__cfadbg_print_safe(io_core, "Kernel I/O : Creating slow poller for cluster %p\n", &this);
 		this.io->poller.slow.blocked = false;
 		this.io->poller.slow.stack = __create_pthread( &this.io->poller.slow.kthrd, __io_poller_slow, &this );
@@ -418,66 +432,91 @@
 	}
 
+	int __io_uring_enter( struct __io_data & ring, unsigned to_submit, bool get, sigset_t * mask ) {
+		bool need_sys_to_submit = false;   // set when the submission side requires entering the kernel
+		bool need_sys_to_complete = false; // set when the completion side requires entering the kernel
+		unsigned min_complete = 0;         // third argument of the syscall below; raised to 1 when a mask is given
+		unsigned flags = 0;                // IORING_ENTER_* flags accumulated for the syscall
+		// Wrapper around the io_uring_enter syscall which skips the call when neither side needs it.
+		// Returns the syscall result, -1 on EAGAIN/EINTR (any other error aborts), or 0 when no call was made.
+		TO_SUBMIT:
+		if( to_submit > 0 ) {
+			if( !(ring.ring_flags & IORING_SETUP_SQPOLL) ) {
+				need_sys_to_submit = true; // ring was not set up with SQPOLL: submit through the syscall
+				break TO_SUBMIT;           // labelled break: skip the wake-up check below
+			}
+			if( (*ring.submit_q.flags) & IORING_SQ_NEED_WAKEUP ) {
+				need_sys_to_submit = true; // SQPOLL ring whose SQ flags carry NEED_WAKEUP
+				flags |= IORING_ENTER_SQ_WAKEUP;
+			}
+		}
+		// NOTE(review): on an SQPOLL ring both `get` and `mask` are ignored by the condition below — confirm this is intended.
+		TO_COMPLETE:
+		if( get && !(ring.ring_flags & IORING_SETUP_SQPOLL) ) {
+			flags |= IORING_ENTER_GETEVENTS;
+			if( mask ) {
+				need_sys_to_complete = true; // a signal mask was supplied: enter the kernel asking for at least one completion
+				min_complete = 1;
+				break TO_COMPLETE;
+			}
+			if( (ring.ring_flags & IORING_SETUP_IOPOLL) ) {
+				need_sys_to_complete = true; // IOPOLL ring: always enter the kernel when events are requested
+			}
+		}
+		// ret stays 0 when the syscall is skipped
+		int ret = 0;
+		if( need_sys_to_submit || need_sys_to_complete ) {
+			ret = syscall( __NR_io_uring_enter, ring.fd, to_submit, min_complete, flags, mask, _NSIG / 8);
+			if( ret < 0 ) {
+				switch((int)errno) {
+				case EAGAIN:
+				case EINTR:
+					ret = -1; // EAGAIN / EINTR are reported to the caller as -1
+					break;
+				default:
+					abort( "KERNEL ERROR: IO_URING SYSCALL - (%d) %s\n", (int)errno, strerror(errno) );
+				}
+			}
+		}
+
+		// Memory barrier
+		__atomic_thread_fence( __ATOMIC_SEQ_CST );
+		return ret;
+	}
+
 //=============================================================================================
 // I/O Polling
 //=============================================================================================
+	static unsigned __collect_submitions( struct __io_data & ring );
+	static uint32_t __release_consumed_submission( struct __io_data & ring );
+
 	// Process a single completion message from the io_uring
 	// This is NOT thread-safe
-	static [int, bool] __drain_io( & struct __io_data ring, * sigset_t mask, int waitcnt, bool in_kernel ) {
+	static [int, bool] __drain_io( & struct __io_data ring, * sigset_t mask ) {
+		/* paranoid */ verify( !kernelTLS.preemption_state.enabled );
+
 		unsigned to_submit = 0;
 		if( ring.cltr_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
-
 			// If the poller thread also submits, then we need to aggregate the submissions which are ready
-			uint32_t tail = *ring.submit_q.tail;
-			const uint32_t mask = *ring.submit_q.mask;
-
-			// Go through the list of ready submissions
-			for( i; ring.submit_q.ready_cnt ) {
-				// replace any submission with the sentinel, to consume it.
-				uint32_t idx = __atomic_exchange_n( &ring.submit_q.ready[i], -1ul32, __ATOMIC_RELAXED);
-
-				// If it was already the sentinel, then we are done
-				if( idx == -1ul32 ) continue;
-
-				// If we got a real submission, append it to the list
-				ring.submit_q.array[ (tail + to_submit) & mask ] = idx & mask;
-				to_submit++;
-			}
-
-			// Increment the tail based on how many we are ready to submit
-			__atomic_fetch_add(ring.submit_q.tail, to_submit, __ATOMIC_SEQ_CST);
-		}
-
-		const uint32_t smask = *ring.submit_q.mask;
-		uint32_t shead = *ring.submit_q.head;
-		int ret = syscall( __NR_io_uring_enter, ring.fd, to_submit, waitcnt, IORING_ENTER_GETEVENTS, mask, _NSIG / 8);
+			to_submit = __collect_submitions( ring );
+		}
+
+		int ret = __io_uring_enter(ring, to_submit, true, mask);
 		if( ret < 0 ) {
-			switch((int)errno) {
-			case EAGAIN:
-			case EINTR:
-				return -EAGAIN;
-			default:
-				abort( "KERNEL ERROR: IO_URING WAIT - %s\n", strerror(errno) );
-			}
+			return [0, true];
+		}
+
+		// update statistics
+		if (to_submit > 0) {
+			__STATS__( true,
+				if( to_submit > 0 ) {
+					io.submit_q.submit_avg.rdy += to_submit;
+					io.submit_q.submit_avg.csm += ret;
+					io.submit_q.submit_avg.cnt += 1;
+				}
+			)
 		}
 
 		// Release the consumed SQEs
-		for( i; ret ) {
-			uint32_t idx = ring.submit_q.array[ (i + shead) & smask ];
-			ring.submit_q.sqes[ idx ].user_data = 0;
-		}
-
-		uint32_t avail = 0;
-		uint32_t sqe_num = *ring.submit_q.num;
-		for(i; sqe_num) {
-			if( ring.submit_q.sqes[ i ].user_data == 0 ) avail++;
-		}
-
-		// update statistics
-		#if !defined(__CFA_NO_STATISTICS__)
-			__tls_stats()->io.submit_q.submit_avg.rdy += to_submit;
-			__tls_stats()->io.submit_q.submit_avg.csm += ret;
-			__tls_stats()->io.submit_q.submit_avg.avl += avail;
-			__tls_stats()->io.submit_q.submit_avg.cnt += 1;
-		#endif
+		__release_consumed_submission( ring );
 
 		// Drain the queue
@@ -486,13 +525,11 @@
 		const uint32_t mask = *ring.completion_q.mask;
 
-		// Memory barrier
-		__atomic_thread_fence( __ATOMIC_SEQ_CST );
-
 		// Nothing was new return 0
 		if (head == tail) {
-			return 0;
+			return [0, to_submit > 0];
 		}
 
 		uint32_t count = tail - head;
+		/* paranoid */ verify( count != 0 );
 		for(i; count) {
 			unsigned idx = (head + i) & mask;
@@ -505,6 +542,6 @@
 
 			data->result = cqe.res;
-			if(!in_kernel) { unpark( data->thrd __cfaabi_dbg_ctx2 ); }
-			else         { __unpark( &ring.poller.slow.id, data->thrd __cfaabi_dbg_ctx2 ); }
+			if(!mask) { unpark( data->thrd __cfaabi_dbg_ctx2 ); }
+			else      { __unpark( &ring.poller.slow.id, data->thrd __cfaabi_dbg_ctx2 ); }
 		}
 
@@ -554,13 +591,13 @@
 				int count;
 				bool again;
-				[count, again] = __drain_io( ring, &mask, 1, true );
+				[count, again] = __drain_io( ring, &mask );
 
 				__atomic_store_n( &ring.poller.slow.blocked, false, __ATOMIC_SEQ_CST );
 
 				// Update statistics
-				#if !defined(__CFA_NO_STATISTICS__)
-					__tls_stats()->io.complete_q.completed_avg.val += count;
-					__tls_stats()->io.complete_q.completed_avg.slow_cnt += 1;
-				#endif
+				__STATS__( true,
+					io.complete_q.completed_avg.val += count;
+					io.complete_q.completed_avg.slow_cnt += 1;
+				)
 
 				if(again) {
@@ -576,11 +613,11 @@
 				int count;
 				bool again;
-				[count, again] = __drain_io( ring, &mask, 0, true );
+				[count, again] = __drain_io( ring, &mask );
 
 				// Update statistics
-				#if !defined(__CFA_NO_STATISTICS__)
-					__tls_stats()->io.complete_q.completed_avg.val += count;
-					__tls_stats()->io.complete_q.completed_avg.slow_cnt += 1;
-				#endif
+				__STATS__( true,
+					io.complete_q.completed_avg.val += count;
+					io.complete_q.completed_avg.slow_cnt += 1;
+				)
 			}
 		}
@@ -614,13 +651,13 @@
 			bool again;
 			disable_interrupts();
-				[count, again] = __drain_io( *this.ring, 0p, 0, false );
+				[count, again] = __drain_io( *this.ring, 0p );
 
 				if(!again) reset++;
 
 				// Update statistics
-				#if !defined(__CFA_NO_STATISTICS__)
-					__tls_stats()->io.complete_q.completed_avg.val += count;
-					__tls_stats()->io.complete_q.completed_avg.fast_cnt += 1;
-				#endif
+				__STATS__( true,
+					io.complete_q.completed_avg.val += count;
+					io.complete_q.completed_avg.fast_cnt += 1;
+				)
 			enable_interrupts( __cfaabi_dbg_ctx );
 
@@ -658,20 +695,12 @@
 
 // Submition steps :
-// 1 - We need to make sure we don't overflow any of the buffer, P(ring.submit) to make sure
-//     entries are available. The semaphore make sure that there is no more operations in
-//     progress then the number of entries in the buffer. This probably limits concurrency
-//     more than necessary since submitted but not completed operations don't need any
-//     entries in user space. However, I don't know what happens if we overflow the buffers
-//     because too many requests completed at once. This is a safe approach in all cases.
-//     Furthermore, with hundreds of entries, this may be okay.
-//
-// 2 - Allocate a queue entry. The ring already has memory for all entries but only the ones
+// 1 - Allocate a queue entry. The ring already has memory for all entries but only the ones
 //     listed in sq.array are visible by the kernel. For those not listed, the kernel does not
 //     offer any assurance that an entry is not being filled by multiple flags. Therefore, we
 //     need to write an allocator that allows allocating concurrently.
 //
-// 3 - Actually fill the submit entry, this is the only simple and straightforward step.
+// 2 - Actually fill the submit entry, this is the only simple and straightforward step.
 //
-// 4 - Append the entry index to the array and adjust the tail accordingly. This operation
+// 3 - Append the entry index to the array and adjust the tail accordingly. This operation
 //     needs to arrive to two concensus at the same time:
 //     A - The order in which entries are listed in the array: no two threads must pick the
@@ -682,6 +711,5 @@
 
 	[* struct io_uring_sqe, uint32_t] __submit_alloc( struct __io_data & ring, uint64_t data ) {
-		verify( data != 0 );
-
+		/* paranoid */ verify( data != 0 );
 
 		// Prepare the data we need
@@ -708,11 +736,9 @@
 				{
 					// update statistics
-					#if !defined(__CFA_NO_STATISTICS__)
-						disable_interrupts();
-							__tls_stats()->io.submit_q.alloc_avg.val   += len;
-							__tls_stats()->io.submit_q.alloc_avg.block += block;
-							__tls_stats()->io.submit_q.alloc_avg.cnt   += 1;
-						enable_interrupts( __cfaabi_dbg_ctx );
-					#endif
+					__STATS__( false,
+						io.submit_q.alloc_avg.val   += len;
+						io.submit_q.alloc_avg.block += block;
+						io.submit_q.alloc_avg.cnt   += 1;
+					)
 
 
@@ -757,15 +783,19 @@
 
 			block++;
-			yield();
+			if( try_lock(ring.submit_q.lock __cfaabi_dbg_ctx2) ) {
+				__release_consumed_submission( ring );
+				unlock( ring.submit_q.lock );
+			}
+			else {
+				yield();
+			}
 		}
 
 		// update statistics
-		#if !defined(__CFA_NO_STATISTICS__)
-		disable_interrupts();
-			__tls_stats()->io.submit_q.look_avg.val   += len;
-			__tls_stats()->io.submit_q.look_avg.block += block;
-			__tls_stats()->io.submit_q.look_avg.cnt   += 1;
-		enable_interrupts( __cfaabi_dbg_ctx );
-		#endif
+		__STATS__( false,
+			io.submit_q.look_avg.val   += len;
+			io.submit_q.look_avg.block += block;
+			io.submit_q.look_avg.cnt   += 1;
+		)
 
 		return picked;
@@ -780,5 +810,4 @@
 		if( ring.cltr_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
 			// If the poller thread submits, then we just need to add this to the ready array
-
 			__submit_to_ready_array( ring, idx, mask );
 
@@ -786,4 +815,53 @@
 
 			__cfadbg_print_safe( io, "Kernel I/O : Added %u to ready for %p\n", idx, active_thread() );
+		}
+		else if( ring.cltr_flags & CFA_CLUSTER_IO_EAGER_SUBMITS ) {
+			uint32_t picked = __submit_to_ready_array( ring, idx, mask );
+
+			for() {
+				yield();
+
+				// If someone else collected our index, we are done
+				#warning ABA problem
+				if( ring.submit_q.ready[picked] != idx ) {
+					__STATS__( false,
+						io.submit_q.helped += 1;
+					)
+					return;
+				}
+
+				if( try_lock(ring.submit_q.lock __cfaabi_dbg_ctx2) ) {
+					__STATS__( false,
+						io.submit_q.leader += 1;
+					)
+					break;
+				}
+
+				__STATS__( false,
+					io.submit_q.busy += 1;
+				)
+			}
+
+			// We got the lock
+			unsigned to_submit = __collect_submitions( ring );
+			int ret = __io_uring_enter( ring, to_submit, false, 0p );
+			if( ret < 0 ) {
+				unlock(ring.submit_q.lock);
+				return;
+			}
+
+			/* paranoid */ verify( ret > 0 || (ring.ring_flags & IORING_SETUP_SQPOLL) );
+
+			// Release the consumed SQEs
+			__release_consumed_submission( ring );
+
+			// update statistics
+			__STATS__( true,
+				io.submit_q.submit_avg.rdy += to_submit;
+				io.submit_q.submit_avg.csm += ret;
+				io.submit_q.submit_avg.cnt += 1;
+			)
+
+			unlock(ring.submit_q.lock);
 		}
 		else {
@@ -791,13 +869,24 @@
 			lock(ring.submit_q.lock __cfaabi_dbg_ctx2);
 
+			/* paranoid */ verifyf( ring.submit_q.sqes[ idx ].user_data != 0,
+			/* paranoid */ 	"index %u already reclaimed\n"
+			/* paranoid */ 	"head %u, prev %u, tail %u\n"
+			/* paranoid */ 	"[-0: %u,-1: %u,-2: %u,-3: %u]\n",
+			/* paranoid */ 	idx,
+			/* paranoid */ 	*ring.submit_q.head, ring.submit_q.prev_head, *tail
+			/* paranoid */ 	,ring.submit_q.array[ ((*ring.submit_q.head) - 0) & (*ring.submit_q.mask) ]
+			/* paranoid */ 	,ring.submit_q.array[ ((*ring.submit_q.head) - 1) & (*ring.submit_q.mask) ]
+			/* paranoid */ 	,ring.submit_q.array[ ((*ring.submit_q.head) - 2) & (*ring.submit_q.mask) ]
+			/* paranoid */ 	,ring.submit_q.array[ ((*ring.submit_q.head) - 3) & (*ring.submit_q.mask) ]
+			/* paranoid */ );
+
 			// Append to the list of ready entries
 
 			/* paranoid */ verify( idx <= mask );
-
-			ring.submit_q.array[ (*tail) & mask ] = idx & mask;
+			ring.submit_q.array[ (*tail) & mask ] = idx;
 			__atomic_fetch_add(tail, 1ul32, __ATOMIC_SEQ_CST);
 
 			// Submit however, many entries need to be submitted
-			int ret = syscall( __NR_io_uring_enter, ring.fd, 1, 0, 0, 0p, 0);
+			int ret = __io_uring_enter( ring, 1, false, 0p );
 			if( ret < 0 ) {
 				switch((int)errno) {
@@ -808,10 +897,11 @@
 
 			// update statistics
-			#if !defined(__CFA_NO_STATISTICS__)
-				__tls_stats()->io.submit_q.submit_avg.csm += 1;
-				__tls_stats()->io.submit_q.submit_avg.cnt += 1;
-			#endif
-
-			ring.submit_q.sqes[ idx & mask ].user_data = 0;
+			__STATS__( false,
+				io.submit_q.submit_avg.csm += 1;
+				io.submit_q.submit_avg.cnt += 1;
+			)
+
+			// Release the consumed SQEs
+			__release_consumed_submission( ring );
 
 			unlock(ring.submit_q.lock);
@@ -820,3 +910,60 @@
 		}
 	}
+
+	static unsigned __collect_submitions( struct __io_data & ring ) {
+		/* paranoid */ verify( ring.submit_q.ready != 0p );
+		/* paranoid */ verify( ring.submit_q.ready_cnt > 0 );
+		// Moves every pending index from the ready array into the submission array, then advances the tail by the number moved.
+		unsigned to_submit = 0;                    // number of entries moved so far; also the return value
+		uint32_t tail = *ring.submit_q.tail;       // plain read of the tail — NOTE(review): presumably callers guarantee a single collector at a time; confirm
+		const uint32_t mask = *ring.submit_q.mask;
+
+		// Go through the list of ready submissions
+		for( i; ring.submit_q.ready_cnt ) {
+			// replace any submission with the sentinel, to consume it.
+			uint32_t idx = __atomic_exchange_n( &ring.submit_q.ready[i], -1ul32, __ATOMIC_RELAXED);
+
+			// If it was already the sentinel, the slot was empty: skip it
+			if( idx == -1ul32 ) continue;
+
+			// If we got a real submission, append it to the list
+			ring.submit_q.array[ (tail + to_submit) & mask ] = idx & mask;
+			to_submit++;
+		}
+
+		// Increment the tail based on how many we are ready to submit
+		__atomic_fetch_add(ring.submit_q.tail, to_submit, __ATOMIC_SEQ_CST);
+
+		return to_submit;
+	}
+
+	static uint32_t __release_consumed_submission( struct __io_data & ring ) {
+		const uint32_t smask = *ring.submit_q.mask;
+		// Zeroes user_data of every sqe the head has moved past since the previous call; returns how many, or 0 if release_lock could not be taken.
+		if( !try_lock(ring.submit_q.release_lock __cfaabi_dbg_ctx2) ) return 0;
+		uint32_t chead = *ring.submit_q.head;     // current head
+		uint32_t phead = ring.submit_q.prev_head; // head recorded by the previous release
+		ring.submit_q.prev_head = chead;
+		unlock(ring.submit_q.release_lock);
+		// The lock only covers the hand-off of the [phead, chead) range; the entries themselves are cleared after it is dropped.
+		uint32_t count = chead - phead;           // unsigned subtraction, so a wrapped head still yields the distance
+		for( i; count ) {
+			uint32_t idx = ring.submit_q.array[ (phead + i) & smask ];
+			ring.submit_q.sqes[ idx ].user_data = 0; // NOTE(review): 0 presumably marks the entry as free for reuse — confirm against the allocator
+		}
+		return count;
+	}
+
+//=============================================================================================
+// I/O Submissions
+//=============================================================================================
+
+	void register_fixed_files( cluster & cl, int * files, unsigned count ) {
+		// Pass the `count` descriptors in `files` to io_uring_register for this cluster's ring; a negative result is fatal.
+		const int registered = syscall( __NR_io_uring_register, cl.io->fd, IORING_REGISTER_FILES, files, count );
+		if( 0 > registered ) {
+			abort( "KERNEL ERROR: IO_URING SYSCALL - (%d) %s\n", (int)errno, strerror(errno) );
+		}
+		__cfadbg_print_safe( io_core, "Kernel I/O : Performed io_register for %p, returned %d\n", active_thread(), registered );
+	}
 #endif
Index: libcfa/src/concurrency/iocall.cfa
===================================================================
--- libcfa/src/concurrency/iocall.cfa	(revision bb83b478f3b4399d8d5165aec5f3d29c2ca6514e)
+++ libcfa/src/concurrency/iocall.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -108,4 +108,7 @@
 
 	extern ssize_t read (int fd, void *buf, size_t count);
+
+	extern ssize_t splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags);
+	extern ssize_t tee(int fd_in, int fd_out, size_t len, unsigned int flags);
 }
 
@@ -128,4 +131,17 @@
 		#endif
 	}
+
+	ssize_t cfa_preadv2_fixed(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags) {
+		#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_READV)
+			return preadv2(fd, iov, iovcnt, offset, flags); // no io_uring READV support: plain blocking call
+		#else
+			__submit_prelude
+			// Build an IORING_OP_READV request and mark it IOSQE_FIXED_FILE; `flags` is not referenced in this branch.
+			(*sqe){ IORING_OP_READV, fd, iov, iovcnt, offset };
+			sqe->flags |= IOSQE_FIXED_FILE;
+			// NOTE(review): with IOSQE_FIXED_FILE `fd` should be an index into the registered-file table, while the fallback above treats it as a real descriptor — confirm callers.
+			__submit_wait
+		#endif
+	}
 #endif
 
@@ -329,5 +345,4 @@
 }
 
-
 ssize_t cfa_read(int fd, void *buf, size_t count) {
 	#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_READ)
@@ -349,4 +364,33 @@
 
 		(*sqe){ IORING_OP_WRITE, fd, buf, count, 0 };
+
+		__submit_wait
+	#endif
+}
+
+ssize_t cfa_splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags) {
+	#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_SPLICE)
+		return splice( fd_in, off_in, fd_out, off_out, len, flags ); // no io_uring SPLICE support: plain blocking call
+	#else
+		__submit_prelude
+		// The sqe carries offset VALUES, not pointers: pass *off, or -1 (no explicit offset) when the pointer is 0p, as liburing's io_uring_prep_splice does.
+		(*sqe){ IORING_OP_SPLICE, fd_out, 0p, len, off_out ? (uint64_t)*off_out : (uint64_t)-1 };
+		sqe->splice_fd_in  = fd_in;
+		sqe->splice_off_in = off_in ? (uint64_t)*off_in : (uint64_t)-1;
+		sqe->splice_flags  = flags;
+		// NOTE(review): unlike splice(2), nothing in this branch writes the advanced offsets back to *off_in / *off_out.
+		__submit_wait
+	#endif
+}
+
+ssize_t cfa_tee(int fd_in, int fd_out, size_t len, unsigned int flags) {
+	#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_TEE)
+		return tee( fd_in, fd_out, len, flags );
+	#else
+		__submit_prelude
+
+		(*sqe){ IORING_OP_TEE, fd_out, 0p, len, 0 };
+		sqe->splice_fd_in = fd_in;
+		sqe->splice_flags = flags;
 
 		__submit_wait
@@ -453,4 +497,14 @@
 			#define _CFA_IO_FEATURE_IORING_OP_WRITE ,
 			return IS_DEFINED(IORING_OP_WRITE);
+
+		if( /*func == (fptr_t)splice || */
+			func == (fptr_t)cfa_splice )
+			#define _CFA_IO_FEATURE_IORING_OP_SPLICE ,
+			return IS_DEFINED(IORING_OP_SPLICE);
+
+		if( /*func == (fptr_t)tee || */
+			func == (fptr_t)cfa_tee )
+			#define _CFA_IO_FEATURE_IORING_OP_TEE ,
+			return IS_DEFINED(IORING_OP_TEE);
 	#endif
 
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision bb83b478f3b4399d8d5165aec5f3d29c2ca6514e)
+++ libcfa/src/concurrency/kernel.hfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -129,7 +129,9 @@
 struct __io_data;
 
-#define CFA_CLUSTER_IO_POLLER_USER_THREAD    1 << 0 // 0x1
-#define CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS 1 << 1 // 0x2
-// #define CFA_CLUSTER_IO_POLLER_KERNEL_SIDE 1 << 2 // 0x4
+#define CFA_CLUSTER_IO_POLLER_USER_THREAD    (1 << 0) // 0x01
+#define CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS (1 << 1) // 0x02 : the poller collects the ready submissions; cannot be mixed with EAGER_SUBMITS
+#define CFA_CLUSTER_IO_EAGER_SUBMITS         (1 << 2) // 0x04 : submitters flush the ready array themselves; cannot be mixed with POLLER_THREAD_SUBMITS
+#define CFA_CLUSTER_IO_KERNEL_POLL_SUBMITS   (1 << 3) // 0x08 : ring is created with IORING_SETUP_SQPOLL
+#define CFA_CLUSTER_IO_KERNEL_POLL_COMPLETES (1 << 4) // 0x10 : ring is created with IORING_SETUP_IOPOLL
 #define CFA_CLUSTER_IO_BUFFLEN_OFFSET        16
 
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision bb83b478f3b4399d8d5165aec5f3d29c2ca6514e)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -286,9 +286,19 @@
 // Statics call at the end of each thread to register statistics
 #if !defined(__CFA_NO_STATISTICS__)
-static inline struct __stats_t * __tls_stats() {
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-	/* paranoid */ verify( kernelTLS.this_stats );
-	return kernelTLS.this_stats;
-}
+	static inline struct __stats_t * __tls_stats() { // returns kernelTLS.this_stats; checks that preemption is disabled and the pointer is set
+		/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+		/* paranoid */ verify( kernelTLS.this_stats );
+		return kernelTLS.this_stats;
+	}
+	// __STATS__( in_kernel, stmts... ) : runs stmts inside with( *__tls_stats() ); when in_kernel is false they are bracketed by disable_interrupts()/enable_interrupts(). Expands to a braced block.
+	#define __STATS__(in_kernel, ...) { \
+		if( !(in_kernel) ) disable_interrupts(); \
+		with( *__tls_stats() ) { \
+			__VA_ARGS__ \
+		} \
+		if( !(in_kernel) ) enable_interrupts( __cfaabi_dbg_ctx ); \
+	}
+#else // statistics compiled out: __STATS__ expands to nothing
+	#define __STATS__(in_kernel, ...)
 #endif
 
Index: libcfa/src/concurrency/preemption.cfa
===================================================================
--- libcfa/src/concurrency/preemption.cfa	(revision bb83b478f3b4399d8d5165aec5f3d29c2ca6514e)
+++ libcfa/src/concurrency/preemption.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -186,4 +186,5 @@
 	void enable_interrupts( __cfaabi_dbg_ctx_param ) {
 		processor   * proc = kernelTLS.this_processor; // Cache the processor now since interrupts can start happening after the atomic store
+		/* paranoid */ verify( proc );
 
 		with( kernelTLS.preemption_state ){
Index: libcfa/src/concurrency/stats.cfa
===================================================================
--- libcfa/src/concurrency/stats.cfa	(revision bb83b478f3b4399d8d5165aec5f3d29c2ca6514e)
+++ libcfa/src/concurrency/stats.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -27,5 +27,4 @@
 			stats->io.submit_q.submit_avg.rdy = 0;
 			stats->io.submit_q.submit_avg.csm = 0;
-			stats->io.submit_q.submit_avg.avl = 0;
 			stats->io.submit_q.submit_avg.cnt = 0;
 			stats->io.submit_q.look_avg.val   = 0;
@@ -35,4 +34,7 @@
 			stats->io.submit_q.alloc_avg.cnt   = 0;
 			stats->io.submit_q.alloc_avg.block = 0;
+			stats->io.submit_q.helped = 0;
+			stats->io.submit_q.leader = 0;
+			stats->io.submit_q.busy   = 0;
 			stats->io.complete_q.completed_avg.val = 0;
 			stats->io.complete_q.completed_avg.slow_cnt = 0;
@@ -68,4 +70,7 @@
 			__atomic_fetch_add( &cltr->io.submit_q.alloc_avg.cnt           , proc->io.submit_q.alloc_avg.cnt           , __ATOMIC_SEQ_CST );
 			__atomic_fetch_add( &cltr->io.submit_q.alloc_avg.block         , proc->io.submit_q.alloc_avg.block         , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.helped                  , proc->io.submit_q.helped                  , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.leader                  , proc->io.submit_q.leader                  , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.busy                    , proc->io.submit_q.busy                    , __ATOMIC_SEQ_CST );
 			__atomic_fetch_add( &cltr->io.complete_q.completed_avg.val     , proc->io.complete_q.completed_avg.val     , __ATOMIC_SEQ_CST );
 			__atomic_fetch_add( &cltr->io.complete_q.completed_avg.slow_cnt, proc->io.complete_q.completed_avg.slow_cnt, __ATOMIC_SEQ_CST );
@@ -120,5 +125,4 @@
 				double avgrdy = ((double)io.submit_q.submit_avg.rdy) / io.submit_q.submit_avg.cnt;
 				double avgcsm = ((double)io.submit_q.submit_avg.csm) / io.submit_q.submit_avg.cnt;
-				double avgavl = ((double)io.submit_q.submit_avg.avl) / io.submit_q.submit_avg.cnt;
 
 				double lavgv = 0;
@@ -141,5 +145,7 @@
 					"- avg ready entries      : %'18.2lf\n"
 					"- avg submitted entries  : %'18.2lf\n"
-					"- avg available entries  : %'18.2lf\n"
+					"- total helped entries   : %'15" PRIu64 "\n"
+					"- total leader entries   : %'15" PRIu64 "\n"
+					"- total busy submit      : %'15" PRIu64 "\n"
 					"- total ready search     : %'15" PRIu64 "\n"
 					"- avg ready search len   : %'18.2lf\n"
@@ -153,5 +159,6 @@
 					, cluster ? "Cluster" : "Processor",  name, id
 					, io.submit_q.submit_avg.cnt
-					, avgrdy, avgcsm, avgavl
+					, avgrdy, avgcsm
+					, io.submit_q.helped, io.submit_q.leader, io.submit_q.busy
 					, io.submit_q.look_avg.cnt
 					, lavgv, lavgb
Index: libcfa/src/concurrency/stats.hfa
===================================================================
--- libcfa/src/concurrency/stats.hfa	(revision bb83b478f3b4399d8d5165aec5f3d29c2ca6514e)
+++ libcfa/src/concurrency/stats.hfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -83,4 +83,7 @@
 					volatile uint64_t block;
 				} alloc_avg;
+				volatile uint64_t helped;
+				volatile uint64_t leader;
+				volatile uint64_t busy;
 			} submit_q;
 			struct {
