Index: libcfa/src/concurrency/alarm.cfa
===================================================================
--- libcfa/src/concurrency/alarm.cfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/concurrency/alarm.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -23,5 +23,5 @@
 
 #include "alarm.hfa"
-#include "kernel_private.hfa"
+#include "kernel/fwd.hfa"
 #include "preemption.hfa"
 
Index: libcfa/src/concurrency/invoke.h
===================================================================
--- libcfa/src/concurrency/invoke.h	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/concurrency/invoke.h	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -17,4 +17,5 @@
 #include "bits/defs.hfa"
 #include "bits/locks.hfa"
+#include "kernel/fwd.hfa"
 
 #ifdef __cforall
@@ -25,44 +26,4 @@
 #ifndef _INVOKE_H_
 #define _INVOKE_H_
-
-#ifdef __ARM_ARCH
-	// function prototypes are only really used by these macros on ARM
-	void disable_global_interrupts();
-	void enable_global_interrupts();
-
-	#define TL_GET( member ) ( { __typeof__( kernelTLS.member ) target; \
-                disable_global_interrupts(); \
-                target = kernelTLS.member; \
-                enable_global_interrupts(); \
-                target; } )
-	#define TL_SET( member, value ) disable_global_interrupts(); \
-		kernelTLS.member = value; \
-		enable_global_interrupts();
-#else
-	#define TL_GET( member ) kernelTLS.member
-	#define TL_SET( member, value ) kernelTLS.member = value;
-#endif
-
-	#ifdef __cforall
-	extern "Cforall" {
-		extern __attribute__((aligned(128))) thread_local struct KernelThreadData {
-			struct $thread    * volatile this_thread;
-			struct processor  * volatile this_processor;
-			struct __stats_t  * volatile this_stats;
-
-			struct {
-				volatile unsigned short disable_count;
-				volatile bool enabled;
-				volatile bool in_progress;
-			} preemption_state;
-
-			#if defined(__SIZEOF_INT128__)
-				__uint128_t rand_seed;
-			#else
-				uint64_t rand_seed;
-			#endif
-		} kernelTLS __attribute__ ((tls_model ( "initial-exec" )));
-	}
-	#endif
 
 	struct __stack_context_t {
@@ -98,5 +59,4 @@
 
 	enum __Coroutine_State { Halted, Start, Primed, Blocked, Ready, Active };
-	enum __Preemption_Reason { __NO_PREEMPTION, __ALARM_PREEMPTION, __POLL_PREEMPTION, __MANUAL_PREEMPTION };
 
 	struct $coroutine {
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/concurrency/io.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -14,4 +14,6 @@
 //
 
+#define __cforall_thread__
+
 #if defined(__CFA_DEBUG__)
 	// #define __CFA_DEBUG_PRINT_IO__
@@ -19,33 +21,15 @@
 #endif
 
-#include "kernel.hfa"
-#include "bitmanip.hfa"
-
-#if !defined(CFA_HAVE_LINUX_IO_URING_H)
-	void __kernel_io_startup( cluster &, unsigned, bool ) {
-		// Nothing to do without io_uring
-	}
-
-	void __kernel_io_finish_start( cluster & ) {
-		// Nothing to do without io_uring
-	}
-
-	void __kernel_io_prepare_stop( cluster & ) {
-		// Nothing to do without io_uring
-	}
-
-	void __kernel_io_shutdown( cluster &, bool ) {
-		// Nothing to do without io_uring
-	}
-
-#else
+
+#if defined(CFA_HAVE_LINUX_IO_URING_H)
 	#define _GNU_SOURCE         /* See feature_test_macros(7) */
 	#include <errno.h>
+	#include <signal.h>
 	#include <stdint.h>
 	#include <string.h>
 	#include <unistd.h>
-	#include <sys/mman.h>
 
 	extern "C" {
+		#include <sys/epoll.h>
 		#include <sys/syscall.h>
 
@@ -53,389 +37,16 @@
 	}
 
-	#include "bits/signal.hfa"
-	#include "kernel_private.hfa"
-	#include "thread.hfa"
-
-	uint32_t entries_per_cluster() {
-		return 256;
-	}
-
-	static void * __io_poller_slow( void * arg );
-
-	// Weirdly, some systems that do support io_uring don't actually define these
-	#ifdef __alpha__
-		/*
-		* alpha is the only exception, all other architectures
-		* have common numbers for new system calls.
-		*/
-		#ifndef __NR_io_uring_setup
-			#define __NR_io_uring_setup           535
-		#endif
-		#ifndef __NR_io_uring_enter
-			#define __NR_io_uring_enter           536
-		#endif
-		#ifndef __NR_io_uring_register
-			#define __NR_io_uring_register        537
-		#endif
-	#else /* !__alpha__ */
-		#ifndef __NR_io_uring_setup
-			#define __NR_io_uring_setup           425
-		#endif
-		#ifndef __NR_io_uring_enter
-			#define __NR_io_uring_enter           426
-		#endif
-		#ifndef __NR_io_uring_register
-			#define __NR_io_uring_register        427
-		#endif
-	#endif
-
-	// Fast poller user-thread
-	// Not using the "thread" keyword because we want to control
-	// more carefully when to start/stop it
-	struct __io_poller_fast {
-		struct __io_data * ring;
-		$thread thrd;
-	};
-
-	void ?{}( __io_poller_fast & this, struct cluster & cltr ) {
-		this.ring = cltr.io;
-		(this.thrd){ "Fast I/O Poller", cltr };
-	}
-	void ^?{}( __io_poller_fast & mutex this );
-	void main( __io_poller_fast & this );
-	static inline $thread * get_thread( __io_poller_fast & this ) { return &this.thrd; }
-	void ^?{}( __io_poller_fast & mutex this ) {}
-
-	struct __submition_data {
-		// Head and tail of the ring (associated with array)
-		volatile uint32_t * head;
-		volatile uint32_t * tail;
-		volatile uint32_t prev_head;
-
-		// The actual kernel ring which uses head/tail
-		// indexes into the sqes arrays
-		uint32_t * array;
-
-		// number of entries and mask to go with it
-		const uint32_t * num;
-		const uint32_t * mask;
-
-		// Submission flags (Not sure what for)
-		uint32_t * flags;
-
-		// number of sqes not submitted (whatever that means)
-		uint32_t * dropped;
-
-		// Like head/tail but not seen by the kernel
-		volatile uint32_t * ready;
-		uint32_t ready_cnt;
-
-		__spinlock_t lock;
-		__spinlock_t release_lock;
-
-		// A buffer of sqes (not the actual ring)
-		struct io_uring_sqe * sqes;
-
-		// The location and size of the mmaped area
-		void * ring_ptr;
-		size_t ring_sz;
-	};
-
-	struct __completion_data {
-		// Head and tail of the ring
-		volatile uint32_t * head;
-		volatile uint32_t * tail;
-
-		// number of entries and mask to go with it
-		const uint32_t * mask;
-		const uint32_t * num;
-
-		// number of cqes not submitted (whatever that means)
-		uint32_t * overflow;
-
-		// the kernel ring
-		struct io_uring_cqe * cqes;
-
-		// The location and size of the mmaped area
-		void * ring_ptr;
-		size_t ring_sz;
-	};
-
-	struct __io_data {
-		struct __submition_data submit_q;
-		struct __completion_data completion_q;
-		uint32_t ring_flags;
-		int cltr_flags;
-		int fd;
-		semaphore submit;
-		volatile bool done;
-		struct {
-			struct {
-				__processor_id_t id;
-				void * stack;
-				pthread_t kthrd;
-				volatile bool blocked;
-			} slow;
-			__io_poller_fast fast;
-			__bin_sem_t sem;
-		} poller;
-	};
-
-//=============================================================================================
-// I/O Startup / Shutdown logic
-//=============================================================================================
-	void __kernel_io_startup( cluster & this, unsigned io_flags, bool main_cluster ) {
-		if( (io_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS) && (io_flags & CFA_CLUSTER_IO_EAGER_SUBMITS) ) {
-			abort("CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS and CFA_CLUSTER_IO_EAGER_SUBMITS cannot be mixed\n");
-		}
-
-		this.io = malloc();
-
-		// Step 1 : call to setup
-		struct io_uring_params params;
-		memset(&params, 0, sizeof(params));
-		if( io_flags & CFA_CLUSTER_IO_KERNEL_POLL_SUBMITS   ) params.flags |= IORING_SETUP_SQPOLL;
-		if( io_flags & CFA_CLUSTER_IO_KERNEL_POLL_COMPLETES ) params.flags |= IORING_SETUP_IOPOLL;
-
-		uint32_t nentries = entries_per_cluster();
-
-		int fd = syscall(__NR_io_uring_setup, nentries, &params );
-		if(fd < 0) {
-			abort("KERNEL ERROR: IO_URING SETUP - %s\n", strerror(errno));
-		}
-
-		// Step 2 : mmap result
-		memset( this.io, 0, sizeof(struct __io_data) );
-		struct __submition_data  & sq = this.io->submit_q;
-		struct __completion_data & cq = this.io->completion_q;
-
-		// calculate the right ring size
-		sq.ring_sz = params.sq_off.array + (params.sq_entries * sizeof(unsigned)           );
-		cq.ring_sz = params.cq_off.cqes  + (params.cq_entries * sizeof(struct io_uring_cqe));
-
-		// Requires features
-		#if defined(IORING_FEAT_SINGLE_MMAP)
-			// adjust the size according to the parameters
-			if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
-				cq.ring_sz = sq.ring_sz = max(cq.ring_sz, sq.ring_sz);
-			}
-		#endif
-
-		// mmap the Submit Queue into existence
-		sq.ring_ptr = mmap(0, sq.ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
-		if (sq.ring_ptr == (void*)MAP_FAILED) {
-			abort("KERNEL ERROR: IO_URING MMAP1 - %s\n", strerror(errno));
-		}
-
-		// Requires features
-		#if defined(IORING_FEAT_SINGLE_MMAP)
-			// mmap the Completion Queue into existence (may or may not be needed)
-			if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
-				cq.ring_ptr = sq.ring_ptr;
-			}
-			else
-		#endif
-		{
-			// We need multiple call to MMAP
-			cq.ring_ptr = mmap(0, cq.ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
-			if (cq.ring_ptr == (void*)MAP_FAILED) {
-				munmap(sq.ring_ptr, sq.ring_sz);
-				abort("KERNEL ERROR: IO_URING MMAP2 - %s\n", strerror(errno));
-			}
-		}
-
-		// mmap the submit queue entries
-		size_t size = params.sq_entries * sizeof(struct io_uring_sqe);
-		sq.sqes = (struct io_uring_sqe *)mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
-		if (sq.sqes == (struct io_uring_sqe *)MAP_FAILED) {
-			munmap(sq.ring_ptr, sq.ring_sz);
-			if (cq.ring_ptr != sq.ring_ptr) munmap(cq.ring_ptr, cq.ring_sz);
-			abort("KERNEL ERROR: IO_URING MMAP3 - %s\n", strerror(errno));
-		}
-
-		// Get the pointers from the kernel to fill the structure
-		// submit queue
-		sq.head    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.head);
-		sq.tail    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.tail);
-		sq.mask    = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_mask);
-		sq.num     = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_entries);
-		sq.flags   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.flags);
-		sq.dropped = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.dropped);
-		sq.array   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);
-		sq.prev_head = *sq.head;
-
-		{
-			const uint32_t num = *sq.num;
-			for( i; num ) {
-				sq.sqes[i].user_data = 0ul64;
-			}
-		}
-
-		(sq.lock){};
-		(sq.release_lock){};
-
-		if( io_flags & ( CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS | CFA_CLUSTER_IO_EAGER_SUBMITS ) ) {
-			/* paranoid */ verify( is_pow2( io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET ) || ((io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET) < 8)  );
-			sq.ready_cnt = max(io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET, 8);
-			sq.ready = alloc_align( 64, sq.ready_cnt );
-			for(i; sq.ready_cnt) {
-				sq.ready[i] = -1ul32;
-			}
-		}
-		else {
-			sq.ready_cnt = 0;
-			sq.ready = 0p;
-		}
-
-		// completion queue
-		cq.head     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.head);
-		cq.tail     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.tail);
-		cq.mask     = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_mask);
-		cq.num      = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_entries);
-		cq.overflow = (         uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.overflow);
-		cq.cqes   = (struct io_uring_cqe *)(((intptr_t)cq.ring_ptr) + params.cq_off.cqes);
-
-		// some paranoid checks
-		/* paranoid */ verifyf( (*cq.mask) == ((*cq.num) - 1ul32), "IO_URING Expected mask to be %u (%u entries), was %u", (*cq.num) - 1ul32, *cq.num, *cq.mask  );
-		/* paranoid */ verifyf( (*cq.num)  >= nentries, "IO_URING Expected %u entries, got %u", nentries, *cq.num );
-		/* paranoid */ verifyf( (*cq.head) == 0, "IO_URING Expected head to be 0, got %u", *cq.head );
-		/* paranoid */ verifyf( (*cq.tail) == 0, "IO_URING Expected tail to be 0, got %u", *cq.tail );
-
-		/* paranoid */ verifyf( (*sq.mask) == ((*sq.num) - 1ul32), "IO_URING Expected mask to be %u (%u entries), was %u", (*sq.num) - 1ul32, *sq.num, *sq.mask );
-		/* paranoid */ verifyf( (*sq.num) >= nentries, "IO_URING Expected %u entries, got %u", nentries, *sq.num );
-		/* paranoid */ verifyf( (*sq.head) == 0, "IO_URING Expected head to be 0, got %u", *sq.head );
-		/* paranoid */ verifyf( (*sq.tail) == 0, "IO_URING Expected tail to be 0, got %u", *sq.tail );
-
-		// Update the global ring info
-		this.io->ring_flags = params.flags;
-		this.io->cltr_flags = io_flags;
-		this.io->fd         = fd;
-		this.io->done       = false;
-		(this.io->submit){ min(*sq.num, *cq.num) };
-
-		if(!main_cluster) {
-			__kernel_io_finish_start( this );
-		}
-	}
-
-	void __kernel_io_finish_start( cluster & this ) {
-		if( this.io->cltr_flags & CFA_CLUSTER_IO_POLLER_USER_THREAD ) {
-			__cfadbg_print_safe(io_core, "Kernel I/O : Creating fast poller for cluter %p\n", &this);
-			(this.io->poller.fast){ this };
-			__thrd_start( this.io->poller.fast, main );
-		}
-
-		// Create the poller thread
-		__cfadbg_print_safe(io_core, "Kernel I/O : Creating slow poller for cluster %p\n", &this);
-		this.io->poller.slow.blocked = false;
-		this.io->poller.slow.stack = __create_pthread( &this.io->poller.slow.kthrd, __io_poller_slow, &this );
-	}
-
-	void __kernel_io_prepare_stop( cluster & this ) {
-		__cfadbg_print_safe(io_core, "Kernel I/O : Stopping pollers for cluster\n", &this);
-		// Notify the poller thread of the shutdown
-		__atomic_store_n(&this.io->done, true, __ATOMIC_SEQ_CST);
-
-		// Stop the IO Poller
-		sigval val = { 1 };
-		pthread_sigqueue( this.io->poller.slow.kthrd, SIGUSR1, val );
-		post( this.io->poller.sem );
-
-		// Wait for the poller thread to finish
-		pthread_join( this.io->poller.slow.kthrd, 0p );
-		free( this.io->poller.slow.stack );
-
-		__cfadbg_print_safe(io_core, "Kernel I/O : Slow poller stopped for cluster\n", &this);
-
-		if( this.io->cltr_flags & CFA_CLUSTER_IO_POLLER_USER_THREAD ) {
-			with( this.io->poller.fast ) {
-				/* paranoid */ verify( this.nprocessors == 0 || &this == mainCluster );
-				/* paranoid */ verify( !ready_mutate_islocked() );
-
-				// We need to adjust the clean-up based on where the thread is
-				if( thrd.state == Ready || thrd.preempted != __NO_PREEMPTION ) {
-
-					ready_schedule_lock( (struct __processor_id_t *)active_processor() );
-
-						// This is the tricky case
-						// The thread was preempted and now it is on the ready queue
-						// The thread should be the last on the list
-						/* paranoid */ verify( thrd.link.next != 0p );
-
-						// Remove the thread from the ready queue of this cluster
-						__attribute__((unused)) bool removed = remove_head( &this, &thrd );
-						/* paranoid */ verify( removed );
-						thrd.link.next = 0p;
-						thrd.link.prev = 0p;
-						__cfaabi_dbg_debug_do( thrd.unpark_stale = true );
-
-						// Fixup the thread state
-						thrd.state = Blocked;
-						thrd.ticket = 0;
-						thrd.preempted = __NO_PREEMPTION;
-
-					ready_schedule_unlock( (struct __processor_id_t *)active_processor() );
-
-					// Pretend like the thread was blocked all along
-				}
-				// !!! This is not an else if !!!
-				if( thrd.state == Blocked ) {
-
-					// This is the "easy case"
-					// The thread is parked and can easily be moved to active cluster
-					verify( thrd.curr_cluster != active_cluster() || thrd.curr_cluster == mainCluster );
-					thrd.curr_cluster = active_cluster();
-
-					// unpark the fast io_poller
-					unpark( &thrd __cfaabi_dbg_ctx2 );
-				}
-				else {
-
-					// The thread is in a weird state
-					// I don't know what to do here
-					abort("Fast poller thread is in unexpected state, cannot clean-up correctly\n");
-				}
-
-			}
-
-			^(this.io->poller.fast){};
-
-			__cfadbg_print_safe(io_core, "Kernel I/O : Fast poller stopped for cluster\n", &this);
-		}
-	}
-
-	void __kernel_io_shutdown( cluster & this, bool main_cluster ) {
-		if(!main_cluster) {
-			__kernel_io_prepare_stop( this );
-		}
-
-		// Shutdown the io rings
-		struct __submition_data  & sq = this.io->submit_q;
-		struct __completion_data & cq = this.io->completion_q;
-
-		// unmap the submit queue entries
-		munmap(sq.sqes, (*sq.num) * sizeof(struct io_uring_sqe));
-
-		// unmap the Submit Queue ring
-		munmap(sq.ring_ptr, sq.ring_sz);
-
-		// unmap the Completion Queue ring, if it is different
-		if (cq.ring_ptr != sq.ring_ptr) {
-			munmap(cq.ring_ptr, cq.ring_sz);
-		}
-
-		// close the file descriptor
-		close(this.io->fd);
-
-		free( this.io->submit_q.ready ); // Maybe null, doesn't matter
-		free( this.io );
-	}
-
-	int __io_uring_enter( struct __io_data & ring, unsigned to_submit, bool get, sigset_t * mask ) {
+	#include "stats.hfa"
+	#include "kernel.hfa"
+	#include "kernel/fwd.hfa"
+	#include "io/types.hfa"
+
+//=============================================================================================
+// I/O Syscall
+//=============================================================================================
+	static int __io_uring_enter( struct __io_data & ring, unsigned to_submit, bool get ) {
 		bool need_sys_to_submit = false;
 		bool need_sys_to_complete = false;
-		unsigned min_complete = 0;
 		unsigned flags = 0;
-
 
 		TO_SUBMIT:
@@ -451,12 +62,6 @@
 		}
 
-		TO_COMPLETE:
 		if( get && !(ring.ring_flags & IORING_SETUP_SQPOLL) ) {
 			flags |= IORING_ENTER_GETEVENTS;
-			if( mask ) {
-				need_sys_to_complete = true;
-				min_complete = 1;
-				break TO_COMPLETE;
-			}
 			if( (ring.ring_flags & IORING_SETUP_IOPOLL) ) {
 				need_sys_to_complete = true;
@@ -466,5 +71,5 @@
 		int ret = 0;
 		if( need_sys_to_submit || need_sys_to_complete ) {
-			ret = syscall( __NR_io_uring_enter, ring.fd, to_submit, min_complete, flags, mask, _NSIG / 8);
+			ret = syscall( __NR_io_uring_enter, ring.fd, to_submit, 0, flags, 0p, _NSIG / 8);
 			if( ret < 0 ) {
 				switch((int)errno) {
@@ -490,25 +95,24 @@
 	static uint32_t __release_consumed_submission( struct __io_data & ring );
 
-	static inline void process(struct io_uring_cqe & cqe, struct __processor_id_t * id ) {
+	static inline void process(struct io_uring_cqe & cqe ) {
 		struct __io_user_data_t * data = (struct __io_user_data_t *)(uintptr_t)cqe.user_data;
 		__cfadbg_print_safe( io, "Kernel I/O : Syscall completed : cqe %p, result %d for %p\n", data, cqe.res, data->thrd );
 
 		data->result = cqe.res;
-		if(!id) { unpark(     data->thrd __cfaabi_dbg_ctx2 ); }
-		else  { __unpark( id, data->thrd __cfaabi_dbg_ctx2 ); }
+		unpark( data->thrd __cfaabi_dbg_ctx2 );
 	}
 
 	// Process a single completion message from the io_uring
 	// This is NOT thread-safe
-	static [int, bool] __drain_io( & struct __io_data ring, * sigset_t mask ) {
+	static [int, bool] __drain_io( & struct __io_data ring ) {
 		/* paranoid */ verify( !kernelTLS.preemption_state.enabled );
 
 		unsigned to_submit = 0;
-		if( ring.cltr_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
+		if( ring.poller_submits ) {
 			// If the poller thread also submits, then we need to aggregate the submissions which are ready
 			to_submit = __collect_submitions( ring );
 		}
 
-		int ret = __io_uring_enter(ring, to_submit, true, mask);
+		int ret = __io_uring_enter(ring, to_submit, true);
 		if( ret < 0 ) {
 			return [0, true];
@@ -547,9 +151,6 @@
 			/* paranoid */ verify(&cqe);
 
-			process( cqe, !mask ? (struct __processor_id_t *)0p : &ring.poller.slow.id );
-		}
-
-		// Allow new submissions to happen
-		// V(ring.submit, count);
+			process( cqe );
+		}
 
 		// Mark to the kernel that the cqe has been seen
@@ -561,99 +162,18 @@
 	}
 
-	static void * __io_poller_slow( void * arg ) {
-		#if !defined( __CFA_NO_STATISTICS__ )
-			__stats_t local_stats;
-			__init_stats( &local_stats );
-			kernelTLS.this_stats = &local_stats;
-		#endif
-
-		cluster * cltr = (cluster *)arg;
-		struct __io_data & ring = *cltr->io;
-
-		ring.poller.slow.id.id = doregister( &ring.poller.slow.id );
-
-		sigset_t mask;
-		sigfillset(&mask);
-		if ( pthread_sigmask( SIG_BLOCK, &mask, 0p ) == -1 ) {
-			abort( "KERNEL ERROR: IO_URING - pthread_sigmask" );
-		}
-
-		sigdelset( &mask, SIGUSR1 );
-
-		verify( (*ring.submit_q.head) == (*ring.submit_q.tail) );
-		verify( (*ring.completion_q.head) == (*ring.completion_q.tail) );
-
-		__cfadbg_print_safe(io_core, "Kernel I/O : Slow poller for ring %p ready\n", &ring);
-
-		if( ring.cltr_flags & CFA_CLUSTER_IO_POLLER_USER_THREAD ) {
-			while(!__atomic_load_n(&ring.done, __ATOMIC_SEQ_CST)) {
-
-				__atomic_store_n( &ring.poller.slow.blocked, true, __ATOMIC_SEQ_CST );
-
-				// In the user-thread approach drain and if anything was drained,
-				// batton pass to the user-thread
-				int count;
-				bool again;
-				[count, again] = __drain_io( ring, &mask );
-
-				__atomic_store_n( &ring.poller.slow.blocked, false, __ATOMIC_SEQ_CST );
-
-				// Update statistics
-				__STATS__( true,
-					io.complete_q.completed_avg.val += count;
-					io.complete_q.completed_avg.slow_cnt += 1;
-				)
-
-				if(again) {
-					__cfadbg_print_safe(io_core, "Kernel I/O : Moving to ring %p to fast poller\n", &ring);
-					__unpark( &ring.poller.slow.id, &ring.poller.fast.thrd __cfaabi_dbg_ctx2 );
-					wait( ring.poller.sem );
-				}
-			}
-		}
-		else {
-			while(!__atomic_load_n(&ring.done, __ATOMIC_SEQ_CST)) {
-				//In the naive approach, just poll the io completion queue directly
-				int count;
-				bool again;
-				[count, again] = __drain_io( ring, &mask );
-
-				// Update statistics
-				__STATS__( true,
-					io.complete_q.completed_avg.val += count;
-					io.complete_q.completed_avg.slow_cnt += 1;
-				)
-			}
-		}
-
-		__cfadbg_print_safe(io_core, "Kernel I/O : Slow poller for ring %p stopping\n", &ring);
-
-		unregister( &ring.poller.slow.id );
-
-		#if !defined(__CFA_NO_STATISTICS__)
-			__tally_stats(cltr->stats, &local_stats);
-		#endif
-
-		return 0p;
-	}
-
-	void main( __io_poller_fast & this ) {
-		verify( this.ring->cltr_flags & CFA_CLUSTER_IO_POLLER_USER_THREAD );
-
-		// Start parked
-		park( __cfaabi_dbg_ctx );
-
-		__cfadbg_print_safe(io_core, "Kernel I/O : Fast poller for ring %p ready\n", &this.ring);
+	void main( $io_ctx_thread & this ) {
+		epoll_event ev;
+		__ioctx_register( this, ev );
+
+		__cfadbg_print_safe(io_core, "Kernel I/O : IO poller %p for ring %p ready\n", &this, &this.ring);
 
 		int reset = 0;
-
 		// Then loop until we need to start
-		while(!__atomic_load_n(&this.ring->done, __ATOMIC_SEQ_CST)) {
-
+		while(!__atomic_load_n(&this.done, __ATOMIC_SEQ_CST)) {
 			// Drain the io
 			int count;
 			bool again;
 			disable_interrupts();
-				[count, again] = __drain_io( *this.ring, 0p );
+				[count, again] = __drain_io( *this.ring );
 
 				if(!again) reset++;
@@ -672,24 +192,14 @@
 			// We didn't get anything, so park this poller
 			else {
-				__cfadbg_print_safe(io_core, "Kernel I/O : Moving to ring %p to slow poller\n", &this.ring);
+				__cfadbg_print_safe(io_core, "Kernel I/O : Parking io poller %p\n", &this.self);
 				reset = 0;
 
-				// wake up the slow poller
-				post( this.ring->poller.sem );
-
-				// park this thread
-				park( __cfaabi_dbg_ctx );
+				// block this thread
+				__ioctx_prepare_block( this, ev );
+				wait( this.sem );
 			}
 		}
 
 		__cfadbg_print_safe(io_core, "Kernel I/O : Fast poller for ring %p stopping\n", &this.ring);
-	}
-
-	static inline void __wake_poller( struct __io_data & ring ) __attribute__((artificial));
-	static inline void __wake_poller( struct __io_data & ring ) {
-		if(!__atomic_load_n( &ring.poller.slow.blocked, __ATOMIC_SEQ_CST)) return;
-
-		sigval val = { 1 };
-		pthread_sigqueue( ring.poller.slow.kthrd, SIGUSR1, val );
 	}
 
@@ -806,19 +316,20 @@
 	}
 
-	void __submit( struct __io_data & ring, uint32_t idx ) {
+	void __submit( struct io_context * ctx, uint32_t idx ) __attribute__((nonnull (1))) {
+		__io_data & ring = *ctx->thrd.ring;
 		// Get now the data we definitely need
 		uint32_t * const tail = ring.submit_q.tail;
-		const uint32_t mask = *ring.submit_q.mask;
+		const uint32_t mask  = *ring.submit_q.mask;
 
 		// There are 2 submission schemes, check which one we are using
-		if( ring.cltr_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
+		if( ring.poller_submits ) {
 			// If the poller thread submits, then we just need to add this to the ready array
 			__submit_to_ready_array( ring, idx, mask );
 
-			__wake_poller( ring );
+			post( ctx->thrd.sem );
 
 			__cfadbg_print_safe( io, "Kernel I/O : Added %u to ready for %p\n", idx, active_thread() );
 		}
-		else if( ring.cltr_flags & CFA_CLUSTER_IO_EAGER_SUBMITS ) {
+		else if( ring.eager_submits ) {
 			uint32_t picked = __submit_to_ready_array( ring, idx, mask );
 
@@ -849,5 +360,5 @@
 			// We got the lock
 			unsigned to_submit = __collect_submitions( ring );
-			int ret = __io_uring_enter( ring, to_submit, false, 0p );
+			int ret = __io_uring_enter( ring, to_submit, false );
 			if( ret < 0 ) {
 				unlock(ring.submit_q.lock);
@@ -892,5 +403,5 @@
 
 			// Submit however, many entries need to be submitted
-			int ret = __io_uring_enter( ring, 1, false, 0p );
+			int ret = __io_uring_enter( ring, 1, false );
 			if( ret < 0 ) {
 				switch((int)errno) {
@@ -958,16 +469,3 @@
 		return count;
 	}
-
-//=============================================================================================
-// I/O Submissions
-//=============================================================================================
-
-	void register_fixed_files( cluster & cl, int * files, unsigned count ) {
-		int ret = syscall( __NR_io_uring_register, cl.io->fd, IORING_REGISTER_FILES, files, count );
-		if( ret < 0 ) {
-			abort( "KERNEL ERROR: IO_URING SYSCALL - (%d) %s\n", (int)errno, strerror(errno) );
-		}
-
-		__cfadbg_print_safe( io_core, "Kernel I/O : Performed io_register for %p, returned %d\n", active_thread(), ret );
-	}
 #endif
Index: libcfa/src/concurrency/io/setup.cfa
===================================================================
--- libcfa/src/concurrency/io/setup.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
+++ libcfa/src/concurrency/io/setup.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -0,0 +1,475 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2020 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// io/setup.cfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Fri Jul 31 16:25:51 2020
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#define __cforall_thread__
+#define _GNU_SOURCE         /* See feature_test_macros(7) */
+
+#include "io/types.hfa"
+#include "kernel.hfa"
+
+#if !defined(CFA_HAVE_LINUX_IO_URING_H)
+	void __kernel_io_startup() {
+		// Nothing to do without io_uring
+	}
+
+	void __kernel_io_shutdown() {
+		// Nothing to do without io_uring
+	}
+
+	void ?{}(io_context_params & this) {}
+
+	void ?{}(io_context & this, struct cluster & cl) {}
+	void ?{}(io_context & this, struct cluster & cl, const io_context_params & params) {}
+
+	void ^?{}(io_context & this) {}
+	void ^?{}(io_context & this, bool cluster_context) {}
+
+#else
+	#include <errno.h>
+	#include <stdint.h>
+	#include <string.h>
+	#include <signal.h>
+	#include <unistd.h>
+
+	extern "C" {
+		#include <pthread.h>
+		#include <sys/epoll.h>
+		#include <sys/mman.h>
+		#include <sys/syscall.h>
+
+		#include <linux/io_uring.h>
+	}
+
+	#include "bitmanip.hfa"
+	#include "kernel_private.hfa"
+	#include "thread.hfa"
+
+	void ?{}(io_context_params & this) {
+		this.num_entries = 256;
+		this.num_ready = 256;
+		this.submit_aff = -1;
+		this.eager_submits = false;
+		this.poller_submits = false;
+		this.poll_submit = false;
+		this.poll_complete = false;
+	}
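+
+	// Example (sketch, hypothetical names): overriding the defaults before
+	// constructing a context on cluster cl:
+	//   io_context_params params;
+	//   params.num_entries   = 128;
+	//   params.eager_submits = true;
+	//   io_context ctx = { cl, params };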
+
+	// Weirdly, some systems that do support io_uring don't actually define these
+	#ifdef __alpha__
+		/*
+		* alpha is the only exception, all other architectures
+		* have common numbers for new system calls.
+		*/
+		#ifndef __NR_io_uring_setup
+			#define __NR_io_uring_setup           535
+		#endif
+		#ifndef __NR_io_uring_enter
+			#define __NR_io_uring_enter           536
+		#endif
+		#ifndef __NR_io_uring_register
+			#define __NR_io_uring_register        537
+		#endif
+	#else /* !__alpha__ */
+		#ifndef __NR_io_uring_setup
+			#define __NR_io_uring_setup           425
+		#endif
+		#ifndef __NR_io_uring_enter
+			#define __NR_io_uring_enter           426
+		#endif
+		#ifndef __NR_io_uring_register
+			#define __NR_io_uring_register        427
+		#endif
+	#endif
+
+//=============================================================================================
+// I/O Startup / Shutdown logic + Master Poller
+//=============================================================================================
+
+	// IO Master poller loop forward
+	static void * iopoll_loop( __attribute__((unused)) void * args );
+
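+	// One process-wide epoll set: every io_context registers its ring fd here,
+	// and the single dedicated pthread running iopoll_loop wakes the matching
+	// poller thread when a ring becomes readable.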
+	static struct {
+		pthread_t     thrd;    // pthread handle to io poller thread
+		void *        stack;   // pthread stack for io poller thread
+		int           epollfd; // file descriptor to the epoll instance
+		volatile bool run;     // Whether or not to continue
+	} iopoll;
+
+	void __kernel_io_startup(void) {
+		__cfaabi_dbg_print_safe( "Kernel : Creating EPOLL instance\n" );
+
+		iopoll.epollfd = epoll_create1(0);
+		if (iopoll.epollfd == -1) {
+			abort( "internal error, epoll_create1\n");
+		}
+
+		__cfaabi_dbg_print_safe( "Kernel : Starting io poller thread\n" );
+
+		iopoll.run = true;
+		iopoll.stack = __create_pthread( &iopoll.thrd, iopoll_loop, 0p );
+	}
+
+	void __kernel_io_shutdown(void) {
+		// Notify the io poller thread of the shutdown
+		iopoll.run = false;
+		sigval val = { 1 };
+		pthread_sigqueue( iopoll.thrd, SIGUSR1, val );
+
+		// Wait for the io poller thread to finish
+		pthread_join( iopoll.thrd, 0p );
+		free( iopoll.stack );
+
+		int ret = close(iopoll.epollfd);
+		if (ret == -1) {
+			abort( "internal error, close epoll\n");
+		}
+
+	// I/O polling is now fully stopped
+
+		__cfaabi_dbg_print_safe( "Kernel : IO poller stopped\n" );
+	}
+
+	static void * iopoll_loop( __attribute__((unused)) void * args ) {
+		__processor_id_t id;
+		id.id = doregister(&id);
+		__cfaabi_dbg_print_safe( "Kernel : IO poller thread starting\n" );
+
+		// Block signals to control when they arrive
+		sigset_t mask;
+		sigfillset(&mask);
+		if ( pthread_sigmask( SIG_BLOCK, &mask, 0p ) == -1 ) {
+			abort( "internal error, pthread_sigmask" );
+		}
+
+		sigdelset( &mask, SIGUSR1 );
+
+		// Space for a batch of up to 10 events per wakeup
+		struct epoll_event events[10];
+		// Main loop
+		while( iopoll.run ) {
+			// Wait for events
+			int nfds = epoll_pwait( iopoll.epollfd, events, 10, -1, &mask );
+
+			// Check if an error occurred
+			if (nfds == -1) {
+				if( errno == EINTR ) continue;
+				abort( "internal error, epoll_pwait" );
+			}
+
+			for(i; nfds) {
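+				// events[i].data.u64 carries the $io_ctx_thread pointer stashed by __ioctx_register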
+				$io_ctx_thread * io_ctx = ($io_ctx_thread *)(uintptr_t)events[i].data.u64;
+				/* paranoid */ verify( io_ctx );
+				__cfadbg_print_safe(io_core, "Kernel I/O : Unparking io poller %p\n", io_ctx);
+				#if !defined( __CFA_NO_STATISTICS__ )
+					kernelTLS.this_stats = io_ctx->self.curr_cluster->stats;
+				#endif
+				__post( io_ctx->sem, &id );
+			}
+		}
+
+		__cfaabi_dbg_print_safe( "Kernel : IO poller thread stopping\n" );
+		unregister(&id);
+		return 0p;
+	}
+
+//=============================================================================================
+// I/O Context Constrution/Destruction
+//=============================================================================================
+
+	void ?{}($io_ctx_thread & this, struct cluster & cl) { (this.self){ "IO Poller", cl }; }
+	void main( $io_ctx_thread & this );
+	static inline $thread * get_thread( $io_ctx_thread & this ) { return &this.self; }
+	void ^?{}( $io_ctx_thread & mutex this ) {}
+
+	static void __io_create ( __io_data & this, const io_context_params & params_in );
+	static void __io_destroy( __io_data & this );
+
+	void ?{}(io_context & this, struct cluster & cl, const io_context_params & params) {
+		(this.thrd){ cl };
+		this.thrd.ring = malloc();
+		__cfadbg_print_safe(io_core, "Kernel I/O : Creating ring for io_context %p\n", &this);
+		__io_create( *this.thrd.ring, params );
+
+		__cfadbg_print_safe(io_core, "Kernel I/O : Starting poller thread for io_context %p\n", &this);
+		this.thrd.done = false;
+		__thrd_start( this.thrd, main );
+
+		__cfadbg_print_safe(io_core, "Kernel I/O : io_context %p ready\n", &this);
+	}
+
+	void ?{}(io_context & this, struct cluster & cl) {
+		io_context_params params;
+		(this){ cl, params };
+	}
+
+	void ^?{}(io_context & this, bool cluster_context) {
+		__cfadbg_print_safe(io_core, "Kernel I/O : tearing down io_context %p\n", &this);
+
+		// Notify the thread of the shutdown
+		__atomic_store_n(&this.thrd.done, true, __ATOMIC_SEQ_CST);
+
+		// If this is an io_context within a cluster, things get trickier
+		$thread & thrd = this.thrd.self;
+		if( cluster_context ) {
+			cluster & cltr = *thrd.curr_cluster;
+			/* paranoid */ verify( cltr.nprocessors == 0 || &cltr == mainCluster );
+			/* paranoid */ verify( !ready_mutate_islocked() );
+
+			// We need to adjust the clean-up based on where the thread is
+			if( thrd.state == Ready || thrd.preempted != __NO_PREEMPTION ) {
+
+				ready_schedule_lock( (struct __processor_id_t *)active_processor() );
+
+					// This is the tricky case
+					// The thread was preempted and now it is on the ready queue
+					// The thread should be the last on the list
+					/* paranoid */ verify( thrd.link.next != 0p );
+
+					// Remove the thread from the ready queue of this cluster
+					__attribute__((unused)) bool removed = remove_head( &cltr, &thrd );
+					/* paranoid */ verify( removed );
+					thrd.link.next = 0p;
+					thrd.link.prev = 0p;
+					__cfaabi_dbg_debug_do( thrd.unpark_stale = true );
+
+					// Fixup the thread state
+					thrd.state = Blocked;
+					thrd.ticket = 0;
+					thrd.preempted = __NO_PREEMPTION;
+
+				ready_schedule_unlock( (struct __processor_id_t *)active_processor() );
+
+				// Pretend like the thread was blocked all along
+			}
+			// !!! This is not an else if !!!
+			if( thrd.state == Blocked ) {
+
+				// This is the "easy case"
+				// The thread is parked and can easily be moved to the active cluster
+				verify( thrd.curr_cluster != active_cluster() || thrd.curr_cluster == mainCluster );
+				thrd.curr_cluster = active_cluster();
+
+				// unpark the io_context poller
+				unpark( &thrd __cfaabi_dbg_ctx2 );
+			}
+			else {
+
+				// The thread is in a weird state
+				// I don't know what to do here
+				abort("io_context poller thread is in unexpected state, cannot clean up correctly\n");
+			}
+		} else {
+			unpark( &thrd __cfaabi_dbg_ctx2 );
+		}
+
+		^(this.thrd){};
+		__cfadbg_print_safe(io_core, "Kernel I/O : Stopped poller thread for io_context %p\n", &this);
+
+		__io_destroy( *this.thrd.ring );
+		__cfadbg_print_safe(io_core, "Kernel I/O : Destroyed ring for io_context %p\n", &this);
+
+		free(this.thrd.ring);
+	}
+
+	void ^?{}(io_context & this) {
+		^(this){ false };
+	}
+
+	static void __io_create( __io_data & this, const io_context_params & params_in ) {
+		// Step 1 : call to setup
+		struct io_uring_params params;
+		memset(&params, 0, sizeof(params));
+		if( params_in.poll_submit   ) params.flags |= IORING_SETUP_SQPOLL;
+		if( params_in.poll_complete ) params.flags |= IORING_SETUP_IOPOLL;
+
+		uint32_t nentries = params_in.num_entries;
+
+		int fd = syscall(__NR_io_uring_setup, nentries, &params );
+		if(fd < 0) {
+			abort("KERNEL ERROR: IO_URING SETUP - %s\n", strerror(errno));
+		}
+
+		// Step 2 : mmap result
+		memset( &this, 0, sizeof(struct __io_data) );
+		struct __submition_data  & sq = this.submit_q;
+		struct __completion_data & cq = this.completion_q;
+
+		// calculate the right ring size
+		sq.ring_sz = params.sq_off.array + (params.sq_entries * sizeof(unsigned)           );
+		cq.ring_sz = params.cq_off.cqes  + (params.cq_entries * sizeof(struct io_uring_cqe));
+
+		// Requires features
+		#if defined(IORING_FEAT_SINGLE_MMAP)
+			// adjust the size according to the parameters
+			if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
+				cq.ring_sz = sq.ring_sz = max(cq.ring_sz, sq.ring_sz);
+			}
+		#endif
+
+		// mmap the Submit Queue into existence
+		sq.ring_ptr = mmap(0, sq.ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
+		if (sq.ring_ptr == (void*)MAP_FAILED) {
+			abort("KERNEL ERROR: IO_URING MMAP1 - %s\n", strerror(errno));
+		}
+
+		// Requires features
+		#if defined(IORING_FEAT_SINGLE_MMAP)
+			// mmap the Completion Queue into existence (may or may not be needed)
+			if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
+				cq.ring_ptr = sq.ring_ptr;
+			}
+			else
+		#endif
+		{
+			// We need multiple calls to mmap
+			cq.ring_ptr = mmap(0, cq.ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
+			if (cq.ring_ptr == (void*)MAP_FAILED) {
+				munmap(sq.ring_ptr, sq.ring_sz);
+				abort("KERNEL ERROR: IO_URING MMAP2 - %s\n", strerror(errno));
+			}
+		}
+
+		// mmap the submit queue entries
+		size_t size = params.sq_entries * sizeof(struct io_uring_sqe);
+		sq.sqes = (struct io_uring_sqe *)mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
+		if (sq.sqes == (struct io_uring_sqe *)MAP_FAILED) {
+			munmap(sq.ring_ptr, sq.ring_sz);
+			if (cq.ring_ptr != sq.ring_ptr) munmap(cq.ring_ptr, cq.ring_sz);
+			abort("KERNEL ERROR: IO_URING MMAP3 - %s\n", strerror(errno));
+		}
+
+		// Get the pointers from the kernel to fill the structure
+		// submit queue
+		sq.head    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.head);
+		sq.tail    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.tail);
+		sq.mask    = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_mask);
+		sq.num     = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_entries);
+		sq.flags   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.flags);
+		sq.dropped = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.dropped);
+		sq.array   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);
+		sq.prev_head = *sq.head;
+
+		{
+			const uint32_t num = *sq.num;
+			for( i; num ) {
+				sq.sqes[i].user_data = 0ul64;
+			}
+		}
+
+		(sq.lock){};
+		(sq.release_lock){};
+
+		if( params_in.poller_submits || params_in.eager_submits ) {
+			/* paranoid */ verify( is_pow2( params_in.num_ready ) || (params_in.num_ready < 8) );
+			sq.ready_cnt = max( params_in.num_ready, 8 );
+			sq.ready = alloc_align( 64, sq.ready_cnt );
+			for(i; sq.ready_cnt) {
+				sq.ready[i] = -1ul32;
+			}
+		}
+		else {
+			sq.ready_cnt = 0;
+			sq.ready = 0p;
+		}
+
+		// completion queue
+		cq.head     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.head);
+		cq.tail     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.tail);
+		cq.mask     = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_mask);
+		cq.num      = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_entries);
+		cq.overflow = (         uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.overflow);
+		cq.cqes   = (struct io_uring_cqe *)(((intptr_t)cq.ring_ptr) + params.cq_off.cqes);
+
+		// some paranoid checks
+		/* paranoid */ verifyf( (*cq.mask) == ((*cq.num) - 1ul32), "IO_URING Expected mask to be %u (%u entries), was %u", (*cq.num) - 1ul32, *cq.num, *cq.mask  );
+		/* paranoid */ verifyf( (*cq.num)  >= nentries, "IO_URING Expected %u entries, got %u", nentries, *cq.num );
+		/* paranoid */ verifyf( (*cq.head) == 0, "IO_URING Expected head to be 0, got %u", *cq.head );
+		/* paranoid */ verifyf( (*cq.tail) == 0, "IO_URING Expected tail to be 0, got %u", *cq.tail );
+
+		/* paranoid */ verifyf( (*sq.mask) == ((*sq.num) - 1ul32), "IO_URING Expected mask to be %u (%u entries), was %u", (*sq.num) - 1ul32, *sq.num, *sq.mask );
+		/* paranoid */ verifyf( (*sq.num) >= nentries, "IO_URING Expected %u entries, got %u", nentries, *sq.num );
+		/* paranoid */ verifyf( (*sq.head) == 0, "IO_URING Expected head to be 0, got %u", *sq.head );
+		/* paranoid */ verifyf( (*sq.tail) == 0, "IO_URING Expected tail to be 0, got %u", *sq.tail );
+
+		// Update the global ring info
+		this.ring_flags = params.flags;
+		this.fd         = fd;
+		this.eager_submits  = params_in.eager_submits;
+		this.poller_submits = params_in.poller_submits;
+	}
+
+	static void __io_destroy( __io_data & this ) {
+		// Shutdown the io rings
+		struct __submition_data  & sq = this.submit_q;
+		struct __completion_data & cq = this.completion_q;
+
+		// unmap the submit queue entries
+		munmap(sq.sqes, (*sq.num) * sizeof(struct io_uring_sqe));
+
+		// unmap the Submit Queue ring
+		munmap(sq.ring_ptr, sq.ring_sz);
+
+		// unmap the Completion Queue ring, if it is different
+		if (cq.ring_ptr != sq.ring_ptr) {
+			munmap(cq.ring_ptr, cq.ring_sz);
+		}
+
+		// close the file descriptor
+		close(this.fd);
+
+		free( this.submit_q.ready ); // Maybe null, doesn't matter
+	}
+
+//=============================================================================================
+// I/O Context Sleep
+//=============================================================================================
+
+	void __ioctx_register($io_ctx_thread & ctx, struct epoll_event & ev) {
+		ev.events = EPOLLIN | EPOLLONESHOT;
+		ev.data.u64 = (uint64_t)&ctx;
+		int ret = epoll_ctl(iopoll.epollfd, EPOLL_CTL_ADD, ctx.ring->fd, &ev);
+		if (ret < 0) {
+			abort( "KERNEL ERROR: EPOLL ADD - (%d) %s\n", (int)errno, strerror(errno) );
+		}
+	}
+
+	void __ioctx_prepare_block($io_ctx_thread & ctx, struct epoll_event & ev) {
+		int ret = epoll_ctl(iopoll.epollfd, EPOLL_CTL_MOD, ctx.ring->fd, &ev);
+		if (ret < 0) {
+			abort( "KERNEL ERROR: EPOLL REARM - (%d) %s\n", (int)errno, strerror(errno) );
+		}
+	}
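+
+	// Note: the ring fd is registered with EPOLLONESHOT, so epoll disarms it
+	// after each wakeup. The poller must call __ioctx_prepare_block (whose
+	// EPOLL_CTL_MOD re-arms the oneshot) *before* waiting on its semaphore;
+	// re-arming first ensures a completion racing with the park still raises
+	// an epoll event and a matching post.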
+
+//=============================================================================================
+// I/O Context Misc Setup
+//=============================================================================================
+	void register_fixed_files( io_context & ctx, int * files, unsigned count ) {
+		int ret = syscall( __NR_io_uring_register, ctx.thrd.ring->fd, IORING_REGISTER_FILES, files, count );
+		if( ret < 0 ) {
+			abort( "KERNEL ERROR: IO_URING SYSCALL - (%d) %s\n", (int)errno, strerror(errno) );
+		}
+
+		__cfadbg_print_safe( io_core, "Kernel I/O : Performed io_register for %p, returned %d\n", active_thread(), ret );
+	}
+
+	void register_fixed_files( cluster & cltr, int * files, unsigned count ) {
+		for(i; cltr.io.cnt) {
+			register_fixed_files( cltr.io.ctxs[i], files, count );
+		}
+	}
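+
+	// Note: registered files can then be referenced by index in an sqe with
+	// IOSQE_FIXED_FILE set, skipping the kernel's per-operation fd lookup.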
+#endif
Index: libcfa/src/concurrency/io/types.hfa
===================================================================
--- libcfa/src/concurrency/io/types.hfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
+++ libcfa/src/concurrency/io/types.hfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -0,0 +1,128 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2020 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// io/types.hfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Fri Jul 31 16:22:47 2020
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#pragma once
+
+#if defined(CFA_HAVE_LINUX_IO_URING_H)
+	#include "bits/locks.hfa"
+
+	//-----------------------------------------------------------------------
+	// Ring Data structure
+	struct __submition_data {
+		// Head and tail of the ring (associated with array)
+		volatile uint32_t * head;
+		volatile uint32_t * tail;
+		volatile uint32_t prev_head;
+
+		// The actual kernel ring which uses head/tail
+		// indexes into the sqes buffer
+		uint32_t * array;
+
+		// number of entries and mask to go with it
+		const uint32_t * num;
+		const uint32_t * mask;
+
+		// Submission ring flags (e.g., IORING_SQ_NEED_WAKEUP under SQPOLL)
+		uint32_t * flags;
+
+		// number of invalid sqes dropped by the kernel
+		uint32_t * dropped;
+
+		// Like head/tail but not seen by the kernel
+		volatile uint32_t * ready;
+		uint32_t ready_cnt;
+
+		__spinlock_t lock;
+		__spinlock_t release_lock;
+
+		// A buffer of sqes (not the actual ring)
+		struct io_uring_sqe * sqes;
+
+		// The location and size of the mmaped area
+		void * ring_ptr;
+		size_t ring_sz;
+	};
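+
+	// Layout note: the kernel consumes indices from array[*head..*tail)
+	// (wrapped by *mask), and each index selects an entry in the sqes buffer;
+	// the ready ring is a user-side staging area for allocated sqe indices
+	// that have not yet been published to the kernel's tail.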
+
+	struct __completion_data {
+		// Head and tail of the ring
+		volatile uint32_t * head;
+		volatile uint32_t * tail;
+
+		// number of entries and mask to go with it
+		const uint32_t * mask;
+		const uint32_t * num;
+
+		// number of completions dropped because the CQ ring was full
+		uint32_t * overflow;
+
+		// the kernel ring
+		struct io_uring_cqe * cqes;
+
+		// The location and size of the mmaped area
+		void * ring_ptr;
+		size_t ring_sz;
+	};
+
+	struct __io_data {
+		struct __submition_data submit_q;
+		struct __completion_data completion_q;
+		uint32_t ring_flags;
+		int fd;
+		bool eager_submits:1;
+		bool poller_submits:1;
+	};
+
+
+	//-----------------------------------------------------------------------
+	// IO user data
+	struct __io_user_data_t {
+		int32_t result;
+		$thread * thrd;
+	};
+
+	//-----------------------------------------------------------------------
+	// Misc
+	// Weirdly, some systems that do support io_uring don't actually define these
+	#ifdef __alpha__
+		/*
+		* alpha is the only exception, all other architectures
+		* have common numbers for new system calls.
+		*/
+		#ifndef __NR_io_uring_setup
+			#define __NR_io_uring_setup           535
+		#endif
+		#ifndef __NR_io_uring_enter
+			#define __NR_io_uring_enter           536
+		#endif
+		#ifndef __NR_io_uring_register
+			#define __NR_io_uring_register        537
+		#endif
+	#else /* !__alpha__ */
+		#ifndef __NR_io_uring_setup
+			#define __NR_io_uring_setup           425
+		#endif
+		#ifndef __NR_io_uring_enter
+			#define __NR_io_uring_enter           426
+		#endif
+		#ifndef __NR_io_uring_register
+			#define __NR_io_uring_register        427
+		#endif
+	#endif
+
+	struct epoll_event;
+	struct $io_ctx_thread;
+	void __ioctx_register($io_ctx_thread & ctx, struct epoll_event & ev);
+	void __ioctx_prepare_block($io_ctx_thread & ctx, struct epoll_event & ev);
+#endif
Index: libcfa/src/concurrency/iocall.cfa
===================================================================
--- libcfa/src/concurrency/iocall.cfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/concurrency/iocall.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -14,5 +14,8 @@
 //
 
+#define __cforall_thread__
+
 #include "bits/defs.hfa"
+#include "kernel.hfa"
 
 //=============================================================================================
@@ -21,11 +24,14 @@
 
 #if defined(CFA_HAVE_LINUX_IO_URING_H)
+	#include <assert.h>
 	#include <stdint.h>
+	#include <errno.h>
 	#include <linux/io_uring.h>
 
-	#include "kernel_private.hfa"
+	#include "kernel/fwd.hfa"
+	#include "io/types.hfa"
 
 	extern [* struct io_uring_sqe, uint32_t] __submit_alloc( struct __io_data & ring, uint64_t data );
-	extern void __submit( struct __io_data & ring, uint32_t idx );
+	extern void __submit( struct io_context * ctx, uint32_t idx ) __attribute__((nonnull (1)));
 
 	static inline void ?{}(struct io_uring_sqe & this, uint8_t opcode, int fd) {
@@ -52,16 +58,68 @@
 	}
 
+	static inline io_context * __get_io_context( void ) {
+		cluster * cltr = active_cluster();
+		/* paranoid */ verifyf( cltr, "No active cluster for io operation\n");
+		assertf( cltr->io.cnt > 0, "Cluster %p has no default io contexts and no context was specified\n", cltr );
+		/* paranoid */ verifyf( cltr->io.ctxs, "default io contexts for cluster %p are missing\n", cltr);
+		return &cltr->io.ctxs[ __tls_rand() % cltr->io.cnt ];
+	}
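+
+	// Picking a context with __tls_rand() spreads submissions across all of
+	// the cluster's rings instead of contending on a single submission lock.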
+
+	#if defined(CFA_HAVE_IOSQE_FIXED_FILE) && defined(CFA_HAVE_IOSQE_IO_DRAIN) && defined(CFA_HAVE_IOSQE_ASYNC)
+		#define REGULAR_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_DRAIN | IOSQE_ASYNC)
+	#elif defined(CFA_HAVE_IOSQE_FIXED_FILE) && defined(CFA_HAVE_IOSQE_ASYNC)
+		#define REGULAR_FLAGS (IOSQE_FIXED_FILE | IOSQE_ASYNC)
+	#elif defined(CFA_HAVE_IOSQE_FIXED_FILE) && defined(CFA_HAVE_IOSQE_IO_DRAIN)
+		#define REGULAR_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_DRAIN)
+	#elif defined(CFA_HAVE_IOSQE_IO_DRAIN) && defined(CFA_HAVE_IOSQE_ASYNC)
+		#define REGULAR_FLAGS (IOSQE_IO_DRAIN | IOSQE_ASYNC)
+	#elif defined(CFA_HAVE_IOSQE_FIXED_FILE)
+		#define REGULAR_FLAGS (IOSQE_FIXED_FILE)
+	#elif defined(CFA_HAVE_IOSQE_IO_DRAIN)
+		#define REGULAR_FLAGS (IOSQE_IO_DRAIN)
+	#elif defined(CFA_HAVE_IOSQE_ASYNC)
+		#define REGULAR_FLAGS (IOSQE_ASYNC)
+	#else
+		#define REGULAR_FLAGS (0)
+	#endif
+
+	#if defined(CFA_HAVE_IOSQE_IO_LINK) && defined(CFA_HAVE_IOSQE_IO_HARDLINK)
+		#define LINK_FLAGS (IOSQE_IO_LINK | IOSQE_IO_HARDLINK)
+	#elif defined(CFA_HAVE_IOSQE_IO_LINK)
+		#define LINK_FLAGS (IOSQE_IO_LINK)
+	#elif defined(CFA_HAVE_IOSQE_IO_HARDLINK)
+		#define LINK_FLAGS (IOSQE_IO_HARDLINK)
+	#else
+		#define LINK_FLAGS (0)
+	#endif
+
+	#if defined(CFA_HAVE_SPLICE_F_FD_IN_FIXED)
+		#define SPLICE_FLAGS (SPLICE_F_FD_IN_FIXED)
+	#else
+		#define SPLICE_FLAGS (0)
+	#endif
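+
+	// REGULAR_FLAGS masks user-supplied submit_flags down to the sqe flags
+	// this kernel's headers define; LINK_FLAGS is used only to reject linked
+	// operations with ENOTSUP in __submit_prelude below.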
+
 	#define __submit_prelude \
+		if( 0 != (submit_flags & LINK_FLAGS) ) { errno = ENOTSUP; return -1; } \
+		(void)timeout; (void)cancellation; \
+		if( !context ) context = __get_io_context(); \
 		__io_user_data_t data = { 0, active_thread() }; \
-		struct __io_data & ring = *data.thrd->curr_cluster->io; \
+		struct __io_data & ring = *context->thrd.ring; \
 		struct io_uring_sqe * sqe; \
 		uint32_t idx; \
-		[sqe, idx] = __submit_alloc( ring, (uint64_t)(uintptr_t)&data );
+		[sqe, idx] = __submit_alloc( ring, (uint64_t)(uintptr_t)&data ); \
+		sqe->flags = REGULAR_FLAGS & submit_flags;
 
 	#define __submit_wait \
 		/*__cfaabi_bits_print_safe( STDERR_FILENO, "Preparing user data %p for %p\n", &data, data.thrd );*/ \
 		verify( sqe->user_data == (uint64_t)(uintptr_t)&data ); \
-		__submit( ring, idx ); \
+		__submit( context, idx ); \
 		park( __cfaabi_dbg_ctx ); \
+		if( data.result < 0 ) { \
+			errno = -data.result; \
+			return -1; \
+		} \
 		return data.result;
 #endif
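+
+// Usage sketch (shape only; names match the wrappers below): __submit_prelude
+// picks an io_context and allocates an sqe, the wrapper fills the sqe for its
+// opcode, and __submit_wait publishes it, parks until the cqe arrives, and
+// converts a negative result into errno/-1. E.g. the core of cfa_fsync is:
+//   __submit_prelude
+//   (*sqe){ IORING_OP_FSYNC, fd };
+//   __submit_wait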
@@ -70,4 +128,5 @@
 // I/O Forwards
 //=============================================================================================
+#include <time.hfa>
 
 // Some forward declarations
@@ -121,5 +180,5 @@
 // Asynchronous operations
 #if defined(HAVE_PREADV2)
-	ssize_t cfa_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags) {
+	ssize_t cfa_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 		#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_READV)
 			return preadv2(fd, iov, iovcnt, offset, flags);
@@ -132,13 +191,14 @@
 		#endif
 	}
-
-	ssize_t cfa_preadv2_fixed(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags) {
-		#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_READV)
-			return preadv2(fd, iov, iovcnt, offset, flags);
+#endif
+
+#if defined(HAVE_PWRITEV2)
+	ssize_t cfa_pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
+		#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_WRITEV)
+			return pwritev2(fd, iov, iovcnt, offset, flags);
 		#else
 			__submit_prelude
 
-			(*sqe){ IORING_OP_READV, fd, iov, iovcnt, offset };
-			sqe->flags |= IOSQE_FIXED_FILE;
+			(*sqe){ IORING_OP_WRITEV, fd, iov, iovcnt, offset };
 
 			__submit_wait
@@ -147,19 +207,5 @@
 #endif
 
-#if defined(HAVE_PWRITEV2)
-	ssize_t cfa_pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags) {
-		#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_WRITEV)
-			return pwritev2(fd, iov, iovcnt, offset, flags);
-		#else
-			__submit_prelude
-
-			(*sqe){ IORING_OP_WRITEV, fd, iov, iovcnt, offset };
-
-			__submit_wait
-		#endif
-	}
-#endif
-
-int cfa_fsync(int fd) {
+int cfa_fsync(int fd, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_FSYNC)
 		return fsync(fd);
@@ -173,5 +219,5 @@
 }
 
-int cfa_sync_file_range(int fd, int64_t offset, int64_t nbytes, unsigned int flags) {
+int cfa_sync_file_range(int fd, int64_t offset, int64_t nbytes, unsigned int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_SYNC_FILE_RANGE)
 		return sync_file_range(fd, offset, nbytes, flags);
@@ -189,5 +235,5 @@
 
 
-ssize_t cfa_sendmsg(int sockfd, const struct msghdr *msg, int flags) {
+ssize_t cfa_sendmsg(int sockfd, const struct msghdr *msg, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_SENDMSG)
 		return sendmsg(sockfd, msg, flags);
@@ -202,5 +248,5 @@
 }
 
-ssize_t cfa_recvmsg(int sockfd, struct msghdr *msg, int flags) {
+ssize_t cfa_recvmsg(int sockfd, struct msghdr *msg, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_RECVMSG)
 		return recvmsg(sockfd, msg, flags);
@@ -215,5 +261,5 @@
 }
 
-ssize_t cfa_send(int sockfd, const void *buf, size_t len, int flags) {
+ssize_t cfa_send(int sockfd, const void *buf, size_t len, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_SEND)
 		return send( sockfd, buf, len, flags );
@@ -230,5 +276,5 @@
 }
 
-ssize_t cfa_recv(int sockfd, void *buf, size_t len, int flags) {
+ssize_t cfa_recv(int sockfd, void *buf, size_t len, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_RECV)
 		return recv( sockfd, buf, len, flags );
@@ -245,5 +291,5 @@
 }
 
-int cfa_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags) {
+int cfa_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_ACCEPT)
 		return accept4( sockfd, addr, addrlen, flags );
@@ -260,5 +306,5 @@
 }
 
-int cfa_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen) {
+int cfa_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_CONNECT)
 		return connect( sockfd, addr, addrlen );
@@ -274,5 +320,5 @@
 }
 
-int cfa_fallocate(int fd, int mode, uint64_t offset, uint64_t len) {
+int cfa_fallocate(int fd, int mode, uint64_t offset, uint64_t len, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_FALLOCATE)
 		return fallocate( fd, mode, offset, len );
@@ -291,5 +337,5 @@
 }
 
-int cfa_fadvise(int fd, uint64_t offset, uint64_t len, int advice) {
+int cfa_fadvise(int fd, uint64_t offset, uint64_t len, int advice, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_FADVISE)
 		return posix_fadvise( fd, offset, len, advice );
@@ -306,5 +352,5 @@
 }
 
-int cfa_madvise(void *addr, size_t length, int advice) {
+int cfa_madvise(void *addr, size_t length, int advice, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_MADVISE)
 		return madvise( addr, length, advice );
@@ -321,5 +367,5 @@
 }
 
-int cfa_openat(int dirfd, const char *pathname, int flags, mode_t mode) {
+int cfa_openat(int dirfd, const char *pathname, int flags, mode_t mode, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_OPENAT)
 		return openat( dirfd, pathname, flags, mode );
@@ -336,5 +382,5 @@
 }
 
-int cfa_close(int fd) {
+int cfa_close(int fd, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_CLOSE)
 		return close( fd );
@@ -350,5 +396,5 @@
 // Forward declare in case it is not supported
 struct statx;
-int cfa_statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf) {
+int cfa_statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_STATX)
 		#if defined(__NR_statx)
@@ -362,11 +408,11 @@
 
 		(*sqe){ IORING_OP_STATX, dirfd, pathname, mask, (uint64_t)statxbuf };
-		sqe->flags = flags;
-
-		__submit_wait
-	#endif
-}
-
-ssize_t cfa_read(int fd, void *buf, size_t count) {
+		sqe->statx_flags = flags;
+
+		__submit_wait
+	#endif
+}
+
+ssize_t cfa_read(int fd, void *buf, size_t count, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_READ)
 		return read( fd, buf, count );
@@ -380,5 +426,5 @@
 }
 
-ssize_t cfa_write(int fd, void *buf, size_t count) {
+ssize_t cfa_write(int fd, void *buf, size_t count, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_WRITE)
 		return write( fd, buf, count );
@@ -392,5 +438,5 @@
 }
 
-ssize_t cfa_splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags) {
+ssize_t cfa_splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_SPLICE)
 		return splice( fd_in, off_in, fd_out, off_out, len, flags );
@@ -413,39 +459,11 @@
 			sqe->splice_off_in = (uint64_t)-1;
 		}
-		sqe->splice_flags  = flags;
-
-		__submit_wait
-	#endif
-}
-
-ssize_t cfa_splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags, int in_flags, int out_flags) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_SPLICE)
-		return splice( fd_in, off_in, fd_out, off_out, len, flags );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_SPLICE, fd_out };
-		if( off_out ) {
-			sqe->off = *off_out;
-		}
-		else {
-			sqe->off = (uint64_t)-1;
-		}
-		sqe->len = len;
-		sqe->splice_fd_in  = fd_in;
-		if( off_in ) {
-			sqe->splice_off_in = *off_in;
-		}
-		else {
-			sqe->splice_off_in = (uint64_t)-1;
-		}
-		sqe->splice_flags  = flags | out_flags;
-		sqe->flags = in_flags;
-
-		__submit_wait
-	#endif
-}
-
-ssize_t cfa_tee(int fd_in, int fd_out, size_t len, unsigned int flags) {
+		sqe->splice_flags  = flags | (SPLICE_FLAGS & submit_flags);
+
+		__submit_wait
+	#endif
+}
+
+ssize_t cfa_tee(int fd_in, int fd_out, size_t len, unsigned int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_TEE)
 		return tee( fd_in, fd_out, len, flags );
@@ -455,5 +473,5 @@
 		(*sqe){ IORING_OP_TEE, fd_out, 0p, len, 0 };
 		sqe->splice_fd_in = fd_in;
-		sqe->splice_flags = flags;
+		sqe->splice_flags  = flags | (SPLICE_FLAGS & submit_flags);
 
 		__submit_wait
@@ -562,6 +580,5 @@
 
 		if( /*func == (fptr_t)splice || */
-			func == (fptr_t)(ssize_t (*)(int, loff_t *, int, loff_t *, size_t, unsigned int))cfa_splice,
-			func == (fptr_t)(ssize_t (*)(int, loff_t *, int, loff_t *, size_t, unsigned int, int, int))cfa_splice )
+			func == (fptr_t)cfa_splice )
 			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_SPLICE ,
 			return IS_DEFINED(CFA_HAVE_IORING_OP_SPLICE);
Index: libcfa/src/concurrency/iofwd.hfa
===================================================================
--- libcfa/src/concurrency/iofwd.hfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/concurrency/iofwd.hfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -19,6 +19,27 @@
 extern "C" {
 	#include <sys/types.h>
+	#if defined(CFA_HAVE_LINUX_IO_URING_H)
+		#include <linux/io_uring.h>
+	#endif
 }
 #include "bits/defs.hfa"
+#include "time.hfa"
+
+#if defined(CFA_HAVE_IOSQE_FIXED_FILE)
+	#define CFA_IO_FIXED_FD1 IOSQE_FIXED_FILE
+#endif
+#if defined(CFA_HAVE_SPLICE_F_FD_IN_FIXED)
+	#define CFA_IO_FIXED_FD2 SPLICE_F_FD_IN_FIXED
+#endif
+#if defined(CFA_HAVE_IOSQE_IO_DRAIN)
+	#define CFA_IO_DRAIN IOSQE_IO_DRAIN
+#endif
+#if defined(CFA_HAVE_IOSQE_ASYNC)
+	#define CFA_IO_ASYNC IOSQE_ASYNC
+#endif
+
+struct cluster;
+struct io_context;
+struct io_cancellation;
 
 struct iovec;
@@ -27,26 +48,30 @@
 struct statx;
 
-extern ssize_t cfa_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
-extern ssize_t cfa_pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
-extern int cfa_fsync(int fd);
-extern int cfa_sync_file_range(int fd, int64_t offset, int64_t nbytes, unsigned int flags);
-extern ssize_t cfa_sendmsg(int sockfd, const struct msghdr *msg, int flags);
-extern ssize_t cfa_recvmsg(int sockfd, struct msghdr *msg, int flags);
-extern ssize_t cfa_send(int sockfd, const void *buf, size_t len, int flags);
-extern ssize_t cfa_recv(int sockfd, void *buf, size_t len, int flags);
-extern int cfa_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
-extern int cfa_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
-extern int cfa_fallocate(int fd, int mode, uint64_t offset, uint64_t len);
-extern int cfa_fadvise(int fd, uint64_t offset, uint64_t len, int advice);
-extern int cfa_madvise(void *addr, size_t length, int advice);
-extern int cfa_openat(int dirfd, const char *pathname, int flags, mode_t mode);
-extern int cfa_close(int fd);
-extern int cfa_statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf);
-extern ssize_t cfa_read(int fd, void *buf, size_t count);
-extern ssize_t cfa_write(int fd, void *buf, size_t count);
-extern ssize_t cfa_splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags);
-extern ssize_t cfa_tee(int fd_in, int fd_out, size_t len, unsigned int flags);
+extern ssize_t cfa_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern ssize_t cfa_pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern int cfa_fsync(int fd, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern int cfa_sync_file_range(int fd, int64_t offset, int64_t nbytes, unsigned int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern ssize_t cfa_sendmsg(int sockfd, const struct msghdr *msg, int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern ssize_t cfa_recvmsg(int sockfd, struct msghdr *msg, int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern ssize_t cfa_send(int sockfd, const void *buf, size_t len, int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern ssize_t cfa_recv(int sockfd, void *buf, size_t len, int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern int cfa_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern int cfa_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern int cfa_fallocate(int fd, int mode, uint64_t offset, uint64_t len, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern int cfa_fadvise(int fd, uint64_t offset, uint64_t len, int advice, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern int cfa_madvise(void *addr, size_t length, int advice, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern int cfa_openat(int dirfd, const char *pathname, int flags, mode_t mode, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern int cfa_close(int fd, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern int cfa_statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern ssize_t cfa_read(int fd, void *buf, size_t count, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern ssize_t cfa_write(int fd, void *buf, size_t count, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern ssize_t cfa_splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern ssize_t cfa_tee(int fd_in, int fd_out, size_t len, unsigned int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
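+
+// All trailing parameters default, so pre-existing call sites compile
+// unchanged; e.g. (sketch) cfa_read( fd, buf, count ) still works, while
+// cfa_read( fd, buf, count, CFA_IO_ASYNC ) additionally requests IOSQE_ASYNC
+// submission where the kernel supports it.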
 
 //-----------------------------------------------------------------------------
 // Check if a function blocks only the user thread
 bool has_user_level_blocking( fptr_t func );
+
+//-----------------------------------------------------------------------------
+void register_fixed_files( io_context & ctx , int * files, unsigned count );
+void register_fixed_files( cluster    & cltr, int * files, unsigned count );
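+
+// Sketch of intended use (not spelled out in this diff): register an array of
+// fds, then pass CFA_IO_FIXED_FD1 via submit_flags so an operation treats its
+// fd argument as an index into the registered set, mirroring io_uring's
+// IOSQE_FIXED_FILE.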
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/concurrency/kernel.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -18,22 +18,12 @@
 
 //C Includes
-#include <stddef.h>
 #include <errno.h>
-#include <string.h>
 #include <stdio.h>
-#include <fenv.h>
 #include <signal.h>
 #include <unistd.h>
-#include <limits.h>										// PTHREAD_STACK_MIN
-#include <sys/mman.h>									// mprotect
-extern "C" {
-#include <sys/resource.h>
-}
 
 //CFA Includes
-#include "time.hfa"
 #include "kernel_private.hfa"
 #include "preemption.hfa"
-#include "startup.hfa"
 
 //Private includes
@@ -45,12 +35,4 @@
 // Some assembly required
 #if defined( __i386 )
-	#define CtxGet( ctx )        \
-		__asm__ volatile (     \
-			"movl %%esp,%0\n"\
-			"movl %%ebp,%1\n"\
-			: "=rm" (ctx.SP),\
-				"=rm" (ctx.FP) \
-		)
-
 	// mxcr : SSE Status and Control bits (control bits are preserved across function calls)
 	// fcw  : X87 FPU control word (preserved across function calls)
@@ -74,12 +56,4 @@
 
 #elif defined( __x86_64 )
-	#define CtxGet( ctx )        \
-		__asm__ volatile (     \
-			"movq %%rsp,%0\n"\
-			"movq %%rbp,%1\n"\
-			: "=rm" (ctx.SP),\
-				"=rm" (ctx.FP) \
-		)
-
 	#define __x87_store         \
 		uint32_t __mxcr;      \
@@ -102,16 +76,10 @@
 
 #elif defined( __ARM_ARCH )
-#define CtxGet( ctx ) __asm__ ( \
-		"mov %0,%%sp\n"   \
-		"mov %1,%%r11\n"   \
-	: "=rm" (ctx.SP), "=rm" (ctx.FP) )
 #else
 	#error unknown hardware architecture
 #endif
 
-//-----------------------------------------------------------------------------
-//Start and stop routine for the kernel, declared first to make sure they run first
-static void __kernel_startup (void) __attribute__(( constructor( STARTUP_PRIORITY_KERNEL ) ));
-static void __kernel_shutdown(void) __attribute__(( destructor ( STARTUP_PRIORITY_KERNEL ) ));
+extern $thread * mainThread;
+extern processor * mainProcessor;
 
 //-----------------------------------------------------------------------------
@@ -120,244 +88,7 @@
 static bool __has_next_thread(cluster * this);
 static void __run_thread(processor * this, $thread * dst);
-static bool __wake_proc(processor *);
 static bool __wake_one(struct __processor_id_t * id, cluster * cltr);
 static void __halt(processor * this);
-
-//-----------------------------------------------------------------------------
-// Kernel storage
-KERNEL_STORAGE(cluster,	             mainCluster);
-KERNEL_STORAGE(processor,            mainProcessor);
-KERNEL_STORAGE($thread,	             mainThread);
-KERNEL_STORAGE(__stack_t,            mainThreadCtx);
-KERNEL_STORAGE(__scheduler_RWLock_t, __scheduler_lock);
-#if !defined(__CFA_NO_STATISTICS__)
-KERNEL_STORAGE(__stats_t, mainProcStats);
-#endif
-
-cluster              * mainCluster;
-processor            * mainProcessor;
-$thread              * mainThread;
-__scheduler_RWLock_t * __scheduler_lock;
-
-extern "C" {
-	struct { __dllist_t(cluster) list; __spinlock_t lock; } __cfa_dbg_global_clusters;
-}
-
-size_t __page_size = 0;
-
-//-----------------------------------------------------------------------------
-// Global state
-thread_local struct KernelThreadData kernelTLS __attribute__ ((tls_model ( "initial-exec" ))) @= {
-	NULL,												// cannot use 0p
-	NULL,
-	NULL,
-	{ 1, false, false },
-};
-
-//-----------------------------------------------------------------------------
-// Struct to steal stack
-struct current_stack_info_t {
-	__stack_t * storage;								// pointer to stack object
-	void * base;										// base of stack
-	void * limit;										// stack grows towards stack limit
-	void * context;										// address of cfa_context_t
-};
-
-void ?{}( current_stack_info_t & this ) {
-	__stack_context_t ctx;
-	CtxGet( ctx );
-	this.base = ctx.FP;
-
-	rlimit r;
-	getrlimit( RLIMIT_STACK, &r);
-	size_t size = r.rlim_cur;
-
-	this.limit = (void *)(((intptr_t)this.base) - size);
-	this.context = &storage_mainThreadCtx;
-}
-
-//-----------------------------------------------------------------------------
-// Main thread construction
-
-void ?{}( $coroutine & this, current_stack_info_t * info) with( this ) {
-	stack.storage = info->storage;
-	with(*stack.storage) {
-		limit     = info->limit;
-		base      = info->base;
-	}
-	__attribute__((may_alias)) intptr_t * istorage = (intptr_t*) &stack.storage;
-	*istorage |= 0x1;
-	name = "Main Thread";
-	state = Start;
-	starter = 0p;
-	last = 0p;
-	cancellation = 0p;
-}
-
-void ?{}( $thread & this, current_stack_info_t * info) with( this ) {
-	ticket = 1;
-	state = Start;
-	self_cor{ info };
-	curr_cor = &self_cor;
-	curr_cluster = mainCluster;
-	self_mon.owner = &this;
-	self_mon.recursion = 1;
-	self_mon_p = &self_mon;
-	link.next = 0p;
-	link.prev = 0p;
-
-	node.next = 0p;
-	node.prev = 0p;
-	doregister(curr_cluster, this);
-
-	monitors{ &self_mon_p, 1, (fptr_t)0 };
-}
-
-//-----------------------------------------------------------------------------
-// Processor coroutine
-void ?{}(processorCtx_t & this) {
-
-}
-
-// Construct the processor context of non-main processors
-static void ?{}(processorCtx_t & this, processor * proc, current_stack_info_t * info) {
-	(this.__cor){ info };
-	this.proc = proc;
-}
-
-static void * __invoke_processor(void * arg);
-
-static init(processor & this, const char name[], cluster & _cltr) with( this ) {
-	this.name = name;
-	this.cltr = &_cltr;
-	id = -1u;
-	destroyer = 0p;
-	do_terminate = false;
-	preemption_alarm = 0p;
-	pending_preemption = false;
-
-	#if !defined(__CFA_NO_STATISTICS__)
-		print_stats = 0;
-		print_halts = false;
-	#endif
-
-	int target = __atomic_add_fetch( &cltr->nprocessors, 1u, __ATOMIC_SEQ_CST );
-
-	id = doregister((__processor_id_t*)&this);
-
-	// Lock the RWlock so no-one pushes/pops while we are changing the queue
-	uint_fast32_t last_size = ready_mutate_lock();
-
-		// Adjust the ready queue size
-		ready_queue_grow( cltr, target );
-
-	// Unlock the RWlock
-	ready_mutate_unlock( last_size );
-
-	__cfadbg_print_safe(runtime_core, "Kernel : core %p created\n", &this);
-}
-
-// Not a ctor, it just preps the destruction but should not destroy members
-void deinit(processor & this) {
-
-	int target = __atomic_sub_fetch( &this.cltr->nprocessors, 1u, __ATOMIC_SEQ_CST );
-
-	// Lock the RWlock so no-one pushes/pops while we are changing the queue
-	uint_fast32_t last_size = ready_mutate_lock();
-
-		// Adjust the ready queue size
-		ready_queue_shrink( this.cltr, target );
-
-		// Make sure we aren't on the idle queue
-		unsafe_remove( this.cltr->idles, &this );
-
-	// Unlock the RWlock
-	ready_mutate_unlock( last_size );
-
-	// Finally we don't need the read_lock any more
-	unregister((__processor_id_t*)&this);
-}
-
-void ?{}(processor & this, const char name[], cluster & _cltr) {
-	( this.idle ){};
-	( this.terminated ){ 0 };
-	( this.runner ){};
-	init( this, name, _cltr );
-
-	__cfadbg_print_safe(runtime_core, "Kernel : Starting core %p\n", &this);
-
-	this.stack = __create_pthread( &this.kernel_thread, __invoke_processor, (void *)&this );
-
-}
-
-void ^?{}(processor & this) with( this ){
-	if( ! __atomic_load_n(&do_terminate, __ATOMIC_ACQUIRE) ) {
-		__cfadbg_print_safe(runtime_core, "Kernel : core %p signaling termination\n", &this);
-
-		__atomic_store_n(&do_terminate, true, __ATOMIC_RELAXED);
-		__wake_proc( &this );
-
-		P( terminated );
-		verify( kernelTLS.this_processor != &this);
-	}
-
-	int err = pthread_join( kernel_thread, 0p );
-	if( err != 0 ) abort("KERNEL ERROR: joining processor %p caused error %s\n", &this, strerror(err));
-
-	free( this.stack );
-
-	deinit( this );
-}
-
-void ?{}(cluster & this, const char name[], Duration preemption_rate, unsigned io_flags) with( this ) {
-	this.name = name;
-	this.preemption_rate = preemption_rate;
-	this.nprocessors = 0;
-	ready_queue{};
-
-	#if !defined(__CFA_NO_STATISTICS__)
-		print_stats = 0;
-		stats = alloc();
-		__init_stats( stats );
-	#endif
-
-	threads{ __get };
-
-	doregister(this);
-
-	// Lock the RWlock so no-one pushes/pops while we are changing the queue
-	uint_fast32_t last_size = ready_mutate_lock();
-
-		// Adjust the ready queue size
-		ready_queue_grow( &this, 0 );
-
-	// Unlock the RWlock
-	ready_mutate_unlock( last_size );
-
-
-	__kernel_io_startup( this, io_flags, &this == mainCluster );
-}
-
-void ^?{}(cluster & this) {
-	__kernel_io_shutdown( this, &this == mainCluster );
-
-	// Lock the RWlock so no-one pushes/pops while we are changing the queue
-	uint_fast32_t last_size = ready_mutate_lock();
-
-		// Adjust the ready queue size
-		ready_queue_shrink( &this, 0 );
-
-	// Unlock the RWlock
-	ready_mutate_unlock( last_size );
-
-	#if !defined(__CFA_NO_STATISTICS__)
-		if( 0 != this.print_stats ) {
-			__print_stats( this.stats, this.print_stats, true, this.name, (void*)&this );
-		}
-		free( this.stats );
-	#endif
-
-	unregister(this);
-}
+bool __wake_proc(processor *);
 
 //=============================================================================================
@@ -550,147 +281,4 @@
 }
 
-// KERNEL_ONLY
-// Context invoker for processors
-// This is the entry point for processors (kernel threads)
-// It effectively constructs a coroutine by stealing the pthread stack
-static void * __invoke_processor(void * arg) {
-	#if !defined( __CFA_NO_STATISTICS__ )
-		__stats_t local_stats;
-		__init_stats( &local_stats );
-		kernelTLS.this_stats = &local_stats;
-	#endif
-
-	processor * proc = (processor *) arg;
-	kernelTLS.this_processor = proc;
-	kernelTLS.this_thread    = 0p;
-	kernelTLS.preemption_state.[enabled, disable_count] = [false, 1];
-	// SKULLDUGGERY: We want to create a context for the processor coroutine
-	// which is needed for the 2-step context switch. However, there is no reason
-	// to waste the perfectly valid stack create by pthread.
-	current_stack_info_t info;
-	__stack_t ctx;
-	info.storage = &ctx;
-	(proc->runner){ proc, &info };
-
-	__cfaabi_dbg_print_safe("Coroutine : created stack %p\n", get_coroutine(proc->runner)->stack.storage);
-
-	//Set global state
-	kernelTLS.this_thread = 0p;
-
-	//We now have a proper context from which to schedule threads
-	__cfadbg_print_safe(runtime_core, "Kernel : core %p created (%p, %p)\n", proc, &proc->runner, &ctx);
-
-	// SKULLDUGGERY: Since the coroutine doesn't have its own stack, we can't
-	// resume it to start it like it normally would, it will just context switch
-	// back to here. Instead directly call the main since we already are on the
-	// appropriate stack.
-	get_coroutine(proc->runner)->state = Active;
-	main( proc->runner );
-	get_coroutine(proc->runner)->state = Halted;
-
-	// Main routine of the core returned, the core is now fully terminated
-	__cfadbg_print_safe(runtime_core, "Kernel : core %p main ended (%p)\n", proc, &proc->runner);
-
-	#if !defined(__CFA_NO_STATISTICS__)
-		__tally_stats(proc->cltr->stats, &local_stats);
-		if( 0 != proc->print_stats ) {
-			__print_stats( &local_stats, proc->print_stats, true, proc->name, (void*)proc );
-		}
-	#endif
-
-	return 0p;
-}
-
-static void Abort( int ret, const char func[] ) {
-	if ( ret ) {										// pthread routines return errno values
-		abort( "%s : internal error, error(%d) %s.", func, ret, strerror( ret ) );
-	} // if
-} // Abort
-
-void * __create_pthread( pthread_t * pthread, void * (*start)(void *), void * arg ) {
-	pthread_attr_t attr;
-
-	Abort( pthread_attr_init( &attr ), "pthread_attr_init" ); // initialize attribute
-
-	size_t stacksize;
-	// default stack size, normally defined by shell limit
-	Abort( pthread_attr_getstacksize( &attr, &stacksize ), "pthread_attr_getstacksize" );
-	assert( stacksize >= PTHREAD_STACK_MIN );
-
-	void * stack;
-	__cfaabi_dbg_debug_do(
-		stack = memalign( __page_size, stacksize + __page_size );
-		// pthread has no mechanism to create the guard page in user supplied stack.
-		if ( mprotect( stack, __page_size, PROT_NONE ) == -1 ) {
-			abort( "mprotect : internal error, mprotect failure, error(%d) %s.", errno, strerror( errno ) );
-		} // if
-	);
-	__cfaabi_dbg_no_debug_do(
-		stack = malloc( stacksize );
-	);
-
-	Abort( pthread_attr_setstack( &attr, stack, stacksize ), "pthread_attr_setstack" );
-
-	Abort( pthread_create( pthread, &attr, start, arg ), "pthread_create" );
-	return stack;
-}
-
-// KERNEL_ONLY
-static void __kernel_first_resume( processor * this ) {
-	$thread * src = mainThread;
-	$coroutine * dst = get_coroutine(this->runner);
-
-	verify( ! kernelTLS.preemption_state.enabled );
-
-	kernelTLS.this_thread->curr_cor = dst;
-	__stack_prepare( &dst->stack, 65000 );
-	__cfactx_start(main, dst, this->runner, __cfactx_invoke_coroutine);
-
-	verify( ! kernelTLS.preemption_state.enabled );
-
-	dst->last = &src->self_cor;
-	dst->starter = dst->starter ? dst->starter : &src->self_cor;
-
-	// make sure the current state is still correct
-	/* paranoid */ verify(src->state == Ready);
-
-	// context switch to specified coroutine
-	verify( dst->context.SP );
-	__cfactx_switch( &src->context, &dst->context );
-	// when __cfactx_switch returns we are back in the src coroutine
-
-	mainThread->curr_cor = &mainThread->self_cor;
-
-	// make sure the current state has been update
-	/* paranoid */ verify(src->state == Active);
-
-	verify( ! kernelTLS.preemption_state.enabled );
-}
-
-// KERNEL_ONLY
-static void __kernel_last_resume( processor * this ) {
-	$coroutine * src = &mainThread->self_cor;
-	$coroutine * dst = get_coroutine(this->runner);
-
-	verify( ! kernelTLS.preemption_state.enabled );
-	verify( dst->starter == src );
-	verify( dst->context.SP );
-
-	// SKULLDUGGERY in debug the processors check that the
-	// stack is still within the limit of the stack limits after running a thread.
-	// that check doesn't make sense if we context switch to the processor using the
-	// coroutine semantics. Since this is a special case, use the current context
-	// info to populate these fields.
-	__cfaabi_dbg_debug_do(
-		__stack_context_t ctx;
-		CtxGet( ctx );
-		mainThread->context.SP = ctx.SP;
-		mainThread->context.FP = ctx.FP;
-	)
-
-	// context switch to the processor
-	__cfactx_switch( &src->context, &dst->context );
-}
-
 //-----------------------------------------------------------------------------
 // Scheduler routines
@@ -834,148 +422,4 @@
 
 //=============================================================================================
-// Kernel Setup logic
-//=============================================================================================
-//-----------------------------------------------------------------------------
-// Kernel boot procedures
-static void __kernel_startup(void) {
-	verify( ! kernelTLS.preemption_state.enabled );
-	__cfadbg_print_safe(runtime_core, "Kernel : Starting\n");
-
-	__page_size = sysconf( _SC_PAGESIZE );
-
-	__cfa_dbg_global_clusters.list{ __get };
-	__cfa_dbg_global_clusters.lock{};
-
-	// Initialize the global scheduler lock
-	__scheduler_lock = (__scheduler_RWLock_t*)&storage___scheduler_lock;
-	(*__scheduler_lock){};
-
-	// Initialize the main cluster
-	mainCluster = (cluster *)&storage_mainCluster;
-	(*mainCluster){"Main Cluster"};
-
-	__cfadbg_print_safe(runtime_core, "Kernel : Main cluster ready\n");
-
-	// Start by initializing the main thread
-	// SKULLDUGGERY: the mainThread steals the process main thread
-	// which will then be scheduled by the mainProcessor normally
-	mainThread = ($thread *)&storage_mainThread;
-	current_stack_info_t info;
-	info.storage = (__stack_t*)&storage_mainThreadCtx;
-	(*mainThread){ &info };
-
-	__cfadbg_print_safe(runtime_core, "Kernel : Main thread ready\n");
-
-
-
-	// Construct the processor context of the main processor
-	void ?{}(processorCtx_t & this, processor * proc) {
-		(this.__cor){ "Processor" };
-		this.__cor.starter = 0p;
-		this.proc = proc;
-	}
-
-	void ?{}(processor & this) with( this ) {
-		( this.idle ){};
-		( this.terminated ){ 0 };
-		( this.runner ){};
-		init( this, "Main Processor", *mainCluster );
-		kernel_thread = pthread_self();
-
-		runner{ &this };
-		__cfadbg_print_safe(runtime_core, "Kernel : constructed main processor context %p\n", &runner);
-	}
-
-	// Initialize the main processor and the main processor ctx
-	// (the coroutine that contains the processing control flow)
-	mainProcessor = (processor *)&storage_mainProcessor;
-	(*mainProcessor){};
-
-	//initialize the global state variables
-	kernelTLS.this_processor = mainProcessor;
-	kernelTLS.this_thread    = mainThread;
-
-	#if !defined( __CFA_NO_STATISTICS__ )
-		kernelTLS.this_stats = (__stats_t *)& storage_mainProcStats;
-		__init_stats( kernelTLS.this_stats );
-	#endif
-
-	// Enable preemption
-	kernel_start_preemption();
-
-	// Add the main thread to the ready queue
-	// once resume is called on mainProcessor->runner the mainThread needs to be scheduled like any normal thread
-	__schedule_thread((__processor_id_t *)mainProcessor, mainThread);
-
-	// SKULLDUGGERY: Force a context switch to the main processor to set the main thread's context to the current UNIX
-	// context. Hence, the main thread does not begin through __cfactx_invoke_thread, like all other threads. The trick here is that
-	// mainThread is on the ready queue when this call is made.
-	__kernel_first_resume( kernelTLS.this_processor );
-
-
-	// THE SYSTEM IS NOW COMPLETELY RUNNING
-
-
-	// Now that the system is up, finish creating systems that need threading
-	__kernel_io_finish_start( *mainCluster );
-
-
-	__cfadbg_print_safe(runtime_core, "Kernel : Started\n--------------------------------------------------\n\n");
-
-	verify( ! kernelTLS.preemption_state.enabled );
-	enable_interrupts( __cfaabi_dbg_ctx );
-	verify( TL_GET( preemption_state.enabled ) );
-}
-
-static void __kernel_shutdown(void) {
-	//Before we start shutting things down, wait for systems that need threading to shutdown
-	__kernel_io_prepare_stop( *mainCluster );
-
-	/* paranoid */ verify( TL_GET( preemption_state.enabled ) );
-	disable_interrupts();
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-
-	__cfadbg_print_safe(runtime_core, "\n--------------------------------------------------\nKernel : Shutting down\n");
-
-	// SKULLDUGGERY: Notify the mainProcessor it needs to terminates.
-	// When its coroutine terminates, it return control to the mainThread
-	// which is currently here
-	__atomic_store_n(&mainProcessor->do_terminate, true, __ATOMIC_RELEASE);
-	__kernel_last_resume( kernelTLS.this_processor );
-	mainThread->self_cor.state = Halted;
-
-	// THE SYSTEM IS NOW COMPLETELY STOPPED
-
-	// Disable preemption
-	kernel_stop_preemption();
-
-	// Destroy the main processor and its context in reverse order of construction
-	// These were manually constructed so we need manually destroy them
-	void ^?{}(processor & this) with( this ){
-		deinit( this );
-
-		/* paranoid */ verify( this.do_terminate == true );
-		__cfaabi_dbg_print_safe("Kernel : destroyed main processor context %p\n", &runner);
-	}
-
-	^(*mainProcessor){};
-
-	// Final step, destroy the main thread since it is no longer needed
-
-	// Since we provided a stack to this taxk it will not destroy anything
-	/* paranoid */ verify(mainThread->self_cor.stack.storage == (__stack_t*)(((uintptr_t)&storage_mainThreadCtx)| 0x1));
-	^(*mainThread){};
-
-	^(*mainCluster){};
-
-	^(*__scheduler_lock){};
-
-	^(__cfa_dbg_global_clusters.list){};
-	^(__cfa_dbg_global_clusters.lock){};
-
-	__cfadbg_print_safe(runtime_core, "Kernel : Shutdown complete\n");
-}
-
-//=============================================================================================
 // Kernel Idle Sleep
 //=============================================================================================
@@ -997,5 +441,5 @@
 
 // Unconditionally wake a processor
-static bool __wake_proc(processor * this) {
+bool __wake_proc(processor * this) {
 	__cfadbg_print_safe(runtime_core, "Kernel : waking Processor %p\n", this);
 
@@ -1075,5 +519,5 @@
 
 void kernel_abort_msg( void * kernel_data, char * abort_text, int abort_text_size ) {
-	$thread * thrd = kernel_data;
+	$thread * thrd = ( $thread * ) kernel_data;
 
 	if(thrd) {
@@ -1170,32 +614,4 @@
 
 	return thrd != 0p;
-}
-
-//-----------------------------------------------------------------------------
-// Global Queues
-void doregister( cluster     & cltr ) {
-	lock      ( __cfa_dbg_global_clusters.lock __cfaabi_dbg_ctx2);
-	push_front( __cfa_dbg_global_clusters.list, cltr );
-	unlock    ( __cfa_dbg_global_clusters.lock );
-}
-
-void unregister( cluster     & cltr ) {
-	lock  ( __cfa_dbg_global_clusters.lock __cfaabi_dbg_ctx2);
-	remove( __cfa_dbg_global_clusters.list, cltr );
-	unlock( __cfa_dbg_global_clusters.lock );
-}
-
-void doregister( cluster * cltr, $thread & thrd ) {
-	lock      (cltr->thread_list_lock __cfaabi_dbg_ctx2);
-	cltr->nthreads += 1;
-	push_front(cltr->threads, thrd);
-	unlock    (cltr->thread_list_lock);
-}
-
-void unregister( cluster * cltr, $thread & thrd ) {
-	lock  (cltr->thread_list_lock __cfaabi_dbg_ctx2);
-	remove(cltr->threads, thrd );
-	cltr->nthreads -= 1;
-	unlock(cltr->thread_list_lock);
 }
 
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/concurrency/kernel.hfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -16,7 +16,4 @@
 #pragma once
 
-#include <stdbool.h>
-#include <stdint.h>
-
 #include "invoke.h"
 #include "time_t.hfa"
@@ -26,6 +23,5 @@
 
 extern "C" {
-#include <pthread.h>
-#include <semaphore.h>
+#include <bits/pthreadtypes.h>
 }
 
@@ -129,11 +125,42 @@
 struct __io_data;
 
-#define CFA_CLUSTER_IO_POLLER_USER_THREAD    (1 << 0) // 0x01
-#define CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS (1 << 1) // 0x02
-#define CFA_CLUSTER_IO_EAGER_SUBMITS         (1 << 2) // 0x04
-#define CFA_CLUSTER_IO_KERNEL_POLL_SUBMITS   (1 << 3) // 0x08
-#define CFA_CLUSTER_IO_KERNEL_POLL_COMPLETES (1 << 4) // 0x10
-#define CFA_CLUSTER_IO_BUFFLEN_OFFSET        16
-
+// IO poller user-thread
+// Not using the "thread" keyword because we want to control
+// more carefully when to start/stop it
+struct $io_ctx_thread {
+	struct __io_data * ring;
+	single_sem sem;
+	volatile bool done;
+	$thread self;
+};
+
+
+struct io_context {
+	$io_ctx_thread thrd;
+};
+
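+// Per-io_context tuning; defaults are provided by ?{}(io_context_params &)
+// below. num_entries presumably selects the io_uring ring size (assumption,
+// not confirmed here); the bitfields replace the old CFA_CLUSTER_IO_* flags.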
+struct io_context_params {
+	int num_entries;
+	int num_ready;
+	int submit_aff;
+	bool eager_submits:1;
+	bool poller_submits:1;
+	bool poll_submit:1;
+	bool poll_complete:1;
+};
+
+void  ?{}(io_context_params & this);
+
+void  ?{}(io_context & this, struct cluster & cl);
+void  ?{}(io_context & this, struct cluster & cl, const io_context_params & params);
+void ^?{}(io_context & this);
+
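+// Cancellation handle for an in-flight operation; target == -1u (the value
+// set by the default constructor below) presumably means nothing to cancel.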
+struct io_cancellation {
+	uint32_t target;
+};
+
+static inline void  ?{}(io_cancellation & this) { this.target = -1u; }
+static inline void ^?{}(io_cancellation & this) {}
+bool cancel(io_cancellation & this);
 
 //-----------------------------------------------------------------------------
@@ -206,5 +233,8 @@
 	} node;
 
-	struct __io_data * io;
+	struct {
+		io_context * ctxs;
+		unsigned cnt;
+	} io;
 
 	#if !defined(__CFA_NO_STATISTICS__)
@@ -215,13 +245,19 @@
 extern Duration default_preemption();
 
-void ?{} (cluster & this, const char name[], Duration preemption_rate, unsigned flags);
+void ?{} (cluster & this, const char name[], Duration preemption_rate, unsigned num_io, const io_context_params & io_params);
 void ^?{}(cluster & this);
 
-static inline void ?{} (cluster & this)                                           { this{"Anonymous Cluster", default_preemption(), 0}; }
-static inline void ?{} (cluster & this, Duration preemption_rate)                 { this{"Anonymous Cluster", preemption_rate, 0}; }
-static inline void ?{} (cluster & this, const char name[])                        { this{name, default_preemption(), 0}; }
-static inline void ?{} (cluster & this, unsigned flags)                           { this{"Anonymous Cluster", default_preemption(), flags}; }
-static inline void ?{} (cluster & this, Duration preemption_rate, unsigned flags) { this{"Anonymous Cluster", preemption_rate, flags}; }
-static inline void ?{} (cluster & this, const char name[], unsigned flags)        { this{name, default_preemption(), flags}; }
+static inline void ?{} (cluster & this)                                            { io_context_params default_params;    this{"Anonymous Cluster", default_preemption(), 1, default_params}; }
+static inline void ?{} (cluster & this, Duration preemption_rate)                  { io_context_params default_params;    this{"Anonymous Cluster", preemption_rate, 1, default_params}; }
+static inline void ?{} (cluster & this, const char name[])                         { io_context_params default_params;    this{name, default_preemption(), 1, default_params}; }
+static inline void ?{} (cluster & this, unsigned num_io)                           { io_context_params default_params;    this{"Anonymous Cluster", default_preemption(), num_io, default_params}; }
+static inline void ?{} (cluster & this, Duration preemption_rate, unsigned num_io) { io_context_params default_params;    this{"Anonymous Cluster", preemption_rate, num_io, default_params}; }
+static inline void ?{} (cluster & this, const char name[], unsigned num_io)        { io_context_params default_params;    this{name, default_preemption(), num_io, default_params}; }
+static inline void ?{} (cluster & this, const io_context_params & io_params)                                            { this{"Anonymous Cluster", default_preemption(), 1, io_params}; }
+static inline void ?{} (cluster & this, Duration preemption_rate, const io_context_params & io_params)                  { this{"Anonymous Cluster", preemption_rate, 1, io_params}; }
+static inline void ?{} (cluster & this, const char name[], const io_context_params & io_params)                         { this{name, default_preemption(), 1, io_params}; }
+static inline void ?{} (cluster & this, unsigned num_io, const io_context_params & io_params)                           { this{"Anonymous Cluster", default_preemption(), num_io, io_params}; }
+static inline void ?{} (cluster & this, Duration preemption_rate, unsigned num_io, const io_context_params & io_params) { this{"Anonymous Cluster", preemption_rate, num_io, io_params}; }
+static inline void ?{} (cluster & this, const char name[], unsigned num_io, const io_context_params & io_params)        { this{name, default_preemption(), num_io, io_params}; }
 
 static inline [cluster *&, cluster *& ] __get( cluster & this ) __attribute__((const)) { return this.node.[next, prev]; }
Index: libcfa/src/concurrency/kernel/fwd.hfa
===================================================================
--- libcfa/src/concurrency/kernel/fwd.hfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
+++ libcfa/src/concurrency/kernel/fwd.hfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -0,0 +1,124 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2020 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// kernel/fwd.hfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Thu Jul 30 16:46:41 2020
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#pragma once
+
+#include "bits/defs.hfa"
+#include "bits/debug.hfa"
+
+#ifdef __cforall
+#include "bits/random.hfa"
+#endif
+
+struct $thread;
+struct processor;
+struct cluster;
+
+enum __Preemption_Reason { __NO_PREEMPTION, __ALARM_PREEMPTION, __POLL_PREEMPTION, __MANUAL_PREEMPTION };
+
+#define KERNEL_STORAGE(T,X) __attribute((aligned(__alignof__(T)))) static char storage_##X[sizeof(T)]
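+// e.g. KERNEL_STORAGE(cluster, mainCluster) reserves static, suitably aligned
+// storage named storage_mainCluster, later constructed in place at startup.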
+
+#ifdef __cforall
+extern "C" {
+	extern "Cforall" {
+		extern __attribute__((aligned(128))) thread_local struct KernelThreadData {
+			struct $thread    * volatile this_thread;
+			struct processor  * volatile this_processor;
+			struct __stats_t  * volatile this_stats;
+
+			struct {
+				volatile unsigned short disable_count;
+				volatile bool enabled;
+				volatile bool in_progress;
+			} preemption_state;
+
+			#if defined(__SIZEOF_INT128__)
+				__uint128_t rand_seed;
+			#else
+				uint64_t rand_seed;
+			#endif
+		} kernelTLS __attribute__ ((tls_model ( "initial-exec" )));
+
+		static inline uint64_t __tls_rand() {
+			#if defined(__SIZEOF_INT128__)
+				return __lehmer64( kernelTLS.rand_seed );
+			#else
+				return __xorshift64( kernelTLS.rand_seed );
+			#endif
+		}
+	}
+
+	#ifdef __ARM_ARCH
+		// function prototypes are only really used by these macros on ARM
+		void disable_global_interrupts();
+		void enable_global_interrupts();
+
+		#define TL_GET( member ) ( { __typeof__( kernelTLS.member ) target; \
+			disable_global_interrupts(); \
+			target = kernelTLS.member; \
+			enable_global_interrupts(); \
+			target; } )
+		#define TL_SET( member, value ) disable_global_interrupts(); \
+			kernelTLS.member = value; \
+			enable_global_interrupts();
+	#else
+		#define TL_GET( member ) kernelTLS.member
+		#define TL_SET( member, value ) kernelTLS.member = value;
+	#endif
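+	// e.g. $thread * me = TL_GET( this_thread ); expands to a plain TLS read
+	// everywhere except ARM, where it is bracketed by interrupt toggles.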
+
+	extern void disable_interrupts();
+	extern void enable_interrupts_noPoll();
+	extern void enable_interrupts( __cfaabi_dbg_ctx_param );
+
+	extern "Cforall" {
+		extern void park( __cfaabi_dbg_ctx_param );
+		extern void unpark( struct $thread * this __cfaabi_dbg_ctx_param2 );
+		static inline struct $thread * active_thread () { return TL_GET( this_thread ); }
+
+		extern bool force_yield( enum __Preemption_Reason );
+
+		static inline void yield() {
+			force_yield(__MANUAL_PREEMPTION);
+		}
+
+		// Yield: yield N times
+		static inline void yield( unsigned times ) {
+			for( times ) {
+				yield();
+			}
+		}
+
+		//-----------------------------------------------------------------------
+		// Statistics: called at the end of each thread to register its statistics
+		#if !defined(__CFA_NO_STATISTICS__)
+			static inline struct __stats_t * __tls_stats() {
+				/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+				/* paranoid */ verify( kernelTLS.this_stats );
+				return kernelTLS.this_stats;
+			}
+
+			#define __STATS__(in_kernel, ...) { \
+				if( !(in_kernel) ) disable_interrupts(); \
+				with( *__tls_stats() ) { \
+					__VA_ARGS__ \
+				} \
+				if( !(in_kernel) ) enable_interrupts( __cfaabi_dbg_ctx ); \
+			}
+		#else
+			#define __STATS__(in_kernel, ...)
+		#endif
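+
+		// Usage sketch (hypothetical counter name): increment a statistic,
+		// toggling interrupts only when not already inside the kernel:
+		//     __STATS__( false, io.submit_calls++; )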
+	}
+}
+#endif
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -0,0 +1,667 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2020 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// kernel/startup.cfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Thu Jul 30 15:12:54 2020
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#define __cforall_thread__
+
+// C Includes
+#include <errno.h>              // errno
+#include <string.h>             // strerror
+#include <unistd.h>             // sysconf
+extern "C" {
+      #include <limits.h>       // PTHREAD_STACK_MIN
+      #include <sys/mman.h>     // mprotect
+      #include <sys/resource.h> // getrlimit
+}
+
+// CFA Includes
+#include "kernel_private.hfa"
+#include "startup.hfa"          // STARTUP_PRIORITY_XXX
+
+//-----------------------------------------------------------------------------
+// Some assembly required
+#if defined( __i386 )
+	#define CtxGet( ctx )        \
+		__asm__ volatile (     \
+			"movl %%esp,%0\n"\
+			"movl %%ebp,%1\n"\
+			: "=rm" (ctx.SP),\
+				"=rm" (ctx.FP) \
+		)
+#elif defined( __x86_64 )
+	#define CtxGet( ctx )        \
+		__asm__ volatile (     \
+			"movq %%rsp,%0\n"\
+			"movq %%rbp,%1\n"\
+			: "=rm" (ctx.SP),\
+				"=rm" (ctx.FP) \
+		)
+#elif defined( __ARM_ARCH )
+#define CtxGet( ctx ) __asm__ ( \
+		"mov %0,%%sp\n"   \
+		"mov %1,%%r11\n"   \
+	: "=rm" (ctx.SP), "=rm" (ctx.FP) )
+#else
+	#error unknown hardware architecture
+#endif
+
+//-----------------------------------------------------------------------------
+// Start and stop routine for the kernel, declared first to make sure they run first
+static void __kernel_startup (void) __attribute__(( constructor( STARTUP_PRIORITY_KERNEL ) ));
+static void __kernel_shutdown(void) __attribute__(( destructor ( STARTUP_PRIORITY_KERNEL ) ));
+
+//-----------------------------------------------------------------------------
+// Static Forward Declarations
+struct current_stack_info_t;
+
+static void * __invoke_processor(void * arg);
+static void __kernel_first_resume( processor * this );
+static void __kernel_last_resume ( processor * this );
+static void init(processor & this, const char name[], cluster & _cltr);
+static void deinit(processor & this);
+static void doregister( struct cluster & cltr );
+static void unregister( struct cluster & cltr );
+static void ?{}( $coroutine & this, current_stack_info_t * info);
+static void ?{}( $thread & this, current_stack_info_t * info);
+static void ?{}(processorCtx_t & this) {}
+static void ?{}(processorCtx_t & this, processor * proc, current_stack_info_t * info);
+
+//-----------------------------------------------------------------------------
+// Forward Declarations for other modules
+extern void __kernel_alarm_startup(void);
+extern void __kernel_alarm_shutdown(void);
+extern void __kernel_io_startup (void);
+extern void __kernel_io_shutdown(void);
+
+//-----------------------------------------------------------------------------
+// Other Forward Declarations
+extern bool __wake_proc(processor *);
+
+//-----------------------------------------------------------------------------
+// Kernel storage
+KERNEL_STORAGE(cluster,	             mainCluster);
+KERNEL_STORAGE(processor,            mainProcessor);
+KERNEL_STORAGE($thread,	             mainThread);
+KERNEL_STORAGE(__stack_t,            mainThreadCtx);
+KERNEL_STORAGE(io_context,           mainPollerThread);
+KERNEL_STORAGE(__scheduler_RWLock_t, __scheduler_lock);
+#if !defined(__CFA_NO_STATISTICS__)
+KERNEL_STORAGE(__stats_t, mainProcStats);
+#endif
+
+cluster              * mainCluster;
+processor            * mainProcessor;
+$thread              * mainThread;
+__scheduler_RWLock_t * __scheduler_lock;
+
+extern "C" {
+	struct { __dllist_t(cluster) list; __spinlock_t lock; } __cfa_dbg_global_clusters;
+}
+
+size_t __page_size = 0;
+
+//-----------------------------------------------------------------------------
+// Global state
+thread_local struct KernelThreadData kernelTLS __attribute__ ((tls_model ( "initial-exec" ))) @= {
+	NULL,												// cannot use 0p
+	NULL,
+	NULL,
+	{ 1, false, false },
+};
+
+//-----------------------------------------------------------------------------
+// Struct to steal stack
+struct current_stack_info_t {
+	__stack_t * storage;  // pointer to stack object
+	void * base;          // base of stack
+	void * limit;         // stack grows towards stack limit
+	void * context;       // address of cfa_context_t
+};
+
+void ?{}( current_stack_info_t & this ) {
+	__stack_context_t ctx;
+	CtxGet( ctx );
+	this.base = ctx.FP;
+
+	rlimit r;
+	getrlimit( RLIMIT_STACK, &r);
+	size_t size = r.rlim_cur;
+
+	this.limit = (void *)(((intptr_t)this.base) - size);
+	this.context = &storage_mainThreadCtx;
+}
+
+
+
+//=============================================================================================
+// Kernel Setup logic
+//=============================================================================================
+//-----------------------------------------------------------------------------
+// Kernel boot procedures
+static void __kernel_startup(void) {
+	verify( ! kernelTLS.preemption_state.enabled );
+	__cfadbg_print_safe(runtime_core, "Kernel : Starting\n");
+
+	__page_size = sysconf( _SC_PAGESIZE );
+
+	__cfa_dbg_global_clusters.list{ __get };
+	__cfa_dbg_global_clusters.lock{};
+
+	// Initialize the global scheduler lock
+	__scheduler_lock = (__scheduler_RWLock_t*)&storage___scheduler_lock;
+	(*__scheduler_lock){};
+
+	// Initialize the main cluster
+	mainCluster = (cluster *)&storage_mainCluster;
+	(*mainCluster){"Main Cluster", 0};
+
+	__cfadbg_print_safe(runtime_core, "Kernel : Main cluster ready\n");
+
+	// Start by initializing the main thread
+	// SKULLDUGGERY: the mainThread steals the process main thread
+	// which will then be scheduled by the mainProcessor normally
+	mainThread = ($thread *)&storage_mainThread;
+	current_stack_info_t info;
+	info.storage = (__stack_t*)&storage_mainThreadCtx;
+	(*mainThread){ &info };
+
+	__cfadbg_print_safe(runtime_core, "Kernel : Main thread ready\n");
+
+
+
+	// Construct the processor context of the main processor
+	void ?{}(processorCtx_t & this, processor * proc) {
+		(this.__cor){ "Processor" };
+		this.__cor.starter = 0p;
+		this.proc = proc;
+	}
+
+	void ?{}(processor & this) with( this ) {
+		( this.idle ){};
+		( this.terminated ){ 0 };
+		( this.runner ){};
+		init( this, "Main Processor", *mainCluster );
+		kernel_thread = pthread_self();
+
+		runner{ &this };
+		__cfadbg_print_safe(runtime_core, "Kernel : constructed main processor context %p\n", &runner);
+	}
+
+	// Initialize the main processor and the main processor ctx
+	// (the coroutine that contains the processing control flow)
+	mainProcessor = (processor *)&storage_mainProcessor;
+	(*mainProcessor){};
+
+	//initialize the global state variables
+	kernelTLS.this_processor = mainProcessor;
+	kernelTLS.this_thread    = mainThread;
+
+	#if !defined( __CFA_NO_STATISTICS__ )
+		kernelTLS.this_stats = (__stats_t *)& storage_mainProcStats;
+		__init_stats( kernelTLS.this_stats );
+	#endif
+
+	// Enable preemption
+	__kernel_alarm_startup();
+
+	// Start IO
+	__kernel_io_startup();
+
+	// Add the main thread to the ready queue
+	// once resume is called on mainProcessor->runner the mainThread needs to be scheduled like any normal thread
+	__schedule_thread((__processor_id_t *)mainProcessor, mainThread);
+
+	// SKULLDUGGERY: Force a context switch to the main processor to set the main thread's context to the current UNIX
+	// context. Hence, the main thread does not begin through __cfactx_invoke_thread, like all other threads. The trick here is that
+	// mainThread is on the ready queue when this call is made.
+	__kernel_first_resume( kernelTLS.this_processor );
+
+
+	// THE SYSTEM IS NOW COMPLETELY RUNNING
+
+
+	// SKULLDUGGERY: The constructor for the mainCluster will call alloc with a dimension of 0
+	// malloc *can* return a non-null value; we should free it if that is the case
+	free( mainCluster->io.ctxs );
+
+	// Now that the system is up, finish creating systems that need threading
+	mainCluster->io.ctxs = (io_context *)&storage_mainPollerThread;
+	mainCluster->io.cnt  = 1;
+	(*mainCluster->io.ctxs){ *mainCluster };
+
+	__cfadbg_print_safe(runtime_core, "Kernel : Started\n--------------------------------------------------\n\n");
+
+	verify( ! kernelTLS.preemption_state.enabled );
+	enable_interrupts( __cfaabi_dbg_ctx );
+	verify( TL_GET( preemption_state.enabled ) );
+}
+
+static void __kernel_shutdown(void) {
+	//Before we start shutting things down, wait for systems that need threading to shutdown
+	^(*mainCluster->io.ctxs){};
+	mainCluster->io.cnt  = 0;
+	mainCluster->io.ctxs = 0p;
+
+	/* paranoid */ verify( TL_GET( preemption_state.enabled ) );
+	disable_interrupts();
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+
+	__cfadbg_print_safe(runtime_core, "\n--------------------------------------------------\nKernel : Shutting down\n");
+
+	// SKULLDUGGERY: Notify the mainProcessor it needs to terminate.
+	// When its coroutine terminates, it returns control to the mainThread
+	// which is currently here
+	__atomic_store_n(&mainProcessor->do_terminate, true, __ATOMIC_RELEASE);
+	__kernel_last_resume( kernelTLS.this_processor );
+	mainThread->self_cor.state = Halted;
+
+	// THE SYSTEM IS NOW COMPLETELY STOPPED
+
+	// Disable preemption
+	__kernel_alarm_shutdown();
+
+	// Stop IO
+	__kernel_io_shutdown();
+
+	// Destroy the main processor and its context in reverse order of construction
+	// These were manually constructed so we need manually destroy them
+	void ^?{}(processor & this) with( this ){
+		deinit( this );
+
+		/* paranoid */ verify( this.do_terminate == true );
+		__cfaabi_dbg_print_safe("Kernel : destroyed main processor context %p\n", &runner);
+	}
+
+	^(*mainProcessor){};
+
+	// Final step, destroy the main thread since it is no longer needed
+
+	// Since we provided a stack to this task it will not destroy anything
+	/* paranoid */ verify(mainThread->self_cor.stack.storage == (__stack_t*)(((uintptr_t)&storage_mainThreadCtx)| 0x1));
+	^(*mainThread){};
+
+	^(*mainCluster){};
+
+	^(*__scheduler_lock){};
+
+	^(__cfa_dbg_global_clusters.list){};
+	^(__cfa_dbg_global_clusters.lock){};
+
+	__cfadbg_print_safe(runtime_core, "Kernel : Shutdown complete\n");
+}
+
+//=============================================================================================
+// Kernel Initial Scheduling logic
+//=============================================================================================
+
+// Context invoker for processors
+// This is the entry point for processors (kernel threads) *except* for the main processor
+// It effectively constructs a coroutine by stealing the pthread stack
+static void * __invoke_processor(void * arg) {
+	#if !defined( __CFA_NO_STATISTICS__ )
+		__stats_t local_stats;
+		__init_stats( &local_stats );
+		kernelTLS.this_stats = &local_stats;
+	#endif
+
+	processor * proc = (processor *) arg;
+	kernelTLS.this_processor = proc;
+	kernelTLS.this_thread    = 0p;
+	kernelTLS.preemption_state.[enabled, disable_count] = [false, 1];
+	// SKULLDUGGERY: We want to create a context for the processor coroutine
+	// which is needed for the 2-step context switch. However, there is no reason
+	// to waste the perfectly valid stack created by pthread.
+	current_stack_info_t info;
+	__stack_t ctx;
+	info.storage = &ctx;
+	(proc->runner){ proc, &info };
+
+	__cfaabi_dbg_print_safe("Coroutine : created stack %p\n", get_coroutine(proc->runner)->stack.storage);
+
+	//Set global state
+	kernelTLS.this_thread = 0p;
+
+	//We now have a proper context from which to schedule threads
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p created (%p, %p)\n", proc, &proc->runner, &ctx);
+
+	// SKULLDUGGERY: Since the coroutine doesn't have its own stack, we can't
+	// resume it to start it as would normally be done; it would just context
+	// switch back to here. Instead, directly call its main since we are
+	// already on the appropriate stack.
+	get_coroutine(proc->runner)->state = Active;
+	main( proc->runner );
+	get_coroutine(proc->runner)->state = Halted;
+
+	// Main routine of the core has returned; the core is now fully terminated
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p main ended (%p)\n", proc, &proc->runner);
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		__tally_stats(proc->cltr->stats, &local_stats);
+		if( 0 != proc->print_stats ) {
+			__print_stats( &local_stats, proc->print_stats, true, proc->name, (void*)proc );
+		}
+	#endif
+
+	return 0p;
+}
+
+static void __kernel_first_resume( processor * this ) {
+	$thread * src = mainThread;
+	$coroutine * dst = get_coroutine(this->runner);
+
+	verify( ! kernelTLS.preemption_state.enabled );
+
+	kernelTLS.this_thread->curr_cor = dst;
+	__stack_prepare( &dst->stack, 65000 );
+	__cfactx_start(main, dst, this->runner, __cfactx_invoke_coroutine);
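+	// __cfactx_start primes dst's initial context so that the first switch
+	// onto it enters main( this->runner ) through __cfactx_invoke_coroutine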
+
+	verify( ! kernelTLS.preemption_state.enabled );
+
+	dst->last = &src->self_cor;
+	dst->starter = dst->starter ? dst->starter : &src->self_cor;
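+	// the main thread is recorded as the starter so __kernel_last_resume can
+	// verify it is switching back from the right place during shutdown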
+
+	// make sure the current state is still correct
+	/* paranoid */ verify(src->state == Ready);
+
+	// context switch to specified coroutine
+	verify( dst->context.SP );
+	__cfactx_switch( &src->context, &dst->context );
+	// when __cfactx_switch returns we are back in the src coroutine
+
+	mainThread->curr_cor = &mainThread->self_cor;
+
+	// make sure the current state has been updated
+	/* paranoid */ verify(src->state == Active);
+
+	verify( ! kernelTLS.preemption_state.enabled );
+}
+
+// KERNEL_ONLY
+static void __kernel_last_resume( processor * this ) {
+	$coroutine * src = &mainThread->self_cor;
+	$coroutine * dst = get_coroutine(this->runner);
+
+	verify( ! kernelTLS.preemption_state.enabled );
+	verify( dst->starter == src );
+	verify( dst->context.SP );
+
+	// SKULLDUGGERY: in debug mode, the processors check that the stack
+	// pointer is still within the stack limits after running a thread.
+	// That check does not make sense when context switching to the processor
+	// via coroutine semantics. Since this is a special case, use the current
+	// context info to populate these fields.
+	__cfaabi_dbg_debug_do(
+		__stack_context_t ctx;
+		CtxGet( ctx );
+		mainThread->context.SP = ctx.SP;
+		mainThread->context.FP = ctx.FP;
+	)
+
+	// context switch to the processor
+	__cfactx_switch( &src->context, &dst->context );
+}
+
+
+//=============================================================================================
+// Kernel Object Constructors logic
+//=============================================================================================
+//-----------------------------------------------------------------------------
+// Main thread construction
+static void ?{}( $coroutine & this, current_stack_info_t * info) with( this ) {
+	stack.storage = info->storage;
+	with(*stack.storage) {
+		limit     = info->limit;
+		base      = info->base;
+	}
+	__attribute__((may_alias)) intptr_t * istorage = (intptr_t*) &stack.storage;
+	*istorage |= 0x1;
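+	// the low bit of stack.storage tags the stack as user-provided, so the
+	// destructor will not free it (cf. the verify in __kernel_shutdown)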
+	name = "Main Thread";
+	state = Start;
+	starter = 0p;
+	last = 0p;
+	cancellation = 0p;
+}
+
+static void ?{}( $thread & this, current_stack_info_t * info) with( this ) {
+	ticket = 1;
+	state = Start;
+	self_cor{ info };
+	curr_cor = &self_cor;
+	curr_cluster = mainCluster;
+	self_mon.owner = &this;
+	self_mon.recursion = 1;
+	self_mon_p = &self_mon;
+	link.next = 0p;
+	link.prev = 0p;
+
+	node.next = 0p;
+	node.prev = 0p;
+	doregister(curr_cluster, this);
+
+	monitors{ &self_mon_p, 1, (fptr_t)0 };
+}
+
+//-----------------------------------------------------------------------------
+// Processor
+// Construct the processor context of non-main processors
+static void ?{}(processorCtx_t & this, processor * proc, current_stack_info_t * info) {
+	(this.__cor){ info };
+	this.proc = proc;
+}
+
+static void init(processor & this, const char name[], cluster & _cltr) with( this ) {
+	this.name = name;
+	this.cltr = &_cltr;
+	id = -1u;
+	destroyer = 0p;
+	do_terminate = false;
+	preemption_alarm = 0p;
+	pending_preemption = false;
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		print_stats = 0;
+		print_halts = false;
+	#endif
+
+	int target = __atomic_add_fetch( &cltr->nprocessors, 1u, __ATOMIC_SEQ_CST );
+
+	id = doregister((__processor_id_t*)&this);
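+	// doregister returns a unique id for this processor; the matching
+	// unregister in deinit releases it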
+
+	// Lock the RWlock so no-one pushes/pops while we are changing the queue
+	uint_fast32_t last_size = ready_mutate_lock();
+
+		// Adjust the ready queue size
+		ready_queue_grow( cltr, target );
+
+	// Unlock the RWlock
+	ready_mutate_unlock( last_size );
+
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p created\n", &this);
+}
+
+// Not a dtor: it preps the destruction but must not destroy members
+static void deinit(processor & this) {
+
+	int target = __atomic_sub_fetch( &this.cltr->nprocessors, 1u, __ATOMIC_SEQ_CST );
+
+	// Lock the RWlock so no-one pushes/pops while we are changing the queue
+	uint_fast32_t last_size = ready_mutate_lock();
+
+		// Adjust the ready queue size
+		ready_queue_shrink( this.cltr, target );
+
+		// Make sure we aren't on the idle queue
+		unsafe_remove( this.cltr->idles, &this );
+
+	// Unlock the RWlock
+	ready_mutate_unlock( last_size );
+
+	// Finally, unregister the processor id since the read-lock is no longer needed
+	unregister((__processor_id_t*)&this);
+}
+
+void ?{}(processor & this, const char name[], cluster & _cltr) {
+	( this.idle ){};
+	( this.terminated ){ 0 };
+	( this.runner ){};
+	init( this, name, _cltr );
+
+	__cfadbg_print_safe(runtime_core, "Kernel : Starting core %p\n", &this);
+
+	this.stack = __create_pthread( &this.kernel_thread, __invoke_processor, (void *)&this );
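+	// the returned stack is owned by this processor and freed in the
+	// destructor, after pthread_join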
+}
+
+void ^?{}(processor & this) with( this ){
+	if( ! __atomic_load_n(&do_terminate, __ATOMIC_ACQUIRE) ) {
+		__cfadbg_print_safe(runtime_core, "Kernel : core %p signaling termination\n", &this);
+
+		__atomic_store_n(&do_terminate, true, __ATOMIC_RELAXED);
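+		// wake the processor in case it is blocked idle, so it observes do_terminate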
+		__wake_proc( &this );
+
+		P( terminated );
+		verify( kernelTLS.this_processor != &this);
+	}
+
+	int err = pthread_join( kernel_thread, 0p );
+	if( err != 0 ) abort("KERNEL ERROR: joining processor %p caused error %s\n", &this, strerror(err));
+
+	free( this.stack );
+
+	deinit( this );
+}
+
+//-----------------------------------------------------------------------------
+// Cluster
+void ?{}(cluster & this, const char name[], Duration preemption_rate, unsigned num_io, const io_context_params & io_params) with( this ) {
+	this.name = name;
+	this.preemption_rate = preemption_rate;
+	this.nprocessors = 0;
+	ready_queue{};
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		print_stats = 0;
+		stats = alloc();
+		__init_stats( stats );
+	#endif
+
+	threads{ __get };
+
+	doregister(this);
+
+	// Lock the RWlock so no-one pushes/pops while we are changing the queue
+	uint_fast32_t last_size = ready_mutate_lock();
+
+		// Adjust the ready queue size
+		ready_queue_grow( &this, 0 );
+
+	// Unlock the RWlock
+	ready_mutate_unlock( last_size );
+
+	this.io.cnt  = num_io;
+	this.io.ctxs = aalloc(num_io);
+	for(i; this.io.cnt) {
+		(this.io.ctxs[i]){ this, io_params };
+	}
+}
+
+void ^?{}(cluster & this) {
+	for(i; this.io.cnt) {
+		^(this.io.ctxs[i]){ true };
+	}
+	free(this.io.ctxs);
+
+	// Lock the RWlock so no-one pushes/pops while we are changing the queue
+	uint_fast32_t last_size = ready_mutate_lock();
+
+		// Adjust the ready queue size
+		ready_queue_shrink( &this, 0 );
+
+	// Unlock the RWlock
+	ready_mutate_unlock( last_size );
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		if( 0 != this.print_stats ) {
+			__print_stats( this.stats, this.print_stats, true, this.name, (void*)&this );
+		}
+		free( this.stats );
+	#endif
+
+	unregister(this);
+}
+
+//=============================================================================================
+// Miscellaneous Initialization
+//=============================================================================================
+//-----------------------------------------------------------------------------
+// Global Queues
+static void doregister( cluster     & cltr ) {
+	lock      ( __cfa_dbg_global_clusters.lock __cfaabi_dbg_ctx2);
+	push_front( __cfa_dbg_global_clusters.list, cltr );
+	unlock    ( __cfa_dbg_global_clusters.lock );
+}
+
+static void unregister( cluster     & cltr ) {
+	lock  ( __cfa_dbg_global_clusters.lock __cfaabi_dbg_ctx2);
+	remove( __cfa_dbg_global_clusters.list, cltr );
+	unlock( __cfa_dbg_global_clusters.lock );
+}
+
+void doregister( cluster * cltr, $thread & thrd ) {
+	lock      (cltr->thread_list_lock __cfaabi_dbg_ctx2);
+	cltr->nthreads += 1;
+	push_front(cltr->threads, thrd);
+	unlock    (cltr->thread_list_lock);
+}
+
+void unregister( cluster * cltr, $thread & thrd ) {
+	lock  (cltr->thread_list_lock __cfaabi_dbg_ctx2);
+	remove(cltr->threads, thrd );
+	cltr->nthreads -= 1;
+	unlock(cltr->thread_list_lock);
+}
+
+static void check( int ret, const char func[] ) {
+	if ( ret ) {										// pthread routines return errno values
+		abort( "%s : internal error, error(%d) %s.", func, ret, strerror( ret ) );
+	} // if
+} // check
+
+void * __create_pthread( pthread_t * pthread, void * (*start)(void *), void * arg ) {
+	pthread_attr_t attr;
+
+	check( pthread_attr_init( &attr ), "pthread_attr_init" ); // initialize attribute
+
+	size_t stacksize;
+	// default stack size, normally defined by shell limit
+	check( pthread_attr_getstacksize( &attr, &stacksize ), "pthread_attr_getstacksize" );
+	assert( stacksize >= PTHREAD_STACK_MIN );
+
+	void * stack;
+	__cfaabi_dbg_debug_do(
+		stack = memalign( __page_size, stacksize + __page_size );
+		// pthread has no mechanism to create the guard page in a user-supplied stack.
+		if ( mprotect( stack, __page_size, PROT_NONE ) == -1 ) {
+			abort( "mprotect : internal error, mprotect failure, error(%d) %s.", errno, strerror( errno ) );
+		} // if
+	);
+	__cfaabi_dbg_no_debug_do(
+		stack = malloc( stacksize );
+	);
+
+	check( pthread_attr_setstack( &attr, stack, stacksize ), "pthread_attr_setstack" );
+
+	check( pthread_create( pthread, &attr, start, arg ), "pthread_create" );
+	return stack;
+}
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -22,7 +22,4 @@
 #include "stats.hfa"
 
-#include "bits/random.hfa"
-
-
 //-----------------------------------------------------------------------------
 // Scheduler
@@ -53,19 +50,4 @@
 
 
-struct event_kernel_t {
-	alarm_list_t alarms;
-	__spinlock_t lock;
-};
-
-extern event_kernel_t * event_kernel;
-
-struct __cfa_kernel_preemption_state_t {
-	bool enabled;
-	bool in_progress;
-	unsigned short disable_count;
-};
-
-extern volatile thread_local __cfa_kernel_preemption_state_t preemption_state __attribute__ ((tls_model ( "initial-exec" )));
-
 extern cluster * mainCluster;
 
@@ -84,29 +66,33 @@
 void __unpark( struct __processor_id_t *, $thread * thrd __cfaabi_dbg_ctx_param2 );
 
-//-----------------------------------------------------------------------------
-// I/O
-void __kernel_io_startup     ( cluster &, unsigned, bool );
-void __kernel_io_finish_start( cluster & );
-void __kernel_io_prepare_stop( cluster & );
-void __kernel_io_shutdown    ( cluster &, bool );
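+// __post implements a binary semaphore over a single pointer:
+//   0p = empty, 1p = a post is pending, any other value = a parked thread.
+// Returns true iff a thread was unparked.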
+static inline bool __post(single_sem & this, struct __processor_id_t * id) {
+	for() {
+		struct $thread * expected = this.ptr;
+		if(expected == 1p) return false;
+		if(expected == 0p) {
+			if(__atomic_compare_exchange_n(&this.ptr, &expected, 1p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+				return false;
+			}
+		}
+		else {
+			if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+				__unpark( id, expected __cfaabi_dbg_ctx2 );
+				return true;
+			}
+		}
+	}
+}
 
 //-----------------------------------------------------------------------------
 // Utils
-#define KERNEL_STORAGE(T,X) __attribute((aligned(__alignof__(T)))) static char storage_##X[sizeof(T)]
-
-static inline uint64_t __tls_rand() {
-	#if defined(__SIZEOF_INT128__)
-		return __lehmer64( kernelTLS.rand_seed );
-	#else
-		return __xorshift64( kernelTLS.rand_seed );
-	#endif
-}
-
-
-void doregister( struct cluster & cltr );
-void unregister( struct cluster & cltr );
-
 void doregister( struct cluster * cltr, struct $thread & thrd );
 void unregister( struct cluster * cltr, struct $thread & thrd );
+
+//-----------------------------------------------------------------------------
+// I/O
+void ^?{}(io_context & this, bool );
 
 //=======================================================================
@@ -280,30 +263,4 @@
 void ready_queue_shrink(struct cluster * cltr, int target);
 
-//-----------------------------------------------------------------------
-// IO user data
-struct __io_user_data_t {
-	int32_t result;
-	$thread * thrd;
-};
-
-//-----------------------------------------------------------------------
-// Statics call at the end of each thread to register statistics
-#if !defined(__CFA_NO_STATISTICS__)
-	static inline struct __stats_t * __tls_stats() {
-		/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-		/* paranoid */ verify( kernelTLS.this_stats );
-		return kernelTLS.this_stats;
-	}
-
-	#define __STATS__(in_kernel, ...) { \
-		if( !(in_kernel) ) disable_interrupts(); \
-		with( *__tls_stats() ) { \
-			__VA_ARGS__ \
-		} \
-		if( !(in_kernel) ) enable_interrupts( __cfaabi_dbg_ctx ); \
-	}
-#else
-	#define __STATS__(in_kernel, ...)
-#endif
 
 // Local Variables: //
Index: libcfa/src/concurrency/preemption.cfa
===================================================================
--- libcfa/src/concurrency/preemption.cfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/concurrency/preemption.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -26,4 +26,5 @@
 
 #include "bits/signal.hfa"
+#include "kernel_private.hfa"
 
 #if !defined(__CFA_DEFAULT_PREEMPTION__)
@@ -293,5 +294,5 @@
 // Startup routine to activate preemption
 // Called from kernel_startup
-void kernel_start_preemption() {
+void __kernel_alarm_startup() {
 	__cfaabi_dbg_print_safe( "Kernel : Starting preemption\n" );
 
@@ -315,5 +316,5 @@
 // Shutdown routine to deactivate preemption
 // Called from kernel_shutdown
-void kernel_stop_preemption() {
+void __kernel_alarm_shutdown() {
 	__cfaabi_dbg_print_safe( "Kernel : Preemption stopping\n" );
 
@@ -481,5 +482,5 @@
 	sigset_t oldset;
 	int ret;
-	ret = pthread_sigmask(0, 0p, &oldset);
+	ret = pthread_sigmask(0, ( const sigset_t * ) 0p, &oldset);  // workaround trac#208: cast should be unnecessary
 	if(ret != 0) { abort("ERROR sigprocmask returned %d", ret); }
 
Index: libcfa/src/concurrency/preemption.hfa
===================================================================
--- libcfa/src/concurrency/preemption.hfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/concurrency/preemption.hfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -16,9 +16,14 @@
 #pragma once
 
+#include "bits/locks.hfa"
 #include "alarm.hfa"
-#include "kernel_private.hfa"
 
-void kernel_start_preemption();
-void kernel_stop_preemption();
+struct event_kernel_t {
+	alarm_list_t alarms;
+	__spinlock_t lock;
+};
+
+extern event_kernel_t * event_kernel;
+
 void update_preemption( processor * this, Duration duration );
 
Index: libcfa/src/concurrency/thread.hfa
===================================================================
--- libcfa/src/concurrency/thread.hfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/concurrency/thread.hfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -84,8 +84,4 @@
 
 //-----------------------------------------------------------------------------
-// Thread getters
-static inline struct $thread * active_thread () { return TL_GET( this_thread ); }
-
-//-----------------------------------------------------------------------------
 // Scheduler API
 
@@ -106,15 +102,4 @@
 bool force_yield( enum __Preemption_Reason );
 
-static inline void yield() {
-	force_yield(__MANUAL_PREEMPTION);
-}
-
-// Yield: yield N times
-static inline void yield( unsigned times ) {
-	for( times ) {
-		yield();
-	}
-}
-
 //----------
 // sleep: force thread to block and be rescheduled after Duration duration
