Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision c59a34600a910c39a26acef7ab377d43d56daa06)
+++ libcfa/src/concurrency/io.cfa	(revision f66605204cbdef9351e5c18bbb41c70d7643bbbb)
@@ -17,9 +17,17 @@
 
 #if !defined(HAVE_LINUX_IO_URING_H)
-	void __kernel_io_startup( cluster & this ) {
+	void __kernel_io_startup( cluster & ) {
 		// Nothing to do without io_uring
 	}
 
-	void __kernel_io_shutdown( cluster & this ) {
+	void __kernel_io_start_thrd( cluster & ) {
+		// Nothing to do without io_uring
+	}
+
+	void __kernel_io_stop_thrd ( cluster & ) {
+		// Nothing to do without io_uring
+	}
+
+	void __kernel_io_shutdown( cluster & ) {
 		// Nothing to do without io_uring
 	}
@@ -46,37 +54,48 @@
 	}
 
-	static void * __io_poller( void * arg );
-
-       // Weirdly, some systems that do support io_uring don't actually define these
-       #ifdef __alpha__
-       /*
-       * alpha is the only exception, all other architectures
-       * have common numbers for new system calls.
-       */
-       # ifndef __NR_io_uring_setup
-       #  define __NR_io_uring_setup           535
-       # endif
-       # ifndef __NR_io_uring_enter
-       #  define __NR_io_uring_enter           536
-       # endif
-       # ifndef __NR_io_uring_register
-       #  define __NR_io_uring_register        537
-       # endif
-       #else /* !__alpha__ */
-       # ifndef __NR_io_uring_setup
-       #  define __NR_io_uring_setup           425
-       # endif
-       # ifndef __NR_io_uring_enter
-       #  define __NR_io_uring_enter           426
-       # endif
-       # ifndef __NR_io_uring_register
-       #  define __NR_io_uring_register        427
-       # endif
-       #endif
+	static void * __io_poller_slow( void * arg );
+
+	// Weirdly, some systems that do support io_uring don't actually define these
+	#ifdef __alpha__
+		/*
+		* alpha is the only exception, all other architectures
+		* have common numbers for new system calls.
+		*/
+		#ifndef __NR_io_uring_setup
+			#define __NR_io_uring_setup           535
+		#endif
+		#ifndef __NR_io_uring_enter
+			#define __NR_io_uring_enter           536
+		#endif
+		#ifndef __NR_io_uring_register
+			#define __NR_io_uring_register        537
+		#endif
+	#else /* !__alpha__ */
+		#ifndef __NR_io_uring_setup
+			#define __NR_io_uring_setup           425
+		#endif
+		#ifndef __NR_io_uring_enter
+			#define __NR_io_uring_enter           426
+		#endif
+		#ifndef __NR_io_uring_register
+			#define __NR_io_uring_register        427
+		#endif
+	#endif
+
+	#if defined(__CFA_IO_POLLING_USER__)
+		void ?{}( __io_poller_fast & this, struct cluster & cltr ) {
+			this.ring = &cltr.io;
+			(this.thrd){ "I/O Poller", cltr };
+		}
+		void ^?{}( __io_poller_fast & mutex this );
+      	void main( __io_poller_fast & this );
+      	static inline $thread * get_thread( __io_poller_fast & this ) { return &this.thrd; }
+		void ^?{}( __io_poller_fast & mutex this ) {}
+	#endif
 
 //=============================================================================================
 // I/O Startup / Shutdown logic
 //=============================================================================================
-	void __kernel_io_startup( cluster & this ) {
+	void __kernel_io_startup( cluster & this, bool main_cluster ) {
 		// Step 1 : call to setup
 		struct io_uring_params params;
@@ -184,19 +203,46 @@
 		#endif
 
+		if(!main_cluster) {
+			__kernel_io_finish_start( this );
+		}
+	}
+
+	void __kernel_io_finish_start( cluster & this ) {
+		#if defined(__CFA_IO_POLLING_USER__)
+			(this.io.poller.fast){ this };
+			__thrd_start( this.io.poller.fast, main );
+		#endif
+
 		// Create the poller thread
-		this.io.stack = __create_pthread( &this.io.poller, __io_poller, &this );
-	}
-
-	void __kernel_io_shutdown( cluster & this ) {
-		// Stop the IO Poller
-		#if __CFA_IO_POLLING__ == __CFA_IO_POLLING_NAIVE__
+		this.io.poller.slow.stack = __create_pthread( &this.io.poller.slow.kthrd, __io_poller_slow, &this );
+	}
+
+	void __kernel_io_prepare_stop( cluster & this ) {
 		// Notify the poller thread of the shutdown
 		__atomic_store_n(&this.io.done, true, __ATOMIC_SEQ_CST);
+
+		// Stop the IO Poller
 		sigval val = { 1 };
-		pthread_sigqueue( this.io.poller, SIGUSR1, val );
+		pthread_sigqueue( this.io.poller.slow.kthrd, SIGUSR1, val );
+		#if defined(__CFA_IO_POLLING_USER__)
+			post( this.io.poller.sem );
+		#endif
 
 		// Wait for the poller thread to finish
-		pthread_join( this.io.poller, 0p );
-		free( this.io.stack );
+		pthread_join( this.io.poller.slow.kthrd, 0p );
+		free( this.io.poller.slow.stack );
+
+		#if defined(__CFA_IO_POLLING_USER__)
+			// unpark the fast io_poller
+			unpark( &this.io.poller.fast.thrd __cfaabi_dbg_ctx2 );
+
+			^(this.io.poller.fast){};
+		#endif
+	}
+
+	void __kernel_io_shutdown( cluster & this, bool main_cluster ) {
+		if(!main_cluster) {
+			__kernel_io_prepare_stop( this );
+		}
 
 		// print statistics
@@ -246,6 +292,6 @@
 	// Process a single completion message from the io_uring
 	// This is NOT thread-safe
-	static int __drain_io( struct io_ring & ring, sigset_t & mask, int waitcnt ) {
-		int ret = syscall( __NR_io_uring_enter, ring.fd, 0, waitcnt, IORING_ENTER_GETEVENTS, &mask, _NSIG / 8);
+	static int __drain_io( struct io_ring & ring, sigset_t * mask, int waitcnt, bool in_kernel ) {
+		int ret = syscall( __NR_io_uring_enter, ring.fd, 0, waitcnt, IORING_ENTER_GETEVENTS, mask, _NSIG / 8);
 		if( ret < 0 ) {
 			switch((int)errno) {
@@ -281,5 +327,6 @@
 
 			data->result = cqe.res;
-			__unpark( data->thrd __cfaabi_dbg_ctx2 );
+			if(!in_kernel) { unpark( data->thrd __cfaabi_dbg_ctx2 ); }
+			else         { __unpark( data->thrd __cfaabi_dbg_ctx2 ); }
 		}
 
@@ -300,5 +347,5 @@
 	}
 
-	static void * __io_poller( void * arg ) {
+	static void * __io_poller_slow( void * arg ) {
 		cluster * cltr = (cluster *)arg;
 		struct io_ring & ring = cltr->io;
@@ -316,9 +363,45 @@
 
 		while(!__atomic_load_n(&ring.done, __ATOMIC_SEQ_CST)) {
-			__drain_io( ring, mask, 1 );
+			#if defined(__CFA_IO_POLLING_USER__)
+
+				// In the user-thread approach drain and if anything was drained,
+				// baton pass to the user-thread
+				int count = __drain_io( ring, &mask, 1, true );
+				if(count > 0) {
+					__unpark( &ring.poller.fast.thrd __cfaabi_dbg_ctx2 );
+					wait( ring.poller.sem );
+				}
+
+			#else
+
+				//In the naive approach, just poll the io completion queue directly
+				__drain_io( ring, &mask, 1, true );
+
+			#endif
 		}
 
 		return 0p;
 	}
+
+	#if defined(__CFA_IO_POLLING_USER__)
+		void main( __io_poller_fast & this ) {
+			// Start parked
+			park( __cfaabi_dbg_ctx );
+
+			// Then loop until we need to stop
+			while(!__atomic_load_n(&this.ring->done, __ATOMIC_SEQ_CST)) {
+				// Drain the io
+				if(0 > __drain_io( *this.ring, 0p, 0, false )) {
+					// If we got something, just yield and check again
+					yield();
+				}
+				else {
+					// We didn't get anything, baton pass to the slow poller
+					post( this.ring->poller.sem );
+					park( __cfaabi_dbg_ctx );
+				}
+			}
+		}
+	#endif
 
 //=============================================================================================
@@ -422,10 +505,10 @@
 		this.len = len;
 	}
-#endif
+
 
 //=============================================================================================
 // I/O Interface
 //=============================================================================================
-#if defined(HAVE_LINUX_IO_URING_H)
+
 	#define __submit_prelude \
 		struct io_ring & ring = active_cluster()->io; \
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision c59a34600a910c39a26acef7ab377d43d56daa06)
+++ libcfa/src/concurrency/kernel.cfa	(revision f66605204cbdef9351e5c18bbb41c70d7643bbbb)
@@ -266,5 +266,5 @@
 	threads{ __get };
 
-	__kernel_io_startup( this );
+	__kernel_io_startup( this, &this == mainCluster );
 
 	doregister(this);
@@ -272,5 +272,5 @@
 
 void ^?{}(cluster & this) {
-	__kernel_io_shutdown( this );
+	__kernel_io_shutdown( this, &this == mainCluster );
 
 	unregister(this);
@@ -784,7 +784,12 @@
 
 
-
 	// THE SYSTEM IS NOW COMPLETELY RUNNING
-	__cfaabi_dbg_print_safe("Kernel : Started\n--------------------------------------------------\n\n");
+
+
+	// Now that the system is up, finish creating systems that need threading
+	__kernel_io_finish_start( *mainCluster );
+
+
+	__cfadbg_print_safe(runtime_core, "Kernel : Started\n--------------------------------------------------\n\n");
 
 	verify( ! kernelTLS.preemption_state.enabled );
@@ -794,9 +799,12 @@
 
 static void __kernel_shutdown(void) {
-	__cfaabi_dbg_print_safe("\n--------------------------------------------------\nKernel : Shutting down\n");
+	//Before we start shutting things down, wait for systems that need threading to shutdown
+	__kernel_io_prepare_stop( *mainCluster );
 
 	/* paranoid */ verify( TL_GET( preemption_state.enabled ) );
 	disable_interrupts();
 	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+
+	__cfadbg_print_safe(runtime_core, "\n--------------------------------------------------\nKernel : Shutting down\n");
 
 	// SKULLDUGGERY: Notify the mainProcessor it needs to terminates.
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision c59a34600a910c39a26acef7ab377d43d56daa06)
+++ libcfa/src/concurrency/kernel.hfa	(revision f66605204cbdef9351e5c18bbb41c70d7643bbbb)
@@ -136,4 +136,5 @@
 	// Like head/tail but not seen by the kernel
 	volatile uint32_t alloc;
+	volatile uint32_t ready;
 
 	__spinlock_t lock;
@@ -187,4 +188,11 @@
 };
 
+#if defined(__CFA_IO_POLLING_USER__)
+	struct __io_poller_fast {
+		struct io_ring * ring;
+		$thread thrd;
+	};
+#endif
+
 struct io_ring {
 	struct io_uring_sq submit_q;
@@ -192,8 +200,16 @@
 	uint32_t flags;
 	int fd;
-	pthread_t poller;
-	void * stack;
+	semaphore submit;
 	volatile bool done;
-	semaphore submit;
+	struct {
+		struct {
+			void * stack;
+			pthread_t kthrd;
+		} slow;
+		#if defined(__CFA_IO_POLLING_USER__)
+			__io_poller_fast fast;
+			__bin_sem_t sem;
+		#endif
+	} poller;
 };
 #endif
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision c59a34600a910c39a26acef7ab377d43d56daa06)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision f66605204cbdef9351e5c18bbb41c70d7643bbbb)
@@ -75,6 +75,8 @@
 //-----------------------------------------------------------------------------
 // I/O
-void __kernel_io_startup ( cluster & );
-void __kernel_io_shutdown( cluster & );
+void __kernel_io_startup     ( cluster &, bool );
+void __kernel_io_finish_start( cluster & );
+void __kernel_io_prepare_stop( cluster & );
+void __kernel_io_shutdown    ( cluster &, bool );
 
 //-----------------------------------------------------------------------------
