Index: libcfa/src/bits/defs.hfa
===================================================================
--- libcfa/src/bits/defs.hfa	(revision 3ac8b9f795fc403158704c66c51faffe5be12842)
+++ libcfa/src/bits/defs.hfa	(revision e660761701e94bf2b2ae0130750e911d4122366f)
@@ -16,6 +16,4 @@
 #pragma once
 
-#include <stdbool.h>
-#include <stddef.h>
 #include <stdint.h>
 
@@ -54,74 +52,2 @@
     return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
 }
-
-// #define __CFA_NO_BIT_TEST_AND_SET__
-
-#if defined( __i386 )
-static inline bool __atomic_bts(volatile unsigned long int * target, unsigned long int bit ) {
-	#if defined(__CFA_NO_BIT_TEST_AND_SET__)
-        unsigned long int mask = 1ul << bit;
-        unsigned long int ret = __atomic_fetch_or(target, mask, (int)__ATOMIC_RELAXED);
-        return (ret & mask) != 0;
-    #else
-        int result = 0;
-        asm volatile(
-            "LOCK btsl %[bit], %[target]\n\t"
-            : "=@ccc" (result)
-            : [target] "m" (*target), [bit] "r" (bit)
-        );
-        return result != 0;
-    #endif
-}
-
-static inline bool __atomic_btr(volatile unsigned long int * target, unsigned long int bit ) {
-	#if defined(__CFA_NO_BIT_TEST_AND_SET__)
-        unsigned long int mask = 1ul << bit;
-        unsigned long int ret = __atomic_fetch_and(target, ~mask, (int)__ATOMIC_RELAXED);
-        return (ret & mask) != 0;
-	#else
-        int result = 0;
-        asm volatile(
-            "LOCK btrl %[bit], %[target]\n\t"
-            :"=@ccc" (result)
-            : [target] "m" (*target), [bit] "r" (bit)
-        );
-        return result != 0;
-    #endif
-}
-#elif defined( __x86_64 )
-static inline bool __atomic_bts(volatile unsigned long long int * target, unsigned long long int bit ) {
-	#if defined(__CFA_NO_BIT_TEST_AND_SET__)
-        unsigned long long int mask = 1ul << bit;
-        unsigned long long int ret = __atomic_fetch_or(target, mask, (int)__ATOMIC_RELAXED);
-        return (ret & mask) != 0;
-    #else
-        int result = 0;
-        asm volatile(
-            "LOCK btsq %[bit], %[target]\n\t"
-            : "=@ccc" (result)
-            : [target] "m" (*target), [bit] "r" (bit)
-        );
-        return result != 0;
-    #endif
-}
-
-static inline bool __atomic_btr(volatile unsigned long long int * target, unsigned long long int bit ) {
-	#if defined(__CFA_NO_BIT_TEST_AND_SET__)
-        unsigned long long int mask = 1ul << bit;
-        unsigned long long int ret = __atomic_fetch_and(target, ~mask, (int)__ATOMIC_RELAXED);
-        return (ret & mask) != 0;
-	#else
-        int result = 0;
-        asm volatile(
-            "LOCK btrq %[bit], %[target]\n\t"
-            :"=@ccc" (result)
-            : [target] "m" (*target), [bit] "r" (bit)
-        );
-        return result != 0;
-    #endif
-}
-#elif defined( __ARM_ARCH )
-    #error __atomic_bts and __atomic_btr not implemented for arm
-#else
-	#error uknown hardware architecture
-#endif
Index: libcfa/src/concurrency/alarm.cfa
===================================================================
--- libcfa/src/concurrency/alarm.cfa	(revision 3ac8b9f795fc403158704c66c51faffe5be12842)
+++ libcfa/src/concurrency/alarm.cfa	(revision e660761701e94bf2b2ae0130750e911d4122366f)
@@ -23,5 +23,5 @@
 
 #include "alarm.hfa"
-#include "kernel_private.hfa"
+#include "kernel/fwd.hfa"
 #include "preemption.hfa"
 
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision 3ac8b9f795fc403158704c66c51faffe5be12842)
+++ libcfa/src/concurrency/io.cfa	(revision e660761701e94bf2b2ae0130750e911d4122366f)
@@ -16,5 +16,5 @@
 #if defined(__CFA_DEBUG__)
 	// #define __CFA_DEBUG_PRINT_IO__
-	#define __CFA_DEBUG_PRINT_IO_CORE__
+	// #define __CFA_DEBUG_PRINT_IO_CORE__
 #endif
 
@@ -173,5 +173,5 @@
 } iopoll;
 
-void __kernel_io_startup() {
+void __kernel_io_startup(void) {
 	__cfaabi_dbg_print_safe( "Kernel : Creating EPOLL instance\n" );
 
@@ -187,5 +187,5 @@
 }
 
-void __kernel_io_shutdown() {
+void __kernel_io_shutdown(void) {
 	// Notify the io poller thread of the shutdown
 	iopoll.run = false;
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 3ac8b9f795fc403158704c66c51faffe5be12842)
+++ libcfa/src/concurrency/kernel.cfa	(revision e660761701e94bf2b2ae0130750e911d4122366f)
@@ -18,22 +18,12 @@
 
 //C Includes
-#include <stddef.h>
 #include <errno.h>
-#include <string.h>
 #include <stdio.h>
-#include <fenv.h>
 #include <signal.h>
 #include <unistd.h>
-#include <limits.h>										// PTHREAD_STACK_MIN
-#include <sys/mman.h>									// mprotect
-extern "C" {
-#include <sys/resource.h>
-}
 
 //CFA Includes
-#include "time.hfa"
 #include "kernel_private.hfa"
 #include "preemption.hfa"
-#include "startup.hfa"
 
 //Private includes
@@ -45,12 +35,4 @@
 // Some assembly required
 #if defined( __i386 )
-	#define CtxGet( ctx )        \
-		__asm__ volatile (     \
-			"movl %%esp,%0\n"\
-			"movl %%ebp,%1\n"\
-			: "=rm" (ctx.SP),\
-				"=rm" (ctx.FP) \
-		)
-
 	// mxcr : SSE Status and Control bits (control bits are preserved across function calls)
 	// fcw  : X87 FPU control word (preserved across function calls)
@@ -74,12 +56,4 @@
 
 #elif defined( __x86_64 )
-	#define CtxGet( ctx )        \
-		__asm__ volatile (     \
-			"movq %%rsp,%0\n"\
-			"movq %%rbp,%1\n"\
-			: "=rm" (ctx.SP),\
-				"=rm" (ctx.FP) \
-		)
-
 	#define __x87_store         \
 		uint32_t __mxcr;      \
@@ -102,16 +76,10 @@
 
 #elif defined( __ARM_ARCH )
-#define CtxGet( ctx ) __asm__ ( \
-		"mov %0,%%sp\n"   \
-		"mov %1,%%r11\n"   \
-	: "=rm" (ctx.SP), "=rm" (ctx.FP) )
 #else
 	#error unknown hardware architecture
 #endif
 
-//-----------------------------------------------------------------------------
-//Start and stop routine for the kernel, declared first to make sure they run first
-static void __kernel_startup (void) __attribute__(( constructor( STARTUP_PRIORITY_KERNEL ) ));
-static void __kernel_shutdown(void) __attribute__(( destructor ( STARTUP_PRIORITY_KERNEL ) ));
+extern $thread * mainThread;
+extern processor * mainProcessor;
 
 //-----------------------------------------------------------------------------
@@ -120,251 +88,7 @@
 static bool __has_next_thread(cluster * this);
 static void __run_thread(processor * this, $thread * dst);
-static bool __wake_proc(processor *);
 static bool __wake_one(struct __processor_id_t * id, cluster * cltr);
 static void __halt(processor * this);
-
-//-----------------------------------------------------------------------------
-// Kernel storage
-KERNEL_STORAGE(cluster,	             mainCluster);
-KERNEL_STORAGE(processor,            mainProcessor);
-KERNEL_STORAGE($thread,	             mainThread);
-KERNEL_STORAGE(__stack_t,            mainThreadCtx);
-KERNEL_STORAGE(io_context,           mainPollerThread);
-KERNEL_STORAGE(__scheduler_RWLock_t, __scheduler_lock);
-#if !defined(__CFA_NO_STATISTICS__)
-KERNEL_STORAGE(__stats_t, mainProcStats);
-#endif
-
-cluster              * mainCluster;
-processor            * mainProcessor;
-$thread              * mainThread;
-__scheduler_RWLock_t * __scheduler_lock;
-
-extern "C" {
-	struct { __dllist_t(cluster) list; __spinlock_t lock; } __cfa_dbg_global_clusters;
-}
-
-size_t __page_size = 0;
-
-//-----------------------------------------------------------------------------
-// Global state
-thread_local struct KernelThreadData kernelTLS __attribute__ ((tls_model ( "initial-exec" ))) @= {
-	NULL,												// cannot use 0p
-	NULL,
-	NULL,
-	{ 1, false, false },
-};
-
-//-----------------------------------------------------------------------------
-// Struct to steal stack
-struct current_stack_info_t {
-	__stack_t * storage;								// pointer to stack object
-	void * base;										// base of stack
-	void * limit;										// stack grows towards stack limit
-	void * context;										// address of cfa_context_t
-};
-
-void ?{}( current_stack_info_t & this ) {
-	__stack_context_t ctx;
-	CtxGet( ctx );
-	this.base = ctx.FP;
-
-	rlimit r;
-	getrlimit( RLIMIT_STACK, &r);
-	size_t size = r.rlim_cur;
-
-	this.limit = (void *)(((intptr_t)this.base) - size);
-	this.context = &storage_mainThreadCtx;
-}
-
-//-----------------------------------------------------------------------------
-// Main thread construction
-
-void ?{}( $coroutine & this, current_stack_info_t * info) with( this ) {
-	stack.storage = info->storage;
-	with(*stack.storage) {
-		limit     = info->limit;
-		base      = info->base;
-	}
-	__attribute__((may_alias)) intptr_t * istorage = (intptr_t*) &stack.storage;
-	*istorage |= 0x1;
-	name = "Main Thread";
-	state = Start;
-	starter = 0p;
-	last = 0p;
-	cancellation = 0p;
-}
-
-void ?{}( $thread & this, current_stack_info_t * info) with( this ) {
-	ticket = 1;
-	state = Start;
-	self_cor{ info };
-	curr_cor = &self_cor;
-	curr_cluster = mainCluster;
-	self_mon.owner = &this;
-	self_mon.recursion = 1;
-	self_mon_p = &self_mon;
-	link.next = 0p;
-	link.prev = 0p;
-
-	node.next = 0p;
-	node.prev = 0p;
-	doregister(curr_cluster, this);
-
-	monitors{ &self_mon_p, 1, (fptr_t)0 };
-}
-
-//-----------------------------------------------------------------------------
-// Processor coroutine
-void ?{}(processorCtx_t & this) {
-
-}
-
-// Construct the processor context of non-main processors
-static void ?{}(processorCtx_t & this, processor * proc, current_stack_info_t * info) {
-	(this.__cor){ info };
-	this.proc = proc;
-}
-
-static void * __invoke_processor(void * arg);
-
-static init(processor & this, const char name[], cluster & _cltr) with( this ) {
-	this.name = name;
-	this.cltr = &_cltr;
-	id = -1u;
-	destroyer = 0p;
-	do_terminate = false;
-	preemption_alarm = 0p;
-	pending_preemption = false;
-
-	#if !defined(__CFA_NO_STATISTICS__)
-		print_stats = 0;
-		print_halts = false;
-	#endif
-
-	int target = __atomic_add_fetch( &cltr->nprocessors, 1u, __ATOMIC_SEQ_CST );
-
-	id = doregister((__processor_id_t*)&this);
-
-	// Lock the RWlock so no-one pushes/pops while we are changing the queue
-	uint_fast32_t last_size = ready_mutate_lock();
-
-		// Adjust the ready queue size
-		ready_queue_grow( cltr, target );
-
-	// Unlock the RWlock
-	ready_mutate_unlock( last_size );
-
-	__cfadbg_print_safe(runtime_core, "Kernel : core %p created\n", &this);
-}
-
-// Not a ctor, it just preps the destruction but should not destroy members
-void deinit(processor & this) {
-
-	int target = __atomic_sub_fetch( &this.cltr->nprocessors, 1u, __ATOMIC_SEQ_CST );
-
-	// Lock the RWlock so no-one pushes/pops while we are changing the queue
-	uint_fast32_t last_size = ready_mutate_lock();
-
-		// Adjust the ready queue size
-		ready_queue_shrink( this.cltr, target );
-
-		// Make sure we aren't on the idle queue
-		unsafe_remove( this.cltr->idles, &this );
-
-	// Unlock the RWlock
-	ready_mutate_unlock( last_size );
-
-	// Finally we don't need the read_lock any more
-	unregister((__processor_id_t*)&this);
-}
-
-void ?{}(processor & this, const char name[], cluster & _cltr) {
-	( this.idle ){};
-	( this.terminated ){ 0 };
-	( this.runner ){};
-	init( this, name, _cltr );
-
-	__cfadbg_print_safe(runtime_core, "Kernel : Starting core %p\n", &this);
-
-	this.stack = __create_pthread( &this.kernel_thread, __invoke_processor, (void *)&this );
-
-}
-
-void ^?{}(processor & this) with( this ){
-	if( ! __atomic_load_n(&do_terminate, __ATOMIC_ACQUIRE) ) {
-		__cfadbg_print_safe(runtime_core, "Kernel : core %p signaling termination\n", &this);
-
-		__atomic_store_n(&do_terminate, true, __ATOMIC_RELAXED);
-		__wake_proc( &this );
-
-		P( terminated );
-		verify( kernelTLS.this_processor != &this);
-	}
-
-	int err = pthread_join( kernel_thread, 0p );
-	if( err != 0 ) abort("KERNEL ERROR: joining processor %p caused error %s\n", &this, strerror(err));
-
-	free( this.stack );
-
-	deinit( this );
-}
-
-void ?{}(cluster & this, const char name[], Duration preemption_rate, unsigned num_io, const io_context_params & io_params) with( this ) {
-	this.name = name;
-	this.preemption_rate = preemption_rate;
-	this.nprocessors = 0;
-	ready_queue{};
-
-	#if !defined(__CFA_NO_STATISTICS__)
-		print_stats = 0;
-		stats = alloc();
-		__init_stats( stats );
-	#endif
-
-	threads{ __get };
-
-	doregister(this);
-
-	// Lock the RWlock so no-one pushes/pops while we are changing the queue
-	uint_fast32_t last_size = ready_mutate_lock();
-
-		// Adjust the ready queue size
-		ready_queue_grow( &this, 0 );
-
-	// Unlock the RWlock
-	ready_mutate_unlock( last_size );
-
-	this.io.cnt  = num_io;
-	this.io.ctxs = aalloc(num_io);
-	for(i; this.io.cnt) {
-		(this.io.ctxs[i]){ this, io_params };
-	}
-}
-
-void ^?{}(cluster & this) {
-	for(i; this.io.cnt) {
-		^(this.io.ctxs[i]){ true };
-	}
-	free(this.io.ctxs);
-
-	// Lock the RWlock so no-one pushes/pops while we are changing the queue
-	uint_fast32_t last_size = ready_mutate_lock();
-
-		// Adjust the ready queue size
-		ready_queue_shrink( &this, 0 );
-
-	// Unlock the RWlock
-	ready_mutate_unlock( last_size );
-
-	#if !defined(__CFA_NO_STATISTICS__)
-		if( 0 != this.print_stats ) {
-			__print_stats( this.stats, this.print_stats, true, this.name, (void*)&this );
-		}
-		free( this.stats );
-	#endif
-
-	unregister(this);
-}
+bool __wake_proc(processor *);
 
 //=============================================================================================
@@ -557,147 +281,4 @@
 }
 
-// KERNEL_ONLY
-// Context invoker for processors
-// This is the entry point for processors (kernel threads)
-// It effectively constructs a coroutine by stealing the pthread stack
-static void * __invoke_processor(void * arg) {
-	#if !defined( __CFA_NO_STATISTICS__ )
-		__stats_t local_stats;
-		__init_stats( &local_stats );
-		kernelTLS.this_stats = &local_stats;
-	#endif
-
-	processor * proc = (processor *) arg;
-	kernelTLS.this_processor = proc;
-	kernelTLS.this_thread    = 0p;
-	kernelTLS.preemption_state.[enabled, disable_count] = [false, 1];
-	// SKULLDUGGERY: We want to create a context for the processor coroutine
-	// which is needed for the 2-step context switch. However, there is no reason
-	// to waste the perfectly valid stack create by pthread.
-	current_stack_info_t info;
-	__stack_t ctx;
-	info.storage = &ctx;
-	(proc->runner){ proc, &info };
-
-	__cfaabi_dbg_print_safe("Coroutine : created stack %p\n", get_coroutine(proc->runner)->stack.storage);
-
-	//Set global state
-	kernelTLS.this_thread = 0p;
-
-	//We now have a proper context from which to schedule threads
-	__cfadbg_print_safe(runtime_core, "Kernel : core %p created (%p, %p)\n", proc, &proc->runner, &ctx);
-
-	// SKULLDUGGERY: Since the coroutine doesn't have its own stack, we can't
-	// resume it to start it like it normally would, it will just context switch
-	// back to here. Instead directly call the main since we already are on the
-	// appropriate stack.
-	get_coroutine(proc->runner)->state = Active;
-	main( proc->runner );
-	get_coroutine(proc->runner)->state = Halted;
-
-	// Main routine of the core returned, the core is now fully terminated
-	__cfadbg_print_safe(runtime_core, "Kernel : core %p main ended (%p)\n", proc, &proc->runner);
-
-	#if !defined(__CFA_NO_STATISTICS__)
-		__tally_stats(proc->cltr->stats, &local_stats);
-		if( 0 != proc->print_stats ) {
-			__print_stats( &local_stats, proc->print_stats, true, proc->name, (void*)proc );
-		}
-	#endif
-
-	return 0p;
-}
-
-static void Abort( int ret, const char func[] ) {
-	if ( ret ) {										// pthread routines return errno values
-		abort( "%s : internal error, error(%d) %s.", func, ret, strerror( ret ) );
-	} // if
-} // Abort
-
-void * __create_pthread( pthread_t * pthread, void * (*start)(void *), void * arg ) {
-	pthread_attr_t attr;
-
-	Abort( pthread_attr_init( &attr ), "pthread_attr_init" ); // initialize attribute
-
-	size_t stacksize;
-	// default stack size, normally defined by shell limit
-	Abort( pthread_attr_getstacksize( &attr, &stacksize ), "pthread_attr_getstacksize" );
-	assert( stacksize >= PTHREAD_STACK_MIN );
-
-	void * stack;
-	__cfaabi_dbg_debug_do(
-		stack = memalign( __page_size, stacksize + __page_size );
-		// pthread has no mechanism to create the guard page in user supplied stack.
-		if ( mprotect( stack, __page_size, PROT_NONE ) == -1 ) {
-			abort( "mprotect : internal error, mprotect failure, error(%d) %s.", errno, strerror( errno ) );
-		} // if
-	);
-	__cfaabi_dbg_no_debug_do(
-		stack = malloc( stacksize );
-	);
-
-	Abort( pthread_attr_setstack( &attr, stack, stacksize ), "pthread_attr_setstack" );
-
-	Abort( pthread_create( pthread, &attr, start, arg ), "pthread_create" );
-	return stack;
-}
-
-// KERNEL_ONLY
-static void __kernel_first_resume( processor * this ) {
-	$thread * src = mainThread;
-	$coroutine * dst = get_coroutine(this->runner);
-
-	verify( ! kernelTLS.preemption_state.enabled );
-
-	kernelTLS.this_thread->curr_cor = dst;
-	__stack_prepare( &dst->stack, 65000 );
-	__cfactx_start(main, dst, this->runner, __cfactx_invoke_coroutine);
-
-	verify( ! kernelTLS.preemption_state.enabled );
-
-	dst->last = &src->self_cor;
-	dst->starter = dst->starter ? dst->starter : &src->self_cor;
-
-	// make sure the current state is still correct
-	/* paranoid */ verify(src->state == Ready);
-
-	// context switch to specified coroutine
-	verify( dst->context.SP );
-	__cfactx_switch( &src->context, &dst->context );
-	// when __cfactx_switch returns we are back in the src coroutine
-
-	mainThread->curr_cor = &mainThread->self_cor;
-
-	// make sure the current state has been update
-	/* paranoid */ verify(src->state == Active);
-
-	verify( ! kernelTLS.preemption_state.enabled );
-}
-
-// KERNEL_ONLY
-static void __kernel_last_resume( processor * this ) {
-	$coroutine * src = &mainThread->self_cor;
-	$coroutine * dst = get_coroutine(this->runner);
-
-	verify( ! kernelTLS.preemption_state.enabled );
-	verify( dst->starter == src );
-	verify( dst->context.SP );
-
-	// SKULLDUGGERY in debug the processors check that the
-	// stack is still within the limit of the stack limits after running a thread.
-	// that check doesn't make sense if we context switch to the processor using the
-	// coroutine semantics. Since this is a special case, use the current context
-	// info to populate these fields.
-	__cfaabi_dbg_debug_do(
-		__stack_context_t ctx;
-		CtxGet( ctx );
-		mainThread->context.SP = ctx.SP;
-		mainThread->context.FP = ctx.FP;
-	)
-
-	// context switch to the processor
-	__cfactx_switch( &src->context, &dst->context );
-}
-
 //-----------------------------------------------------------------------------
 // Scheduler routines
@@ -841,157 +422,4 @@
 
 //=============================================================================================
-// Kernel Setup logic
-//=============================================================================================
-//-----------------------------------------------------------------------------
-// Kernel boot procedures
-static void __kernel_startup(void) {
-	verify( ! kernelTLS.preemption_state.enabled );
-	__cfadbg_print_safe(runtime_core, "Kernel : Starting\n");
-
-	__page_size = sysconf( _SC_PAGESIZE );
-
-	__cfa_dbg_global_clusters.list{ __get };
-	__cfa_dbg_global_clusters.lock{};
-
-	// Initialize the global scheduler lock
-	__scheduler_lock = (__scheduler_RWLock_t*)&storage___scheduler_lock;
-	(*__scheduler_lock){};
-
-	// Initialize the main cluster
-	mainCluster = (cluster *)&storage_mainCluster;
-	(*mainCluster){"Main Cluster", 0};
-
-	__cfadbg_print_safe(runtime_core, "Kernel : Main cluster ready\n");
-
-	// Start by initializing the main thread
-	// SKULLDUGGERY: the mainThread steals the process main thread
-	// which will then be scheduled by the mainProcessor normally
-	mainThread = ($thread *)&storage_mainThread;
-	current_stack_info_t info;
-	info.storage = (__stack_t*)&storage_mainThreadCtx;
-	(*mainThread){ &info };
-
-	__cfadbg_print_safe(runtime_core, "Kernel : Main thread ready\n");
-
-
-
-	// Construct the processor context of the main processor
-	void ?{}(processorCtx_t & this, processor * proc) {
-		(this.__cor){ "Processor" };
-		this.__cor.starter = 0p;
-		this.proc = proc;
-	}
-
-	void ?{}(processor & this) with( this ) {
-		( this.idle ){};
-		( this.terminated ){ 0 };
-		( this.runner ){};
-		init( this, "Main Processor", *mainCluster );
-		kernel_thread = pthread_self();
-
-		runner{ &this };
-		__cfadbg_print_safe(runtime_core, "Kernel : constructed main processor context %p\n", &runner);
-	}
-
-	// Initialize the main processor and the main processor ctx
-	// (the coroutine that contains the processing control flow)
-	mainProcessor = (processor *)&storage_mainProcessor;
-	(*mainProcessor){};
-
-	//initialize the global state variables
-	kernelTLS.this_processor = mainProcessor;
-	kernelTLS.this_thread    = mainThread;
-
-	#if !defined( __CFA_NO_STATISTICS__ )
-		kernelTLS.this_stats = (__stats_t *)& storage_mainProcStats;
-		__init_stats( kernelTLS.this_stats );
-	#endif
-
-	// Start IO
-	__kernel_io_startup();
-
-	// Enable preemption
-	kernel_start_preemption();
-
-	// Add the main thread to the ready queue
-	// once resume is called on mainProcessor->runner the mainThread needs to be scheduled like any normal thread
-	__schedule_thread((__processor_id_t *)mainProcessor, mainThread);
-
-	// SKULLDUGGERY: Force a context switch to the main processor to set the main thread's context to the current UNIX
-	// context. Hence, the main thread does not begin through __cfactx_invoke_thread, like all other threads. The trick here is that
-	// mainThread is on the ready queue when this call is made.
-	__kernel_first_resume( kernelTLS.this_processor );
-
-
-	// THE SYSTEM IS NOW COMPLETELY RUNNING
-
-
-	// Now that the system is up, finish creating systems that need threading
-	mainCluster->io.ctxs = (io_context *)&storage_mainPollerThread;
-	mainCluster->io.cnt  = 1;
-	(*mainCluster->io.ctxs){ *mainCluster };
-
-	__cfadbg_print_safe(runtime_core, "Kernel : Started\n--------------------------------------------------\n\n");
-
-	verify( ! kernelTLS.preemption_state.enabled );
-	enable_interrupts( __cfaabi_dbg_ctx );
-	verify( TL_GET( preemption_state.enabled ) );
-}
-
-static void __kernel_shutdown(void) {
-	//Before we start shutting things down, wait for systems that need threading to shutdown
-	^(*mainCluster->io.ctxs){};
-	mainCluster->io.cnt  = 0;
-	mainCluster->io.ctxs = 0p;
-
-	/* paranoid */ verify( TL_GET( preemption_state.enabled ) );
-	disable_interrupts();
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-
-	__cfadbg_print_safe(runtime_core, "\n--------------------------------------------------\nKernel : Shutting down\n");
-
-	// SKULLDUGGERY: Notify the mainProcessor it needs to terminates.
-	// When its coroutine terminates, it return control to the mainThread
-	// which is currently here
-	__atomic_store_n(&mainProcessor->do_terminate, true, __ATOMIC_RELEASE);
-	__kernel_last_resume( kernelTLS.this_processor );
-	mainThread->self_cor.state = Halted;
-
-	// THE SYSTEM IS NOW COMPLETELY STOPPED
-
-	// Disable preemption
-	kernel_stop_preemption();
-
-	// Stop IO
-	__kernel_io_shutdown();
-
-	// Destroy the main processor and its context in reverse order of construction
-	// These were manually constructed so we need manually destroy them
-	void ^?{}(processor & this) with( this ){
-		deinit( this );
-
-		/* paranoid */ verify( this.do_terminate == true );
-		__cfaabi_dbg_print_safe("Kernel : destroyed main processor context %p\n", &runner);
-	}
-
-	^(*mainProcessor){};
-
-	// Final step, destroy the main thread since it is no longer needed
-
-	// Since we provided a stack to this taxk it will not destroy anything
-	/* paranoid */ verify(mainThread->self_cor.stack.storage == (__stack_t*)(((uintptr_t)&storage_mainThreadCtx)| 0x1));
-	^(*mainThread){};
-
-	^(*mainCluster){};
-
-	^(*__scheduler_lock){};
-
-	^(__cfa_dbg_global_clusters.list){};
-	^(__cfa_dbg_global_clusters.lock){};
-
-	__cfadbg_print_safe(runtime_core, "Kernel : Shutdown complete\n");
-}
-
-//=============================================================================================
 // Kernel Idle Sleep
 //=============================================================================================
@@ -1013,5 +441,5 @@
 
 // Unconditionnaly wake a thread
-static bool __wake_proc(processor * this) {
+bool __wake_proc(processor * this) {
 	__cfadbg_print_safe(runtime_core, "Kernel : waking Processor %p\n", this);
 
@@ -1189,32 +617,4 @@
 
 //-----------------------------------------------------------------------------
-// Global Queues
-void doregister( cluster     & cltr ) {
-	lock      ( __cfa_dbg_global_clusters.lock __cfaabi_dbg_ctx2);
-	push_front( __cfa_dbg_global_clusters.list, cltr );
-	unlock    ( __cfa_dbg_global_clusters.lock );
-}
-
-void unregister( cluster     & cltr ) {
-	lock  ( __cfa_dbg_global_clusters.lock __cfaabi_dbg_ctx2);
-	remove( __cfa_dbg_global_clusters.list, cltr );
-	unlock( __cfa_dbg_global_clusters.lock );
-}
-
-void doregister( cluster * cltr, $thread & thrd ) {
-	lock      (cltr->thread_list_lock __cfaabi_dbg_ctx2);
-	cltr->nthreads += 1;
-	push_front(cltr->threads, thrd);
-	unlock    (cltr->thread_list_lock);
-}
-
-void unregister( cluster * cltr, $thread & thrd ) {
-	lock  (cltr->thread_list_lock __cfaabi_dbg_ctx2);
-	remove(cltr->threads, thrd );
-	cltr->nthreads -= 1;
-	unlock(cltr->thread_list_lock);
-}
-
-//-----------------------------------------------------------------------------
 // Debug
 __cfaabi_dbg_debug_do(
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 3ac8b9f795fc403158704c66c51faffe5be12842)
+++ libcfa/src/concurrency/kernel.hfa	(revision e660761701e94bf2b2ae0130750e911d4122366f)
@@ -16,7 +16,4 @@
 #pragma once
 
-#include <stdbool.h>
-#include <stdint.h>
-
 #include "invoke.h"
 #include "time_t.hfa"
@@ -27,5 +24,4 @@
 extern "C" {
 #include <pthread.h>
-#include <semaphore.h>
 }
 
Index: libcfa/src/concurrency/kernel/fwd.hfa
===================================================================
--- libcfa/src/concurrency/kernel/fwd.hfa	(revision e660761701e94bf2b2ae0130750e911d4122366f)
+++ libcfa/src/concurrency/kernel/fwd.hfa	(revision e660761701e94bf2b2ae0130750e911d4122366f)
@@ -0,0 +1,81 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2020 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// kernel/fwd.hfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Thu Jul 30 16:46:41 2020
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#include "bits/defs.hfa"
+#include "bits/debug.hfa"
+
+#if !defined(__cforall_thread__)
+#error non-thread source file includes kernel/fwd.hfa
+#endif
+
+struct $thread;
+struct processor;
+struct cluster;
+
+#ifdef __cforall
+extern "C" {
+	extern "Cforall" {
+		extern __attribute__((aligned(128))) thread_local struct KernelThreadData {
+			struct $thread    * volatile this_thread;
+			struct processor  * volatile this_processor;
+			struct __stats_t  * volatile this_stats;
+
+			struct {
+				volatile unsigned short disable_count;
+				volatile bool enabled;
+				volatile bool in_progress;
+			} preemption_state;
+
+			#if defined(__SIZEOF_INT128__)
+				__uint128_t rand_seed;
+			#else
+				uint64_t rand_seed;
+			#endif
+		} kernelTLS __attribute__ ((tls_model ( "initial-exec" )));
+	}
+
+	#ifdef __ARM_ARCH
+		// function prototypes are only really used by these macros on ARM
+		void disable_global_interrupts();
+		void enable_global_interrupts();
+
+		#define TL_GET( member ) ( { __typeof__( kernelTLS.member ) target; \
+			disable_global_interrupts(); \
+			target = kernelTLS.member; \
+			enable_global_interrupts(); \
+			target; } )
+		#define TL_SET( member, value ) disable_global_interrupts(); \
+			kernelTLS.member = value; \
+			enable_global_interrupts();
+	#else
+		#define TL_GET( member ) kernelTLS.member
+		#define TL_SET( member, value ) kernelTLS.member = value;
+	#endif
+
+	extern void disable_interrupts();
+	extern void enable_interrupts_noPoll();
+	extern void enable_interrupts( __cfaabi_dbg_ctx_param );
+
+	enum __Preemption_Reason { __NO_PREEMPTION, __ALARM_PREEMPTION, __POLL_PREEMPTION, __MANUAL_PREEMPTION };
+
+	extern "Cforall" {
+		extern void park( __cfaabi_dbg_ctx_param );
+		extern void unpark( struct $thread * this __cfaabi_dbg_ctx_param2 );
+		static inline struct $thread * active_thread () { return TL_GET( this_thread ); }
+
+		extern bool force_yield( enum __Preemption_Reason );
+	}
+}
+#endif
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision e660761701e94bf2b2ae0130750e911d4122366f)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision e660761701e94bf2b2ae0130750e911d4122366f)
@@ -0,0 +1,669 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2020 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// kernel/startup.cfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Thu Jul 30 15:12:54 2020
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#define __cforall_thread__
+
+// C Includes
+#include <errno.h>              // errno
+#include <string.h>             // strerror
+#include <unistd.h>             // sysconf
+extern "C" {
+      #include <limits.h>       // PTHREAD_STACK_MIN
+      #include <sys/mman.h>     // mprotect
+      #include <sys/resource.h> // getrlimit
+}
+
+// CFA Includes
+#include "kernel_private.hfa"
+#include "startup.hfa"          // STARTUP_PRIORITY_XXX
+
+//-----------------------------------------------------------------------------
+// Some assembly required
+#if defined( __i386 )
+	#define CtxGet( ctx )        \
+		__asm__ volatile (     \
+			"movl %%esp,%0\n"\
+			"movl %%ebp,%1\n"\
+			: "=rm" (ctx.SP),\
+				"=rm" (ctx.FP) \
+		)
+#elif defined( __x86_64 )
+	#define CtxGet( ctx )        \
+		__asm__ volatile (     \
+			"movq %%rsp,%0\n"\
+			"movq %%rbp,%1\n"\
+			: "=rm" (ctx.SP),\
+				"=rm" (ctx.FP) \
+		)
+#elif defined( __ARM_ARCH )
+#define CtxGet( ctx ) __asm__ ( \
+		"mov %0,%%sp\n"   \
+		"mov %1,%%r11\n"   \
+	: "=rm" (ctx.SP), "=rm" (ctx.FP) )
+#else
+	#error unknown hardware architecture
+#endif
+
+//-----------------------------------------------------------------------------
+// Start and stop routine for the kernel, declared first to make sure they run first
+static void __kernel_startup (void) __attribute__(( constructor( STARTUP_PRIORITY_KERNEL ) ));
+static void __kernel_shutdown(void) __attribute__(( destructor ( STARTUP_PRIORITY_KERNEL ) ));
+
+//-----------------------------------------------------------------------------
+// Static Forward Declarations
+struct current_stack_info_t;
+
+static void * __invoke_processor(void * arg);
+static void __kernel_first_resume( processor * this );
+static void __kernel_last_resume ( processor * this );
+static void init(processor & this, const char name[], cluster & _cltr);
+static void deinit(processor & this);
+static void doregister( struct cluster & cltr );
+static void unregister( struct cluster & cltr );
+static void ?{}( $coroutine & this, current_stack_info_t * info);
+static void ?{}( $thread & this, current_stack_info_t * info);
+static void ?{}(processorCtx_t & this) {}
+static void ?{}(processorCtx_t & this, processor * proc, current_stack_info_t * info);
+
+//-----------------------------------------------------------------------------
+// Forward Declarations for other modules
+extern void __kernel_alarm_startup(void);
+extern void __kernel_alarm_shutdown(void);
+extern void __kernel_io_startup (void);
+extern void __kernel_io_shutdown(void);
+
+//-----------------------------------------------------------------------------
+// Other Forward Declarations
+extern bool __wake_proc(processor *);
+
+//-----------------------------------------------------------------------------
+// Kernel storage
+#warning duplicated in preemption.cfa
+#define KERNEL_STORAGE(T,X) __attribute((aligned(__alignof__(T)))) static char storage_##X[sizeof(T)]
+KERNEL_STORAGE(cluster,	             mainCluster);
+KERNEL_STORAGE(processor,            mainProcessor);
+KERNEL_STORAGE($thread,	             mainThread);
+KERNEL_STORAGE(__stack_t,            mainThreadCtx);
+KERNEL_STORAGE(io_context,           mainPollerThread);
+KERNEL_STORAGE(__scheduler_RWLock_t, __scheduler_lock);
+#if !defined(__CFA_NO_STATISTICS__)
+KERNEL_STORAGE(__stats_t, mainProcStats);
+#endif
+
+cluster              * mainCluster;
+processor            * mainProcessor;
+$thread              * mainThread;
+__scheduler_RWLock_t * __scheduler_lock;
+
+extern "C" {
+	struct { __dllist_t(cluster) list; __spinlock_t lock; } __cfa_dbg_global_clusters;
+}
+
+size_t __page_size = 0;
+
+//-----------------------------------------------------------------------------
+// Global state
+thread_local struct KernelThreadData kernelTLS __attribute__ ((tls_model ( "initial-exec" ))) @= {
+	NULL,												// cannot use 0p
+	NULL,
+	NULL,
+	{ 1, false, false },
+};
+
+//-----------------------------------------------------------------------------
+// Struct to steal stack
+struct current_stack_info_t {
+	__stack_t * storage;  // pointer to stack object
+	void * base;          // base of stack
+	void * limit;         // stack grows towards stack limit
+	void * context;       // address of cfa_context_t
+};
+
+void ?{}( current_stack_info_t & this ) {
+	__stack_context_t ctx;
+	CtxGet( ctx );
+	this.base = ctx.FP;
+
+	rlimit r;
+	getrlimit( RLIMIT_STACK, &r);
+	size_t size = r.rlim_cur;
+
+	this.limit = (void *)(((intptr_t)this.base) - size);
+	this.context = &storage_mainThreadCtx;
+}
+
+
+
+//=============================================================================================
+// Kernel Setup logic
+//=============================================================================================
+//-----------------------------------------------------------------------------
+// Kernel boot procedures
+static void __kernel_startup(void) {
+	verify( ! kernelTLS.preemption_state.enabled );
+	__cfadbg_print_safe(runtime_core, "Kernel : Starting\n");
+
+	__page_size = sysconf( _SC_PAGESIZE );
+
+	__cfa_dbg_global_clusters.list{ __get };
+	__cfa_dbg_global_clusters.lock{};
+
+	// Initialize the global scheduler lock
+	__scheduler_lock = (__scheduler_RWLock_t*)&storage___scheduler_lock;
+	(*__scheduler_lock){};
+
+	// Initialize the main cluster
+	mainCluster = (cluster *)&storage_mainCluster;
+	(*mainCluster){"Main Cluster", 0};
+
+	__cfadbg_print_safe(runtime_core, "Kernel : Main cluster ready\n");
+
+	// Start by initializing the main thread
+	// SKULLDUGGERY: the mainThread steals the process main thread
+	// which will then be scheduled by the mainProcessor normally
+	mainThread = ($thread *)&storage_mainThread;
+	current_stack_info_t info;
+	info.storage = (__stack_t*)&storage_mainThreadCtx;
+	(*mainThread){ &info };
+
+	__cfadbg_print_safe(runtime_core, "Kernel : Main thread ready\n");
+
+
+
+	// Construct the processor context of the main processor
+	void ?{}(processorCtx_t & this, processor * proc) {
+		(this.__cor){ "Processor" };
+		this.__cor.starter = 0p;
+		this.proc = proc;
+	}
+
+	void ?{}(processor & this) with( this ) {
+		( this.idle ){};
+		( this.terminated ){ 0 };
+		( this.runner ){};
+		init( this, "Main Processor", *mainCluster );
+		kernel_thread = pthread_self();
+
+		runner{ &this };
+		__cfadbg_print_safe(runtime_core, "Kernel : constructed main processor context %p\n", &runner);
+	}
+
+	// Initialize the main processor and the main processor ctx
+	// (the coroutine that contains the processing control flow)
+	mainProcessor = (processor *)&storage_mainProcessor;
+	(*mainProcessor){};
+
+	//initialize the global state variables
+	kernelTLS.this_processor = mainProcessor;
+	kernelTLS.this_thread    = mainThread;
+
+	#if !defined( __CFA_NO_STATISTICS__ )
+		kernelTLS.this_stats = (__stats_t *)& storage_mainProcStats;
+		__init_stats( kernelTLS.this_stats );
+	#endif
+
+	// Enable preemption
+	__kernel_alarm_startup();
+
+	// Start IO
+	__kernel_io_startup();
+
+	// Add the main thread to the ready queue
+	// once resume is called on mainProcessor->runner the mainThread needs to be scheduled like any normal thread
+	__schedule_thread((__processor_id_t *)mainProcessor, mainThread);
+
+	// SKULLDUGGERY: Force a context switch to the main processor to set the main thread's context to the current UNIX
+	// context. Hence, the main thread does not begin through __cfactx_invoke_thread, like all other threads. The trick here is that
+	// mainThread is on the ready queue when this call is made.
+	__kernel_first_resume( kernelTLS.this_processor );
+
+
+	// THE SYSTEM IS NOW COMPLETELY RUNNING
+
+
+	// SKULLDUGGERY: The constructor for the mainCluster will call alloc with a dimension of 0
+	// malloc *can* return a non-null value, we should free it if that is the case
+	free( mainCluster->io.ctxs );
+
+	// Now that the system is up, finish creating systems that need threading
+	mainCluster->io.ctxs = (io_context *)&storage_mainPollerThread;
+	mainCluster->io.cnt  = 1;
+	(*mainCluster->io.ctxs){ *mainCluster };
+
+	__cfadbg_print_safe(runtime_core, "Kernel : Started\n--------------------------------------------------\n\n");
+
+	verify( ! kernelTLS.preemption_state.enabled );
+	enable_interrupts( __cfaabi_dbg_ctx );
+	verify( TL_GET( preemption_state.enabled ) );
+}
+
+static void __kernel_shutdown(void) {
+	//Before we start shutting things down, wait for systems that need threading to shutdown
+	^(*mainCluster->io.ctxs){};
+	mainCluster->io.cnt  = 0;
+	mainCluster->io.ctxs = 0p;
+
+	/* paranoid */ verify( TL_GET( preemption_state.enabled ) );
+	disable_interrupts();
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+
+	__cfadbg_print_safe(runtime_core, "\n--------------------------------------------------\nKernel : Shutting down\n");
+
+	// SKULLDUGGERY: Notify the mainProcessor it needs to terminate.
+	// When its coroutine terminates, it returns control to the mainThread
+	// which is currently here
+	__atomic_store_n(&mainProcessor->do_terminate, true, __ATOMIC_RELEASE);
+	__kernel_last_resume( kernelTLS.this_processor );
+	mainThread->self_cor.state = Halted;
+
+	// THE SYSTEM IS NOW COMPLETELY STOPPED
+
+	// Disable preemption
+	__kernel_alarm_shutdown();
+
+	// Stop IO
+	__kernel_io_shutdown();
+
+	// Destroy the main processor and its context in reverse order of construction
+	// These were manually constructed so we need manually destroy them
+	void ^?{}(processor & this) with( this ){
+		deinit( this );
+
+		/* paranoid */ verify( this.do_terminate == true );
+		__cfaabi_dbg_print_safe("Kernel : destroyed main processor context %p\n", &runner);
+	}
+
+	^(*mainProcessor){};
+
+	// Final step, destroy the main thread since it is no longer needed
+
+	// Since we provided a stack to this task it will not destroy anything
+	/* paranoid */ verify(mainThread->self_cor.stack.storage == (__stack_t*)(((uintptr_t)&storage_mainThreadCtx)| 0x1));
+	^(*mainThread){};
+
+	^(*mainCluster){};
+
+	^(*__scheduler_lock){};
+
+	^(__cfa_dbg_global_clusters.list){};
+	^(__cfa_dbg_global_clusters.lock){};
+
+	__cfadbg_print_safe(runtime_core, "Kernel : Shutdown complete\n");
+}
+
+//=============================================================================================
+// Kernel Initial Scheduling logic
+//=============================================================================================
+
+// Context invoker for processors
+// This is the entry point for processors (kernel threads) *except* for the main processor
+// It effectively constructs a coroutine by stealing the pthread stack
+static void * __invoke_processor(void * arg) {
+	#if !defined( __CFA_NO_STATISTICS__ )
+		__stats_t local_stats;
+		__init_stats( &local_stats );
+		kernelTLS.this_stats = &local_stats;
+	#endif
+
+	processor * proc = (processor *) arg;
+	kernelTLS.this_processor = proc;
+	kernelTLS.this_thread    = 0p;
+	kernelTLS.preemption_state.[enabled, disable_count] = [false, 1];
+	// SKULLDUGGERY: We want to create a context for the processor coroutine
+	// which is needed for the 2-step context switch. However, there is no reason
+	// to waste the perfectly valid stack created by pthread.
+	current_stack_info_t info;
+	__stack_t ctx;
+	info.storage = &ctx;
+	(proc->runner){ proc, &info };
+
+	__cfaabi_dbg_print_safe("Coroutine : created stack %p\n", get_coroutine(proc->runner)->stack.storage);
+
+	//Set global state
+	kernelTLS.this_thread = 0p;
+
+	//We now have a proper context from which to schedule threads
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p created (%p, %p)\n", proc, &proc->runner, &ctx);
+
+	// SKULLDUGGERY: Since the coroutine doesn't have its own stack, we can't
+	// resume it to start it like it normally would, it will just context switch
+	// back to here. Instead directly call the main since we already are on the
+	// appropriate stack.
+	get_coroutine(proc->runner)->state = Active;
+	main( proc->runner );
+	get_coroutine(proc->runner)->state = Halted;
+
+	// Main routine of the core returned, the core is now fully terminated
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p main ended (%p)\n", proc, &proc->runner);
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		__tally_stats(proc->cltr->stats, &local_stats);
+		if( 0 != proc->print_stats ) {
+			__print_stats( &local_stats, proc->print_stats, true, proc->name, (void*)proc );
+		}
+	#endif
+
+	return 0p;
+}
+
+static void __kernel_first_resume( processor * this ) {
+	$thread * src = mainThread;
+	$coroutine * dst = get_coroutine(this->runner);
+
+	verify( ! kernelTLS.preemption_state.enabled );
+
+	kernelTLS.this_thread->curr_cor = dst;
+	__stack_prepare( &dst->stack, 65000 );
+	__cfactx_start(main, dst, this->runner, __cfactx_invoke_coroutine);
+
+	verify( ! kernelTLS.preemption_state.enabled );
+
+	dst->last = &src->self_cor;
+	dst->starter = dst->starter ? dst->starter : &src->self_cor;
+
+	// make sure the current state is still correct
+	/* paranoid */ verify(src->state == Ready);
+
+	// context switch to specified coroutine
+	verify( dst->context.SP );
+	__cfactx_switch( &src->context, &dst->context );
+	// when __cfactx_switch returns we are back in the src coroutine
+
+	mainThread->curr_cor = &mainThread->self_cor;
+
+	// make sure the current state has been update
+	/* paranoid */ verify(src->state == Active);
+
+	verify( ! kernelTLS.preemption_state.enabled );
+}
+
+// KERNEL_ONLY
+static void __kernel_last_resume( processor * this ) {
+	$coroutine * src = &mainThread->self_cor;
+	$coroutine * dst = get_coroutine(this->runner);
+
+	verify( ! kernelTLS.preemption_state.enabled );
+	verify( dst->starter == src );
+	verify( dst->context.SP );
+
+	// SKULLDUGGERY in debug the processors check that the
+	// stack is still within the limit of the stack limits after running a thread.
+	// that check doesn't make sense if we context switch to the processor using the
+	// coroutine semantics. Since this is a special case, use the current context
+	// info to populate these fields.
+	__cfaabi_dbg_debug_do(
+		__stack_context_t ctx;
+		CtxGet( ctx );
+		mainThread->context.SP = ctx.SP;
+		mainThread->context.FP = ctx.FP;
+	)
+
+	// context switch to the processor
+	__cfactx_switch( &src->context, &dst->context );
+}
+
+
+//=============================================================================================
+// Kernel Object Constructors logic
+//=============================================================================================
+//-----------------------------------------------------------------------------
+// Main thread construction
+static void ?{}( $coroutine & this, current_stack_info_t * info) with( this ) {
+	stack.storage = info->storage;
+	with(*stack.storage) {
+		limit     = info->limit;
+		base      = info->base;
+	}
+	__attribute__((may_alias)) intptr_t * istorage = (intptr_t*) &stack.storage;
+	*istorage |= 0x1;
+	name = "Main Thread";
+	state = Start;
+	starter = 0p;
+	last = 0p;
+	cancellation = 0p;
+}
+
+static void ?{}( $thread & this, current_stack_info_t * info) with( this ) {
+	ticket = 1;
+	state = Start;
+	self_cor{ info };
+	curr_cor = &self_cor;
+	curr_cluster = mainCluster;
+	self_mon.owner = &this;
+	self_mon.recursion = 1;
+	self_mon_p = &self_mon;
+	link.next = 0p;
+	link.prev = 0p;
+
+	node.next = 0p;
+	node.prev = 0p;
+	doregister(curr_cluster, this);
+
+	monitors{ &self_mon_p, 1, (fptr_t)0 };
+}
+
+//-----------------------------------------------------------------------------
+// Processor
+// Construct the processor context of non-main processors
+static void ?{}(processorCtx_t & this, processor * proc, current_stack_info_t * info) {
+	(this.__cor){ info };
+	this.proc = proc;
+}
+
+static void init(processor & this, const char name[], cluster & _cltr) with( this ) {
+	this.name = name;
+	this.cltr = &_cltr;
+	id = -1u;
+	destroyer = 0p;
+	do_terminate = false;
+	preemption_alarm = 0p;
+	pending_preemption = false;
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		print_stats = 0;
+		print_halts = false;
+	#endif
+
+	int target = __atomic_add_fetch( &cltr->nprocessors, 1u, __ATOMIC_SEQ_CST );
+
+	id = doregister((__processor_id_t*)&this);
+
+	// Lock the RWlock so no-one pushes/pops while we are changing the queue
+	uint_fast32_t last_size = ready_mutate_lock();
+
+		// Adjust the ready queue size
+		ready_queue_grow( cltr, target );
+
+	// Unlock the RWlock
+	ready_mutate_unlock( last_size );
+
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p created\n", &this);
+}
+
+// Not a dtor, it just preps the destruction but should not destroy members
+static void deinit(processor & this) {
+
+	int target = __atomic_sub_fetch( &this.cltr->nprocessors, 1u, __ATOMIC_SEQ_CST );
+
+	// Lock the RWlock so no-one pushes/pops while we are changing the queue
+	uint_fast32_t last_size = ready_mutate_lock();
+
+		// Adjust the ready queue size
+		ready_queue_shrink( this.cltr, target );
+
+		// Make sure we aren't on the idle queue
+		unsafe_remove( this.cltr->idles, &this );
+
+	// Unlock the RWlock
+	ready_mutate_unlock( last_size );
+
+	// Finally we don't need the read_lock any more
+	unregister((__processor_id_t*)&this);
+}
+
+void ?{}(processor & this, const char name[], cluster & _cltr) {
+	( this.idle ){};
+	( this.terminated ){ 0 };
+	( this.runner ){};
+	init( this, name, _cltr );
+
+	__cfadbg_print_safe(runtime_core, "Kernel : Starting core %p\n", &this);
+
+	this.stack = __create_pthread( &this.kernel_thread, __invoke_processor, (void *)&this );
+
+}
+
+void ^?{}(processor & this) with( this ){
+	if( ! __atomic_load_n(&do_terminate, __ATOMIC_ACQUIRE) ) {
+		__cfadbg_print_safe(runtime_core, "Kernel : core %p signaling termination\n", &this);
+
+		__atomic_store_n(&do_terminate, true, __ATOMIC_RELAXED);
+		__wake_proc( &this );
+
+		P( terminated );
+		verify( kernelTLS.this_processor != &this);
+	}
+
+	int err = pthread_join( kernel_thread, 0p );
+	if( err != 0 ) abort("KERNEL ERROR: joining processor %p caused error %s\n", &this, strerror(err));
+
+	free( this.stack );
+
+	deinit( this );
+}
+
+//-----------------------------------------------------------------------------
+// Cluster
+void ?{}(cluster & this, const char name[], Duration preemption_rate, unsigned num_io, const io_context_params & io_params) with( this ) {
+	this.name = name;
+	this.preemption_rate = preemption_rate;
+	this.nprocessors = 0;
+	ready_queue{};
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		print_stats = 0;
+		stats = alloc();
+		__init_stats( stats );
+	#endif
+
+	threads{ __get };
+
+	doregister(this);
+
+	// Lock the RWlock so no-one pushes/pops while we are changing the queue
+	uint_fast32_t last_size = ready_mutate_lock();
+
+		// Adjust the ready queue size
+		ready_queue_grow( &this, 0 );
+
+	// Unlock the RWlock
+	ready_mutate_unlock( last_size );
+
+	this.io.cnt  = num_io;
+	this.io.ctxs = aalloc(num_io);
+	for(i; this.io.cnt) {
+		(this.io.ctxs[i]){ this, io_params };
+	}
+}
+
+void ^?{}(cluster & this) {
+	for(i; this.io.cnt) {
+		^(this.io.ctxs[i]){ true };
+	}
+	free(this.io.ctxs);
+
+	// Lock the RWlock so no-one pushes/pops while we are changing the queue
+	uint_fast32_t last_size = ready_mutate_lock();
+
+		// Adjust the ready queue size
+		ready_queue_shrink( &this, 0 );
+
+	// Unlock the RWlock
+	ready_mutate_unlock( last_size );
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		if( 0 != this.print_stats ) {
+			__print_stats( this.stats, this.print_stats, true, this.name, (void*)&this );
+		}
+		free( this.stats );
+	#endif
+
+	unregister(this);
+}
+
+//=============================================================================================
+// Miscellaneous Initialization
+//=============================================================================================
+//-----------------------------------------------------------------------------
+// Global Queues
+static void doregister( cluster     & cltr ) {
+	lock      ( __cfa_dbg_global_clusters.lock __cfaabi_dbg_ctx2);
+	push_front( __cfa_dbg_global_clusters.list, cltr );
+	unlock    ( __cfa_dbg_global_clusters.lock );
+}
+
+static void unregister( cluster     & cltr ) {
+	lock  ( __cfa_dbg_global_clusters.lock __cfaabi_dbg_ctx2);
+	remove( __cfa_dbg_global_clusters.list, cltr );
+	unlock( __cfa_dbg_global_clusters.lock );
+}
+
+void doregister( cluster * cltr, $thread & thrd ) {
+	lock      (cltr->thread_list_lock __cfaabi_dbg_ctx2);
+	cltr->nthreads += 1;
+	push_front(cltr->threads, thrd);
+	unlock    (cltr->thread_list_lock);
+}
+
+void unregister( cluster * cltr, $thread & thrd ) {
+	lock  (cltr->thread_list_lock __cfaabi_dbg_ctx2);
+	remove(cltr->threads, thrd );
+	cltr->nthreads -= 1;
+	unlock(cltr->thread_list_lock);
+}
+
+static void check( int ret, const char func[] ) {
+	if ( ret ) {										// pthread routines return errno values
+		abort( "%s : internal error, error(%d) %s.", func, ret, strerror( ret ) );
+	} // if
+} // check
+
+void * __create_pthread( pthread_t * pthread, void * (*start)(void *), void * arg ) {
+	pthread_attr_t attr;
+
+	check( pthread_attr_init( &attr ), "pthread_attr_init" ); // initialize attribute
+
+	size_t stacksize;
+	// default stack size, normally defined by shell limit
+	check( pthread_attr_getstacksize( &attr, &stacksize ), "pthread_attr_getstacksize" );
+	assert( stacksize >= PTHREAD_STACK_MIN );
+
+	void * stack;
+	__cfaabi_dbg_debug_do(
+		stack = memalign( __page_size, stacksize + __page_size );
+		// pthread has no mechanism to create the guard page in user supplied stack.
+		if ( mprotect( stack, __page_size, PROT_NONE ) == -1 ) {
+			abort( "mprotect : internal error, mprotect failure, error(%d) %s.", errno, strerror( errno ) );
+		} // if
+	);
+	__cfaabi_dbg_no_debug_do(
+		stack = malloc( stacksize );
+	);
+
+	check( pthread_attr_setstack( &attr, stack, stacksize ), "pthread_attr_setstack" );
+
+	check( pthread_create( pthread, &attr, start, arg ), "pthread_create" );
+	return stack;
+}
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 3ac8b9f795fc403158704c66c51faffe5be12842)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision e660761701e94bf2b2ae0130750e911d4122366f)
@@ -52,19 +52,4 @@
 
 
-
-struct event_kernel_t {
-	alarm_list_t alarms;
-	__spinlock_t lock;
-};
-
-extern event_kernel_t * event_kernel;
-
-struct __cfa_kernel_preemption_state_t {
-	bool enabled;
-	bool in_progress;
-	unsigned short disable_count;
-};
-
-extern volatile thread_local __cfa_kernel_preemption_state_t preemption_state __attribute__ ((tls_model ( "initial-exec" )));
 
 extern cluster * mainCluster;
@@ -104,6 +89,4 @@
 //-----------------------------------------------------------------------------
 // Utils
-#define KERNEL_STORAGE(T,X) __attribute((aligned(__alignof__(T)))) static char storage_##X[sizeof(T)]
-
 static inline uint64_t __tls_rand() {
 	#if defined(__SIZEOF_INT128__)
@@ -113,8 +96,4 @@
 	#endif
 }
-
-
-void doregister( struct cluster & cltr );
-void unregister( struct cluster & cltr );
 
 void doregister( struct cluster * cltr, struct $thread & thrd );
Index: libcfa/src/concurrency/preemption.cfa
===================================================================
--- libcfa/src/concurrency/preemption.cfa	(revision 3ac8b9f795fc403158704c66c51faffe5be12842)
+++ libcfa/src/concurrency/preemption.cfa	(revision e660761701e94bf2b2ae0130750e911d4122366f)
@@ -26,4 +26,5 @@
 
 #include "bits/signal.hfa"
+#include "kernel_private.hfa"
 
 #if !defined(__CFA_DEFAULT_PREEMPTION__)
@@ -60,4 +61,7 @@
 #error unknown hardware architecture
 #endif
+
+#warning duplicated in startup.cfa
+#define KERNEL_STORAGE(T,X) __attribute((aligned(__alignof__(T)))) static char storage_##X[sizeof(T)]
 
 KERNEL_STORAGE(event_kernel_t, event_kernel);         // private storage for event kernel
@@ -293,5 +297,5 @@
 // Startup routine to activate preemption
 // Called from kernel_startup
-void kernel_start_preemption() {
+void __kernel_alarm_startup() {
 	__cfaabi_dbg_print_safe( "Kernel : Starting preemption\n" );
 
@@ -315,5 +319,5 @@
 // Shutdown routine to deactivate preemption
 // Called from kernel_shutdown
-void kernel_stop_preemption() {
+void __kernel_alarm_shutdown() {
 	__cfaabi_dbg_print_safe( "Kernel : Preemption stopping\n" );
 
Index: libcfa/src/concurrency/preemption.hfa
===================================================================
--- libcfa/src/concurrency/preemption.hfa	(revision 3ac8b9f795fc403158704c66c51faffe5be12842)
+++ libcfa/src/concurrency/preemption.hfa	(revision e660761701e94bf2b2ae0130750e911d4122366f)
@@ -16,9 +16,14 @@
 #pragma once
 
+#include "bits/locks.hfa"
 #include "alarm.hfa"
-#include "kernel_private.hfa"
 
-void kernel_start_preemption();
-void kernel_stop_preemption();
+struct event_kernel_t {
+	alarm_list_t alarms;
+	__spinlock_t lock;
+};
+
+extern event_kernel_t * event_kernel;
+
 void update_preemption( processor * this, Duration duration );
 
