Index: libcfa/src/concurrency/kernel/fwd.hfa
===================================================================
--- libcfa/src/concurrency/kernel/fwd.hfa	(revision e660761701e94bf2b2ae0130750e911d4122366f)
+++ libcfa/src/concurrency/kernel/fwd.hfa	(revision e660761701e94bf2b2ae0130750e911d4122366f)
@@ -0,0 +1,81 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2020 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// kernel/fwd.hfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Thu Jul 30 16:46:41 2020
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#include "bits/defs.hfa"
+#include "bits/debug.hfa"
+
+#if !defined(__cforall_thread__)
+#error non-thread source file includes kernel/fwd.hfa
+#endif
+
+struct $thread;      // forward declaration; this header only uses pointers to it
+struct processor;    // forward declaration; this header only uses pointers to it
+struct cluster;      // forward declaration only
+
+#ifdef __cforall
+extern "C" {
+      extern "Cforall" { // per-kernel-thread (thread_local) runtime state; declared here, defined in kernel/startup.cfa
+		extern __attribute__((aligned(128))) thread_local struct KernelThreadData {
+			struct $thread    * volatile this_thread;     // value returned by active_thread()
+			struct processor  * volatile this_processor;  // set by __kernel_startup / __invoke_processor
+			struct __stats_t  * volatile this_stats;      // only assigned when __CFA_NO_STATISTICS__ is not defined
+
+			struct {
+				volatile unsigned short disable_count;  // initialised to 1 by the kernelTLS definition
+				volatile bool enabled;                  // initialised to false; checked by verify() around enable/disable_interrupts
+				volatile bool in_progress;              // initialised to false
+			} preemption_state;
+
+			#if defined(__SIZEOF_INT128__)
+				__uint128_t rand_seed;                  // 128-bit seed when the compiler provides __int128
+			#else
+				uint64_t rand_seed;                     // 64-bit seed otherwise
+			#endif
+		} kernelTLS __attribute__ ((tls_model ( "initial-exec" )));
+	}
+
+      #ifdef __ARM_ARCH
+            // function prototypes are only really used by these macros on ARM
+            void disable_global_interrupts();
+            void enable_global_interrupts();
+            // TL_GET yields the member's value; TL_SET expands to exactly one statement, so it is safe under an unbraced if/else
+            #define TL_GET( member ) ( { __typeof__( kernelTLS.member ) target; \
+                  disable_global_interrupts(); \
+                  target = kernelTLS.member; \
+                  enable_global_interrupts(); \
+                  target; } )
+            #define TL_SET( member, value ) do { disable_global_interrupts(); \
+                  kernelTLS.member = (value); \
+                  enable_global_interrupts(); } while( 0 )
+      #else
+            #define TL_GET( member ) (kernelTLS.member)
+            #define TL_SET( member, value ) do { kernelTLS.member = (value); } while( 0 )
+      #endif
+
+      extern void disable_interrupts();                          // declared with C linkage; defined outside this header
+      extern void enable_interrupts_noPoll();                    // declared with C linkage; defined outside this header
+	extern void enable_interrupts( __cfaabi_dbg_ctx_param );   // takes the debug-context parameter macro
+
+	enum __Preemption_Reason { __NO_PREEMPTION, __ALARM_PREEMPTION, __POLL_PREEMPTION, __MANUAL_PREEMPTION }; // argument type of force_yield()
+
+      extern "Cforall" { // Cforall-linkage thread operations; only active_thread() is defined here
+            extern void park( __cfaabi_dbg_ctx_param );
+            extern void unpark( struct $thread * this __cfaabi_dbg_ctx_param2 );
+            static inline struct $thread * active_thread () { return TL_GET( this_thread ); } // reads kernelTLS.this_thread through TL_GET
+
+            extern bool force_yield( enum __Preemption_Reason );
+      }
+}
+#endif
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision e660761701e94bf2b2ae0130750e911d4122366f)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision e660761701e94bf2b2ae0130750e911d4122366f)
@@ -0,0 +1,669 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2020 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// kernel/startup.cfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Thu Jul 30 15:12:54 2020
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#define __cforall_thread__
+
+// C Includes
+#include <errno.h>              // errno
+#include <string.h>             // strerror
+#include <unistd.h>             // sysconf
+extern "C" {
+      #include <limits.h>       // PTHREAD_STACK_MIN
+      #include <sys/mman.h>     // mprotect
+      #include <sys/resource.h> // getrlimit
+}
+
+// CFA Includes
+#include "kernel_private.hfa"
+#include "startup.hfa"          // STARTUP_PRIORITY_XXX
+
+//-----------------------------------------------------------------------------
+// Some assembly required: CtxGet( ctx ) stores the current stack pointer in ctx.SP and the frame-pointer register (ebp / rbp / r11) in ctx.FP
+#if defined( __i386 )
+	#define CtxGet( ctx )        \
+		__asm__ volatile (     \
+			"movl %%esp,%0\n"\
+			"movl %%ebp,%1\n"\
+			: "=rm" (ctx.SP),\
+				"=rm" (ctx.FP) \
+		)
+#elif defined( __x86_64 )
+	#define CtxGet( ctx )        \
+		__asm__ volatile (     \
+			"movq %%rsp,%0\n"\
+			"movq %%rbp,%1\n"\
+			: "=rm" (ctx.SP),\
+				"=rm" (ctx.FP) \
+		)
+#elif defined( __ARM_ARCH ) // volatile like the x86 variants: the asm has no inputs, so it must not be merged or moved by the compiler
+#define CtxGet( ctx ) __asm__ volatile ( \
+		"mov %0,%%sp\n"   \
+		"mov %1,%%r11\n"   \
+	: "=rm" (ctx.SP), "=rm" (ctx.FP) )
+#else
+	#error unknown hardware architecture
+#endif
+
+//-----------------------------------------------------------------------------
+// Kernel start/stop routines, registered as constructor/destructor with priority STARTUP_PRIORITY_KERNEL; declared first to make sure they run first
+static void __kernel_startup (void) __attribute__(( constructor( STARTUP_PRIORITY_KERNEL ) ));
+static void __kernel_shutdown(void) __attribute__(( destructor ( STARTUP_PRIORITY_KERNEL ) ));
+
+//-----------------------------------------------------------------------------
+// Static Forward Declarations (all defined later in this file)
+struct current_stack_info_t;
+
+static void * __invoke_processor(void * arg);                             // start routine handed to __create_pthread
+static void __kernel_first_resume( processor * this );                    // called once from __kernel_startup
+static void __kernel_last_resume ( processor * this );                    // called once from __kernel_shutdown
+static void init(processor & this, const char name[], cluster & _cltr);   // shared part of processor construction
+static void deinit(processor & this);                                     // shared part of processor destruction
+static void doregister( struct cluster & cltr );                          // add to __cfa_dbg_global_clusters
+static void unregister( struct cluster & cltr );                          // remove from __cfa_dbg_global_clusters
+static void ?{}( $coroutine & this, current_stack_info_t * info);
+static void ?{}( $thread & this, current_stack_info_t * info);
+static void ?{}(processorCtx_t & this) {}                                 // empty default constructor, defined right here
+static void ?{}(processorCtx_t & this, processor * proc, current_stack_info_t * info);
+
+//-----------------------------------------------------------------------------
+// Forward Declarations for other modules (called from __kernel_startup / __kernel_shutdown)
+extern void __kernel_alarm_startup(void);
+extern void __kernel_alarm_shutdown(void);
+extern void __kernel_io_startup (void);
+extern void __kernel_io_shutdown(void);
+
+//-----------------------------------------------------------------------------
+// Other Forward Declarations
+extern bool __wake_proc(processor *);                                     // used by the processor destructor after setting do_terminate
+
+//-----------------------------------------------------------------------------
+// Kernel storage: KERNEL_STORAGE(T,X) reserves a static char buffer storage_X of sizeof(T) bytes aligned for T; __kernel_startup constructs the objects in place
+#warning duplicated in preemption.cfa
+#define KERNEL_STORAGE(T,X) __attribute((aligned(__alignof__(T)))) static char storage_##X[sizeof(T)]
+KERNEL_STORAGE(cluster,	             mainCluster);
+KERNEL_STORAGE(processor,            mainProcessor);
+KERNEL_STORAGE($thread,	             mainThread);
+KERNEL_STORAGE(__stack_t,            mainThreadCtx);
+KERNEL_STORAGE(io_context,           mainPollerThread);
+KERNEL_STORAGE(__scheduler_RWLock_t, __scheduler_lock);
+#if !defined(__CFA_NO_STATISTICS__)
+KERNEL_STORAGE(__stats_t, mainProcStats);
+#endif
+
+cluster              * mainCluster;       // set to &storage_mainCluster in __kernel_startup
+processor            * mainProcessor;     // set to &storage_mainProcessor in __kernel_startup
+$thread              * mainThread;        // set to &storage_mainThread in __kernel_startup
+__scheduler_RWLock_t * __scheduler_lock;  // set to &storage___scheduler_lock in __kernel_startup
+
+extern "C" {
+	struct { __dllist_t(cluster) list; __spinlock_t lock; } __cfa_dbg_global_clusters; // list of registered clusters plus the spinlock guarding it
+}
+
+size_t __page_size = 0;                   // filled from sysconf( _SC_PAGESIZE ) in __kernel_startup
+
+//-----------------------------------------------------------------------------
+// Global state: definition of the thread-local kernelTLS declared in kernel/fwd.hfa (positional initialiser; rand_seed is not listed)
+thread_local struct KernelThreadData kernelTLS __attribute__ ((tls_model ( "initial-exec" ))) @= {
+	NULL,												// this_thread (cannot use 0p)
+	NULL,												// this_processor
+	NULL,												// this_stats
+	{ 1, false, false },								// preemption_state: disable_count = 1, enabled = false, in_progress = false
+};
+
+//-----------------------------------------------------------------------------
+// Struct to steal stack: describes the stack the caller is already running on so a coroutine/thread can adopt it
+struct current_stack_info_t {
+	__stack_t * storage;  // pointer to stack object; not set by the constructor, callers assign it
+	void * base;          // base of stack (the frame pointer captured by the constructor)
+	void * limit;         // stack grows towards stack limit (base minus the RLIMIT_STACK soft limit)
+	void * context;       // address of cfa_context_t (the constructor stores &storage_mainThreadCtx)
+};
+// Describe the calling stack: base = current frame pointer, limit = base - RLIMIT_STACK soft limit; 'storage' is left for the caller to set
+void ?{}( current_stack_info_t & this ) {
+	__stack_context_t ctx;
+	CtxGet( ctx );
+	this.base = ctx.FP;
+	// if getrlimit fails 'r' is uninitialised and the limit computed below would be garbage, so stop here
+	rlimit r;
+	if( getrlimit( RLIMIT_STACK, &r ) != 0 ) abort( "getrlimit : internal error, error(%d) %s.", errno, strerror( errno ) );
+	size_t size = r.rlim_cur;
+
+	this.limit = (void *)(((intptr_t)this.base) - size);
+	this.context = &storage_mainThreadCtx;
+}
+
+
+
+//=============================================================================================
+// Kernel Setup logic
+//=============================================================================================
+//-----------------------------------------------------------------------------
+// Kernel boot procedures: build the main cluster, thread and processor in the static storage above, start alarm and IO, then enable interrupts
+static void __kernel_startup(void) {
+	verify( ! kernelTLS.preemption_state.enabled );
+	__cfadbg_print_safe(runtime_core, "Kernel : Starting\n");
+
+	__page_size = sysconf( _SC_PAGESIZE );
+
+	__cfa_dbg_global_clusters.list{ __get };
+	__cfa_dbg_global_clusters.lock{};
+
+	// Initialize the global scheduler lock
+	__scheduler_lock = (__scheduler_RWLock_t*)&storage___scheduler_lock;
+	(*__scheduler_lock){};
+
+	// Initialize the main cluster
+	mainCluster = (cluster *)&storage_mainCluster;
+	(*mainCluster){"Main Cluster", 0};
+
+	__cfadbg_print_safe(runtime_core, "Kernel : Main cluster ready\n");
+
+	// Start by initializing the main thread
+	// SKULLDUGGERY: the mainThread steals the process main thread
+	// which will then be scheduled by the mainProcessor normally
+	mainThread = ($thread *)&storage_mainThread;
+	current_stack_info_t info;
+	info.storage = (__stack_t*)&storage_mainThreadCtx; // the default constructor above does not set 'storage'
+	(*mainThread){ &info };
+
+	__cfadbg_print_safe(runtime_core, "Kernel : Main thread ready\n");
+
+
+
+	// Construct the processor context of the main processor
+	void ?{}(processorCtx_t & this, processor * proc) {
+		(this.__cor){ "Processor" };
+		this.__cor.starter = 0p;
+		this.proc = proc;
+	}
+
+	void ?{}(processor & this) with( this ) { // local constructor used only for the main processor
+		( this.idle ){};
+		( this.terminated ){ 0 };
+		( this.runner ){};
+		init( this, "Main Processor", *mainCluster );
+		kernel_thread = pthread_self(); // the main processor uses the calling pthread; no __create_pthread here
+
+		runner{ &this };
+		__cfadbg_print_safe(runtime_core, "Kernel : constructed main processor context %p\n", &runner);
+	}
+
+	// Initialize the main processor and the main processor ctx
+	// (the coroutine that contains the processing control flow)
+	mainProcessor = (processor *)&storage_mainProcessor;
+	(*mainProcessor){};
+
+	//initialize the global state variables
+	kernelTLS.this_processor = mainProcessor;
+	kernelTLS.this_thread    = mainThread;
+
+	#if !defined( __CFA_NO_STATISTICS__ )
+		kernelTLS.this_stats = (__stats_t *)& storage_mainProcStats;
+		__init_stats( kernelTLS.this_stats );
+	#endif
+
+	// Enable preemption
+	__kernel_alarm_startup();
+
+	// Start IO
+	__kernel_io_startup();
+
+	// Add the main thread to the ready queue
+	// once resume is called on mainProcessor->runner the mainThread needs to be scheduled like any normal thread
+	__schedule_thread((__processor_id_t *)mainProcessor, mainThread);
+
+	// SKULLDUGGERY: Force a context switch to the main processor to set the main thread's context to the current UNIX
+	// context. Hence, the main thread does not begin through __cfactx_invoke_thread, like all other threads. The trick here is that
+	// mainThread is on the ready queue when this call is made.
+	__kernel_first_resume( kernelTLS.this_processor );
+
+
+	// THE SYSTEM IS NOW COMPLETELY RUNNING
+
+
+	// SKULLDUGGERY: The constructor for the mainCluster will call alloc with a dimension of 0
+	// malloc *can* return a non-null value, we should free it if that is the case
+	free( mainCluster->io.ctxs );
+
+	// Now that the system is up, finish creating systems that need threading
+	mainCluster->io.ctxs = (io_context *)&storage_mainPollerThread;
+	mainCluster->io.cnt  = 1;
+	(*mainCluster->io.ctxs){ *mainCluster };
+
+	__cfadbg_print_safe(runtime_core, "Kernel : Started\n--------------------------------------------------\n\n");
+
+	verify( ! kernelTLS.preemption_state.enabled );
+	enable_interrupts( __cfaabi_dbg_ctx );
+	verify( TL_GET( preemption_state.enabled ) );
+}
+// Kernel shutdown: stop the IO context, let the main processor terminate, then destroy the kernel objects in reverse order of construction
+static void __kernel_shutdown(void) {
+	//Before we start shutting things down, wait for systems that need threading to shutdown
+	^(*mainCluster->io.ctxs){};
+	mainCluster->io.cnt  = 0;
+	mainCluster->io.ctxs = 0p;
+
+	/* paranoid */ verify( TL_GET( preemption_state.enabled ) );
+	disable_interrupts();
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+
+	__cfadbg_print_safe(runtime_core, "\n--------------------------------------------------\nKernel : Shutting down\n");
+
+	// SKULLDUGGERY: Notify the mainProcessor it needs to terminate.
+	// When its coroutine terminates, it returns control to the mainThread
+	// which is currently here
+	__atomic_store_n(&mainProcessor->do_terminate, true, __ATOMIC_RELEASE);
+	__kernel_last_resume( kernelTLS.this_processor );
+	mainThread->self_cor.state = Halted;
+
+	// THE SYSTEM IS NOW COMPLETELY STOPPED
+
+	// Disable preemption
+	__kernel_alarm_shutdown();
+
+	// Stop IO
+	__kernel_io_shutdown();
+
+	// Destroy the main processor and its context in reverse order of construction
+	// These were manually constructed so we need to manually destroy them
+	void ^?{}(processor & this) with( this ){
+		deinit( this );
+
+		/* paranoid */ verify( this.do_terminate == true );
+		__cfadbg_print_safe(runtime_core, "Kernel : destroyed main processor context %p\n", &runner); // same macro/group as every other kernel message here
+	}
+
+	^(*mainProcessor){};
+
+	// Final step, destroy the main thread since it is no longer needed
+
+	// Since we provided a stack to this task it will not destroy anything
+	/* paranoid */ verify(mainThread->self_cor.stack.storage == (__stack_t*)(((uintptr_t)&storage_mainThreadCtx)| 0x1));
+	^(*mainThread){};
+
+	^(*mainCluster){};
+
+	^(*__scheduler_lock){};
+
+	^(__cfa_dbg_global_clusters.list){};
+	^(__cfa_dbg_global_clusters.lock){};
+
+	__cfadbg_print_safe(runtime_core, "Kernel : Shutdown complete\n");
+}
+
+//=============================================================================================
+// Kernel Initial Scheduling logic
+//=============================================================================================
+
+// Context invoker for processors
+// This is the entry point for processors (kernel threads) *except* for the main processor
+// It effectively constructs a coroutine by stealing the pthread stack; 'arg' is the processor, the return value is always 0p
+static void * __invoke_processor(void * arg) {
+	#if !defined( __CFA_NO_STATISTICS__ )
+		__stats_t local_stats;
+		__init_stats( &local_stats );
+		kernelTLS.this_stats = &local_stats;
+	#endif
+
+	processor * proc = (processor *) arg;
+	kernelTLS.this_processor = proc;
+	kernelTLS.this_thread    = 0p;
+	kernelTLS.preemption_state.[enabled, disable_count] = [false, 1];
+	// SKULLDUGGERY: We want to create a context for the processor coroutine
+	// which is needed for the 2-step context switch. However, there is no reason
+	// to waste the perfectly valid stack created by pthread.
+	current_stack_info_t info;
+	__stack_t ctx;
+	info.storage = &ctx;
+	(proc->runner){ proc, &info };
+
+	__cfadbg_print_safe(runtime_core, "Coroutine : created stack %p\n", get_coroutine(proc->runner)->stack.storage); // same macro/group as the other messages here
+
+	//Set global state
+	kernelTLS.this_thread = 0p;
+
+	//We now have a proper context from which to schedule threads
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p created (%p, %p)\n", proc, &proc->runner, &ctx);
+
+	// SKULLDUGGERY: Since the coroutine doesn't have its own stack, we can't
+	// resume it to start it like it normally would, it will just context switch
+	// back to here. Instead directly call the main since we already are on the
+	// appropriate stack.
+	get_coroutine(proc->runner)->state = Active;
+	main( proc->runner );
+	get_coroutine(proc->runner)->state = Halted;
+
+	// Main routine of the core returned, the core is now fully terminated
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p main ended (%p)\n", proc, &proc->runner);
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		__tally_stats(proc->cltr->stats, &local_stats);
+		if( 0 != proc->print_stats ) {
+			__print_stats( &local_stats, proc->print_stats, true, proc->name, (void*)proc );
+		}
+	#endif
+
+	return 0p;
+}
+// Switch from the main thread to the given processor's runner coroutine for the first time (called once by __kernel_startup)
+static void __kernel_first_resume( processor * this ) {
+	$thread * src = mainThread;
+	$coroutine * dst = get_coroutine(this->runner);
+
+	verify( ! kernelTLS.preemption_state.enabled );
+
+	kernelTLS.this_thread->curr_cor = dst;
+	__stack_prepare( &dst->stack, 65000 ); // NOTE(review): 65000 is presumably a stack size in bytes - confirm against __stack_prepare
+	__cfactx_start(main, dst, this->runner, __cfactx_invoke_coroutine);
+
+	verify( ! kernelTLS.preemption_state.enabled );
+
+	dst->last = &src->self_cor;
+	dst->starter = dst->starter ? dst->starter : &src->self_cor; // keep an existing starter, otherwise the main thread's coroutine becomes the starter
+
+	// make sure the current state is still correct
+	/* paranoid */ verify(src->state == Ready);
+
+	// context switch to specified coroutine
+	verify( dst->context.SP );
+	__cfactx_switch( &src->context, &dst->context );
+	// when __cfactx_switch returns we are back in the src coroutine
+
+	mainThread->curr_cor = &mainThread->self_cor;
+
+	// make sure the current state has been updated
+	/* paranoid */ verify(src->state == Active);
+
+	verify( ! kernelTLS.preemption_state.enabled );
+}
+
+// KERNEL_ONLY: final switch from the main thread's coroutine to the processor's runner (called once by __kernel_shutdown)
+static void __kernel_last_resume( processor * this ) {
+	$coroutine * src = &mainThread->self_cor;
+	$coroutine * dst = get_coroutine(this->runner);
+
+	verify( ! kernelTLS.preemption_state.enabled );
+	verify( dst->starter == src );
+	verify( dst->context.SP );
+
+	// SKULLDUGGERY: in debug, the processors check that the
+	// stack is still within the stack limits after running a thread.
+	// That check doesn't make sense if we context switch to the processor using the
+	// coroutine semantics. Since this is a special case, use the current context
+	// info to populate these fields.
+	__cfaabi_dbg_debug_do(
+		__stack_context_t ctx;
+		CtxGet( ctx );
+		mainThread->context.SP = ctx.SP;
+		mainThread->context.FP = ctx.FP;
+	)
+
+	// context switch to the processor
+	__cfactx_switch( &src->context, &dst->context );
+}
+
+
+//=============================================================================================
+// Kernel Object Constructors logic
+//=============================================================================================
+//-----------------------------------------------------------------------------
+// Main thread construction: build a coroutine on top of an already existing stack described by 'info'
+static void ?{}( $coroutine & this, current_stack_info_t * info) with( this ) {
+	stack.storage = info->storage;
+	with(*stack.storage) {
+		limit     = info->limit;
+		base      = info->base;
+	}
+	__attribute__((may_alias)) intptr_t * istorage = (intptr_t*) &stack.storage;
+	*istorage |= 0x1; // set the low bit of the stored pointer; __kernel_shutdown verifies this tagged value before destroying mainThread
+	name = "Main Thread";
+	state = Start;
+	starter = 0p;
+	last = 0p;
+	cancellation = 0p;
+}
+// Build a thread on an existing stack: its own coroutine adopts 'info', it joins mainCluster and starts out owning its own monitor
+static void ?{}( $thread & this, current_stack_info_t * info) with( this ) {
+	ticket = 1;
+	state = Start;
+	self_cor{ info };
+	curr_cor = &self_cor;
+	curr_cluster = mainCluster;
+	self_mon.owner = &this;
+	self_mon.recursion = 1;
+	self_mon_p = &self_mon;
+	link.next = 0p;
+	link.prev = 0p;
+
+	node.next = 0p;
+	node.prev = 0p;
+	doregister(curr_cluster, this); // adds the thread to the cluster's thread list and bumps nthreads
+
+	monitors{ &self_mon_p, 1, (fptr_t)0 };
+}
+
+//-----------------------------------------------------------------------------
+// Processor
+// Construct the processor context of non-main processors: the coroutine adopts the stack described by 'info'
+static void ?{}(processorCtx_t & this, processor * proc, current_stack_info_t * info) {
+	(this.__cor){ info };
+	this.proc = proc;
+}
+// Common processor set-up: reset the fields, count the processor in its cluster, register it for an id and grow the ready queue
+static void init(processor & this, const char name[], cluster & _cltr) with( this ) {
+	this.name = name;
+	this.cltr = &_cltr;
+	id = -1u; // placeholder until doregister() below returns the real id
+	destroyer = 0p;
+	do_terminate = false;
+	preemption_alarm = 0p;
+	pending_preemption = false;
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		print_stats = 0;
+		print_halts = false;
+	#endif
+
+	int target = __atomic_add_fetch( &cltr->nprocessors, 1u, __ATOMIC_SEQ_CST ); // new processor count, used as the ready-queue target
+
+	id = doregister((__processor_id_t*)&this);
+
+	// Lock the RWlock so no-one pushes/pops while we are changing the queue
+	uint_fast32_t last_size = ready_mutate_lock();
+
+		// Adjust the ready queue size
+		ready_queue_grow( cltr, target );
+
+	// Unlock the RWlock
+	ready_mutate_unlock( last_size );
+
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p created\n", &this);
+}
+
+// Counterpart of init(): detaches the processor from its cluster; it prepares destruction but destroys no member
+static void deinit(processor & this) {
+	cluster * home = this.cltr;
+	int remaining = __atomic_sub_fetch( &home->nprocessors, 1u, __ATOMIC_SEQ_CST );
+
+	// Take the RW-lock so that nobody pushes/pops while the queue is being resized
+	uint_fast32_t token = ready_mutate_lock();
+	{
+		// Shrink the ready queue to the new processor count
+		ready_queue_shrink( home, remaining );
+
+		// Take this processor off the idle queue, in case it is on it
+		unsafe_remove( home->idles, &this );
+	}
+	// Release the RW-lock
+	ready_mutate_unlock( token );
+
+	// Last step: undo the doregister() performed in init()
+	unregister((__processor_id_t*)&this);
+}
+// Public processor constructor: runs the common init() and then starts a kernel thread executing __invoke_processor
+void ?{}(processor & this, const char name[], cluster & _cltr) {
+	( this.idle ){};
+	( this.terminated ){ 0 };
+	( this.runner ){};
+	init( this, name, _cltr );
+
+	__cfadbg_print_safe(runtime_core, "Kernel : Starting core %p\n", &this);
+
+	this.stack = __create_pthread( &this.kernel_thread, __invoke_processor, (void *)&this ); // keep the stack pointer; the destructor frees it
+
+}
+// Processor destructor: request termination if not already requested, join the kernel thread, free its stack, then deinit()
+void ^?{}(processor & this) with( this ){
+	if( ! __atomic_load_n(&do_terminate, __ATOMIC_ACQUIRE) ) {
+		__cfadbg_print_safe(runtime_core, "Kernel : core %p signaling termination\n", &this);
+
+		__atomic_store_n(&do_terminate, true, __ATOMIC_RELAXED); // NOTE(review): RELAXED here, while __kernel_shutdown stores the same flag with RELEASE - confirm intended
+		__wake_proc( &this );
+
+		P( terminated );
+		verify( kernelTLS.this_processor != &this);
+	}
+
+	int err = pthread_join( kernel_thread, 0p );
+	if( err != 0 ) abort("KERNEL ERROR: joining processor %p caused error %s\n", &this, strerror(err));
+
+	free( this.stack ); // stack returned by __create_pthread in the constructor
+
+	deinit( this );
+}
+
+//-----------------------------------------------------------------------------
+// Cluster constructor: initialise the fields, register the cluster globally, size its ready queue for 0 processors and build num_io IO contexts
+void ?{}(cluster & this, const char name[], Duration preemption_rate, unsigned num_io, const io_context_params & io_params) with( this ) {
+	this.name = name;
+	this.preemption_rate = preemption_rate;
+	this.nprocessors = 0;
+	ready_queue{};
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		print_stats = 0;
+		stats = alloc();
+		__init_stats( stats );
+	#endif
+
+	threads{ __get };
+
+	doregister(this);
+
+	// Lock the RWlock so no-one pushes/pops while we are changing the queue
+	uint_fast32_t last_size = ready_mutate_lock();
+
+		// Adjust the ready queue size
+		ready_queue_grow( &this, 0 );
+
+	// Unlock the RWlock
+	ready_mutate_unlock( last_size );
+
+	this.io.cnt  = num_io;
+	this.io.ctxs = aalloc(num_io); // may be called with num_io == 0; __kernel_startup frees that result for the main cluster
+	for(i; this.io.cnt) {
+		(this.io.ctxs[i]){ this, io_params };
+	}
+}
+// Cluster destructor: tear down the IO contexts, empty the ready queue, optionally print statistics, then unregister the cluster
+void ^?{}(cluster & this) {
+	for(slot; this.io.cnt) {
+		^(this.io.ctxs[slot]){ true };
+	}
+	free(this.io.ctxs);
+
+	// Take the RW-lock so that nobody pushes/pops while the queue is being resized
+	uint_fast32_t token = ready_mutate_lock();
+	{
+		// Shrink the ready queue down to nothing
+		ready_queue_shrink( &this, 0 );
+	}
+	// Release the RW-lock
+	ready_mutate_unlock( token );
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		if( this.print_stats != 0 ) {
+			__print_stats( this.stats, this.print_stats, true, this.name, (void*)&this );
+		}
+		free( this.stats );
+	#endif
+
+	unregister(this);
+}
+
+//=============================================================================================
+// Miscellaneous Initialization
+//=============================================================================================
+//-----------------------------------------------------------------------------
+// Global Queues: push the cluster onto the front of __cfa_dbg_global_clusters.list while holding its spinlock
+static void doregister( cluster     & cltr ) {
+	lock      ( __cfa_dbg_global_clusters.lock __cfaabi_dbg_ctx2);
+	push_front( __cfa_dbg_global_clusters.list, cltr );
+	unlock    ( __cfa_dbg_global_clusters.lock );
+}
+// Remove the cluster from __cfa_dbg_global_clusters.list while holding its spinlock
+static void unregister( cluster     & cltr ) {
+	lock  ( __cfa_dbg_global_clusters.lock __cfaabi_dbg_ctx2);
+	remove( __cfa_dbg_global_clusters.list, cltr );
+	unlock( __cfa_dbg_global_clusters.lock );
+}
+// Add a thread to the cluster's thread list and increment nthreads, under the cluster's thread_list_lock
+void doregister( cluster * cltr, $thread & thrd ) {
+	lock      (cltr->thread_list_lock __cfaabi_dbg_ctx2);
+	cltr->nthreads += 1;
+	push_front(cltr->threads, thrd);
+	unlock    (cltr->thread_list_lock);
+}
+// Remove a thread from the cluster's thread list and decrement nthreads, under the cluster's thread_list_lock
+void unregister( cluster * cltr, $thread & thrd ) {
+	lock  (cltr->thread_list_lock __cfaabi_dbg_ctx2);
+	remove(cltr->threads, thrd );
+	cltr->nthreads -= 1;
+	unlock(cltr->thread_list_lock);
+}
+// Abort with a diagnostic naming 'func' when 'ret' is non-zero; return silently when it is zero
+static void check( int ret, const char func[] ) {
+	// pthread routines return errno values
+	if ( ret == 0 ) return;
+	abort( "%s : internal error, error(%d) %s.", func, ret, strerror( ret ) );
+} // check
+// Start a pthread running start( arg ) on a stack allocated here; returns that stack so the caller can free it after joining the thread
+void * __create_pthread( pthread_t * pthread, void * (*start)(void *), void * arg ) {
+	pthread_attr_t attr;
+
+	check( pthread_attr_init( &attr ), "pthread_attr_init" ); // initialize attribute
+
+	size_t stacksize;
+	// default stack size, normally defined by shell limit
+	check( pthread_attr_getstacksize( &attr, &stacksize ), "pthread_attr_getstacksize" );
+	assert( stacksize >= PTHREAD_STACK_MIN );
+
+	void * stack;
+	__cfaabi_dbg_debug_do(
+		stack = memalign( __page_size, stacksize + __page_size );
+		// pthread has no mechanism to create the guard page in user supplied stack.
+		if ( mprotect( stack, __page_size, PROT_NONE ) == -1 ) {
+			abort( "mprotect : internal error, mprotect failure, error(%d) %s.", errno, strerror( errno ) );
+		} // if
+	);
+	__cfaabi_dbg_no_debug_do(
+		stack = malloc( stacksize );
+	);
+
+	check( pthread_attr_setstack( &attr, stack, stacksize ), "pthread_attr_setstack" );
+	check( pthread_create( pthread, &attr, start, arg ), "pthread_create" );
+	check( pthread_attr_destroy( &attr ), "pthread_attr_destroy" ); // the attribute object is not needed once the thread exists
+	return stack;
+}