Index: libcfa/src/concurrency/alarm.c
===================================================================
--- libcfa/src/concurrency/alarm.c	(revision bf71cfdb7285490eee552b461158846f626cc52f)
+++ 	(revision )
@@ -1,179 +1,0 @@
-//
-// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
-//
-// The contents of this file are covered under the licence agreement in the
-// file "LICENCE" distributed with Cforall.
-//
-// alarm.c --
-//
-// Author           : Thierry Delisle
-// Created On       : Fri Jun 2 11:31:25 2017
-// Last Modified By : Peter A. Buhr
-// Last Modified On : Fri May 25 06:25:47 2018
-// Update Count     : 67
-//
-
-extern "C" {
-#include <errno.h>
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/time.h>
-}
-
-#include "alarm.h"
-#include "kernel_private.h"
-#include "preemption.h"
-
-//=============================================================================================
-// Clock logic
-//=============================================================================================
-
-Time __kernel_get_time() {
-	timespec curr;
-	clock_gettime( CLOCK_MONOTONIC_RAW, &curr );		// CLOCK_REALTIME
-	return (Time){ curr };
-}
-
-void __kernel_set_timer( Duration alarm ) {
-	verifyf(alarm >= 1`us || alarm == 0, "Setting timer to < 1us (%jins)", alarm.tv);
-	setitimer( ITIMER_REAL, &(itimerval){ alarm }, NULL );
-}
-
-//=============================================================================================
-// Alarm logic
-//=============================================================================================
-
-void ?{}( alarm_node_t & this, thread_desc * thrd, Time alarm, Duration period ) with( this ) {
-	this.thrd = thrd;
-	this.alarm = alarm;
-	this.period = period;
-	next = 0;
-	set = false;
-	kernel_alarm = false;
-}
-
-void ?{}( alarm_node_t & this, processor   * proc, Time alarm, Duration period ) with( this ) {
-	this.proc = proc;
-	this.alarm = alarm;
-	this.period = period;
-	next = 0;
-	set = false;
-	kernel_alarm = true;
-}
-
-void ^?{}( alarm_node_t & this ) {
-	if( this.set ) {
-		unregister_self( &this );
-	}
-}
-
-#if !defined(NDEBUG) && (defined(__CFA_DEBUG__) || defined(__CFA_VERIFY__))
-bool validate( alarm_list_t * this ) {
-	alarm_node_t ** it = &this->head;
-	while( (*it) ) {
-		it = &(*it)->next;
-	}
-
-	return it == this->tail;
-}
-#endif
-
-static inline void insert_at( alarm_list_t * this, alarm_node_t * n, __alarm_it_t p ) {
-	verify( !n->next );
-	if( p == this->tail ) {
-		this->tail = &n->next;
-	}
-	else {
-		n->next = *p;
-	}
-	*p = n;
-
-	verify( validate( this ) );
-}
-
-void insert( alarm_list_t * this, alarm_node_t * n ) {
-	alarm_node_t ** it = &this->head;
-	while( (*it) && (n->alarm > (*it)->alarm) ) {
-		it = &(*it)->next;
-	}
-
-	insert_at( this, n, it );
-
-	verify( validate( this ) );
-}
-
-alarm_node_t * pop( alarm_list_t * this ) {
-	alarm_node_t * head = this->head;
-	if( head ) {
-		this->head = head->next;
-		if( !head->next ) {
-			this->tail = &this->head;
-		}
-		head->next = NULL;
-	}
-	verify( validate( this ) );
-	return head;
-}
-
-static inline void remove_at( alarm_list_t * this, alarm_node_t * n, __alarm_it_t it ) {
-	verify( it );
-	verify( (*it) == n );
-
-	(*it) = n->next;
-	if( !n-> next ) {
-		this->tail = it;
-	}
-	n->next = NULL;
-
-	verify( validate( this ) );
-}
-
-static inline void remove( alarm_list_t * this, alarm_node_t * n ) {
-	alarm_node_t ** it = &this->head;
-	while( (*it) && (*it) != n ) {
-		it = &(*it)->next;
-	}
-
-	verify( validate( this ) );
-
-	if( *it ) { remove_at( this, n, it ); }
-
-	verify( validate( this ) );
-}
-
-void register_self( alarm_node_t * this ) {
-	alarm_list_t * alarms = &event_kernel->alarms;
-
-	disable_interrupts();
-	lock( event_kernel->lock __cfaabi_dbg_ctx2 );
-	{
-		verify( validate( alarms ) );
-		bool first = !alarms->head;
-
-		insert( alarms, this );
-		if( first ) {
-			__kernel_set_timer( alarms->head->alarm - __kernel_get_time() );
-		}
-	}
-	unlock( event_kernel->lock );
-	this->set = true;
-	enable_interrupts( __cfaabi_dbg_ctx );
-}
-
-void unregister_self( alarm_node_t * this ) {
-	disable_interrupts();
-	lock( event_kernel->lock __cfaabi_dbg_ctx2 );
-	{
-		verify( validate( &event_kernel->alarms ) );
-		remove( &event_kernel->alarms, this );
-	}
-	unlock( event_kernel->lock );
-	enable_interrupts( __cfaabi_dbg_ctx );
-	this->set = false;
-}
-
-// Local Variables: //
-// mode: c //
-// tab-width: 4 //
-// End: //
Index: libcfa/src/concurrency/alarm.cfa
===================================================================
--- libcfa/src/concurrency/alarm.cfa	(revision 4dcaed2c212a42d633824e9b88053258a2d0f969)
+++ libcfa/src/concurrency/alarm.cfa	(revision 4dcaed2c212a42d633824e9b88053258a2d0f969)
@@ -0,0 +1,179 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// alarm.cfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Fri Jun 2 11:31:25 2017
+// Last Modified By : Peter A. Buhr
+// Last Modified On : Fri May 25 06:25:47 2018
+// Update Count     : 67
+//
+
+extern "C" {
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/time.h>
+}
+
+#include "alarm.h"
+#include "kernel_private.h"
+#include "preemption.h"
+
+//=============================================================================================
+// Clock logic
+//=============================================================================================
+
+Time __kernel_get_time() {
+	timespec now;									// filled in by clock_gettime below
+	clock_gettime( CLOCK_MONOTONIC_RAW, &now );		// raw monotonic clock (CLOCK_REALTIME is presumably the noted alternative)
+	return (Time){ now };							// build the Time result from the timespec reading
+}
+
+void __kernel_set_timer( Duration alarm ) {
+	verifyf(alarm >= 1`us || alarm == 0, "Setting timer to < 1us (%jins)", alarm.tv);	// accepted values: exactly zero, or at least one microsecond
+	setitimer( ITIMER_REAL, &(itimerval){ alarm }, NULL );	// NOTE(review): the setitimer result is ignored; old timer value is not requested (NULL)
+}
+
+//=============================================================================================
+// Alarm logic
+//=============================================================================================
+
+void ?{}( alarm_node_t & this, thread_desc * thrd, Time alarm, Duration period ) with( this ) {
+	this.next = 0;					// not linked into any alarm list yet
+	this.set = false;				// not registered yet
+	this.kernel_alarm = false;		// thread-owned node; the processor overload stores true here
+	this.thrd = thrd;
+	this.alarm = alarm;
+	this.period = period;
+}
+
+void ?{}( alarm_node_t & this, processor   * proc, Time alarm, Duration period ) with( this ) {
+	this.next = 0;					// not linked into any alarm list yet
+	this.set = false;				// not registered yet
+	this.kernel_alarm = true;		// processor-owned node; the thread overload stores false here
+	this.proc = proc;
+	this.alarm = alarm;
+	this.period = period;
+}
+
+void ^?{}( alarm_node_t & this ) {
+	if( this.set ) {					// flag raised by register_self: the node is still registered
+		unregister_self( &this );		// take the node off the kernel alarm list before it goes away
+	}
+}
+
+#if !defined(NDEBUG) && (defined(__CFA_DEBUG__) || defined(__CFA_VERIFY__))
+bool validate( alarm_list_t * this ) {
+	// Consistency check: follow every next-link starting at head;
+	// the terminating null link must be the slot that tail designates.
+	alarm_node_t ** link = &this->head;
+	for( ; (*link); link = &(*link)->next ) {}
+
+	return this->tail == link;
+}
+#endif
+
+static inline void insert_at( alarm_list_t * this, alarm_node_t * n, __alarm_it_t p ) {
+	verify( !n->next );						// n must not already carry a successor
+	if( p != this->tail ) {
+		n->next = *p;						// interior slot: n adopts the current occupant as its successor
+	}
+	else {
+		this->tail = &n->next;				// appending: n's own link becomes the new tail slot
+	}
+	*p = n;									// make the chosen link refer to n
+
+	verify( validate( this ) );
+}
+
+void insert( alarm_list_t * this, alarm_node_t * n ) {
+	// Keep the list ordered by alarm time: advance past every node whose alarm
+	// is strictly earlier than n's, then splice n in at the link that was reached.
+	alarm_node_t ** slot = &this->head;
+	for( ; (*slot) && (n->alarm > (*slot)->alarm); slot = &(*slot)->next ) {}
+
+	insert_at( this, n, slot );
+
+	verify( validate( this ) );
+}
+
+alarm_node_t * pop( alarm_list_t * this ) {
+	// detach and return the node at the head of the list, or NULL when the list is empty
+	alarm_node_t * first = this->head;
+	if( first ) {
+		alarm_node_t * rest = first->next;
+		this->head = rest;
+		if( !rest ) this->tail = &this->head;	// list became empty: tail goes back to the head slot
+		first->next = NULL;						// hand the node back unlinked
+	}
+	verify( validate( this ) );
+	return first;
+}
+
+static inline void remove_at( alarm_list_t * this, alarm_node_t * n, __alarm_it_t it ) {
+	verify( it );
+	verify( (*it) == n );					// it must be the link that currently refers to n
+
+	// splice n out: its incoming link now refers to whatever followed n
+	alarm_node_t * successor = n->next;
+	(*it) = successor;
+	if( !successor ) this->tail = it;		// n was last: its incoming link is the new tail slot
+	n->next = NULL;							// n leaves the list unlinked
+
+	verify( validate( this ) );
+}
+
+static inline void remove( alarm_list_t * this, alarm_node_t * n ) {
+	// search for the link that refers to n; the scan ends on the null tail link when n is absent
+	alarm_node_t ** link = &this->head;
+	for( ; (*link) && (*link) != n; link = &(*link)->next ) {}
+
+	verify( validate( this ) );
+
+	if( *link ) {							// found: unlink it; a node that is not on the list is ignored
+		remove_at( this, n, link );
+	}
+	verify( validate( this ) );
+}
+
+void register_self( alarm_node_t * this ) {
+	alarm_list_t * alarms = &event_kernel->alarms;
+	// Interrupts stay disabled for as long as the kernel alarm list is locked and modified.
+	disable_interrupts();
+	lock( event_kernel->lock __cfaabi_dbg_ctx2 );
+	{
+		verify( validate( alarms ) );
+		// The list is ordered by alarm time, so the timer has to be re-armed whenever this node
+		// ends up at the head -- not only when the list was empty before the insertion.
+		insert( alarms, this );
+		if( alarms->head == this ) {
+			__kernel_set_timer( this->alarm - __kernel_get_time() );
+		}
+	}
+	unlock( event_kernel->lock );
+	this->set = true;					// mark as registered; the alarm_node_t destructor tests this flag
+	enable_interrupts( __cfaabi_dbg_ctx );
+}
+
+void unregister_self( alarm_node_t * this ) {
+	disable_interrupts();								// interrupts stay off while the kernel alarm list lock is held
+	lock( event_kernel->lock __cfaabi_dbg_ctx2 );
+	{
+		verify( validate( &event_kernel->alarms ) );
+		remove( &event_kernel->alarms, this );			// has no effect when the node is not on the list
+	}
+	unlock( event_kernel->lock );
+	enable_interrupts( __cfaabi_dbg_ctx );
+	this->set = false;									// NOTE(review): cleared after interrupts are re-enabled, whereas register_self sets it before -- confirm intended
+}
+
+// Local Variables: //
+// mode: c //
+// tab-width: 4 //
+// End: //
Index: libcfa/src/concurrency/coroutine.c
===================================================================
--- libcfa/src/concurrency/coroutine.c	(revision bf71cfdb7285490eee552b461158846f626cc52f)
+++ 	(revision )
@@ -1,183 +1,0 @@
-//
-// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
-//
-// The contents of this file are covered under the licence agreement in the
-// file "LICENCE" distributed with Cforall.
-//
-// coroutine.c --
-//
-// Author           : Thierry Delisle
-// Created On       : Mon Nov 28 12:27:26 2016
-// Last Modified By : Peter A. Buhr
-// Last Modified On : Fri Mar 30 17:20:57 2018
-// Update Count     : 9
-//
-
-#include "coroutine"
-
-extern "C" {
-#include <stddef.h>
-#include <malloc.h>
-#include <errno.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/mman.h>
-}
-
-#include "kernel_private.h"
-
-#define __CFA_INVOKE_PRIVATE__
-#include "invoke.h"
-
-//-----------------------------------------------------------------------------
-// Global state variables
-
-// minimum feasible stack size in bytes
-#define MinStackSize 1000
-static size_t pageSize = 0;				// architecture pagesize HACK, should go in proper runtime singleton
-
-//-----------------------------------------------------------------------------
-// Coroutine ctors and dtors
-void ?{}( coStack_t & this, void * storage, size_t storageSize ) with( this ) {
-      size		 = storageSize == 0 ? 65000 : storageSize; // size of stack
-      this.storage = storage;                                // pointer to stack
-      limit		 = NULL;                                   // stack grows towards stack limit
-      base		 = NULL;                                   // base of stack
-      context	 = NULL;                                   // address of cfa_context_t
-      top		 = NULL;                                   // address of top of storage
-      userStack	 = storage != NULL;
-}
-
-void ^?{}(coStack_t & this) {
-      if ( ! this.userStack && this.storage ) {
-            __cfaabi_dbg_debug_do(
-                  if ( mprotect( this.storage, pageSize, PROT_READ | PROT_WRITE ) == -1 ) {
-                        abort( "(coStack_t *)%p.^?{}() : internal error, mprotect failure, error(%d) %s.", &this, errno, strerror( errno ) );
-                  }
-            );
-            free( this.storage );
-      }
-}
-
-void ?{}( coroutine_desc & this, const char * name, void * storage, size_t storageSize ) with( this ) {
-      (this.stack){storage, storageSize};
-      this.name = name;
-      errno_ = 0;
-      state = Start;
-      starter = NULL;
-      last = NULL;
-}
-
-void ^?{}(coroutine_desc& this) {}
-
-// Part of the Public API
-// Not inline since only ever called once per coroutine
-forall(dtype T | is_coroutine(T))
-void prime(T& cor) {
-      coroutine_desc* this = get_coroutine(cor);
-      assert(this->state == Start);
-
-      this->state = Primed;
-      resume(cor);
-}
-
-// Wrapper for co
-void CoroutineCtxSwitch(coroutine_desc* src, coroutine_desc* dst) {
-      // Safety note : This could cause some false positives due to preemption
-      verify( TL_GET( preemption_state.enabled ) || TL_GET( this_processor )->do_terminate );
-      disable_interrupts();
-
-      // set state of current coroutine to inactive
-      src->state = src->state == Halted ? Halted : Inactive;
-
-      // set new coroutine that task is executing
-      kernelTLS.this_coroutine = dst;
-
-      // context switch to specified coroutine
-      assert( src->stack.context );
-      CtxSwitch( src->stack.context, dst->stack.context );
-      // when CtxSwitch returns we are back in the src coroutine
-
-      // set state of new coroutine to active
-      src->state = Active;
-
-      enable_interrupts( __cfaabi_dbg_ctx );
-      // Safety note : This could cause some false positives due to preemption
-      verify( TL_GET( preemption_state.enabled ) || TL_GET( this_processor )->do_terminate );
-} //ctxSwitchDirect
-
-void create_stack( coStack_t* this, unsigned int storageSize ) with( *this ) {
-      //TEMP HACK do this on proper kernel startup
-      if(pageSize == 0ul) pageSize = sysconf( _SC_PAGESIZE );
-
-      size_t cxtSize = libCeiling( sizeof(machine_context_t), 8 ); // minimum alignment
-
-      if ( !storage ) {
-            __cfaabi_dbg_print_safe("Kernel : Creating stack of size %zu for stack obj %p\n", cxtSize + size + 8, this);
-
-            userStack = false;
-            size = libCeiling( storageSize, 16 );
-            // use malloc/memalign because "new" raises an exception for out-of-memory
-
-            // assume malloc has 8 byte alignment so add 8 to allow rounding up to 16 byte alignment
-            __cfaabi_dbg_debug_do( storage = memalign( pageSize, cxtSize + size + pageSize ) );
-            __cfaabi_dbg_no_debug_do( storage = malloc( cxtSize + size + 8 ) );
-
-            __cfaabi_dbg_debug_do(
-                  if ( mprotect( storage, pageSize, PROT_NONE ) == -1 ) {
-                        abort( "(uMachContext &)%p.createContext() : internal error, mprotect failure, error(%d) %s.", this, (int)errno, strerror( (int)errno ) );
-                  } // if
-            );
-
-            if ( (intptr_t)storage == 0 ) {
-                  abort( "Attempt to allocate %zd bytes of storage for coroutine or task execution-state but insufficient memory available.", size );
-            } // if
-
-            __cfaabi_dbg_debug_do( limit = (char *)storage + pageSize );
-            __cfaabi_dbg_no_debug_do( limit = (char *)libCeiling( (unsigned long)storage, 16 ) ); // minimum alignment
-
-      } else {
-            __cfaabi_dbg_print_safe("Kernel : stack obj %p using user stack %p(%u bytes)\n", this, storage, storageSize);
-
-            assertf( ((size_t)storage & (libAlign() - 1)) == 0ul, "Stack storage %p for task/coroutine must be aligned on %d byte boundary.", storage, (int)libAlign() );
-            userStack = true;
-            size = storageSize - cxtSize;
-
-            if ( size % 16 != 0u ) size -= 8;
-
-            limit = (char *)libCeiling( (unsigned long)storage, 16 ); // minimum alignment
-      } // if
-      assertf( size >= MinStackSize, "Stack size %zd provides less than minimum of %d bytes for a stack.", size, MinStackSize );
-
-      base = (char *)limit + size;
-      context = base;
-      top = (char *)context + cxtSize;
-}
-
-// We need to call suspend from invoke.c, so we expose this wrapper that
-// is not inline (We can't inline Cforall in C)
-extern "C" {
-      void __suspend_internal(void) {
-            suspend();
-      }
-
-      void __leave_coroutine(void) {
-            coroutine_desc * src = TL_GET( this_coroutine ); // optimization
-
-            assertf( src->starter != 0,
-                  "Attempt to suspend/leave coroutine \"%.256s\" (%p) that has never been resumed.\n"
-                  "Possible cause is a suspend executed in a member called by a coroutine user rather than by the coroutine main.",
-                  src->name, src );
-            assertf( src->starter->state != Halted,
-                  "Attempt by coroutine \"%.256s\" (%p) to suspend/leave back to terminated coroutine \"%.256s\" (%p).\n"
-                  "Possible cause is terminated coroutine's main routine has already returned.",
-                  src->name, src, src->starter->name, src->starter );
-
-            CoroutineCtxSwitch( src, src->starter );
-      }
-}
-
-// Local Variables: //
-// mode: c //
-// tab-width: 4 //
-// End: //
Index: libcfa/src/concurrency/coroutine.cfa
===================================================================
--- libcfa/src/concurrency/coroutine.cfa	(revision 4dcaed2c212a42d633824e9b88053258a2d0f969)
+++ libcfa/src/concurrency/coroutine.cfa	(revision 4dcaed2c212a42d633824e9b88053258a2d0f969)
@@ -0,0 +1,183 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// coroutine.cfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Mon Nov 28 12:27:26 2016
+// Last Modified By : Peter A. Buhr
+// Last Modified On : Fri Mar 30 17:20:57 2018
+// Update Count     : 9
+//
+
+#include "coroutine"
+
+extern "C" {
+#include <stddef.h>
+#include <malloc.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+}
+
+#include "kernel_private.h"
+
+#define __CFA_INVOKE_PRIVATE__
+#include "invoke.h"
+
+//-----------------------------------------------------------------------------
+// Global state variables
+
+// minimum feasible stack size in bytes
+#define MinStackSize 1000
+static size_t pageSize = 0;				// architecture pagesize HACK, should go in proper runtime singleton
+
+//-----------------------------------------------------------------------------
+// Coroutine ctors and dtors
+void ?{}( coStack_t & this, void * storage, size_t storageSize ) with( this ) {
+      size		 = storageSize == 0 ? 65000 : storageSize; // size of stack; a request of 0 selects the 65000-byte default
+      this.storage = storage;                                // pointer to stack (may be NULL; create_stack then allocates)
+      limit		 = NULL;                                   // stack grows towards stack limit (assigned in create_stack)
+      base		 = NULL;                                   // base of stack (assigned in create_stack)
+      context	 = NULL;                                   // address of cfa_context_t (assigned in create_stack)
+      top		 = NULL;                                   // address of top of storage (assigned in create_stack)
+      userStack	 = storage != NULL;                        // true when the caller supplied the storage
+}
+
+void ^?{}(coStack_t & this) {
+      if ( ! this.userStack && this.storage ) {   // only release storage that exists and was not supplied by the user
+            __cfaabi_dbg_debug_do(                // create_stack marks the first page PROT_NONE under this same wrapper; make it accessible again before freeing
+                  if ( mprotect( this.storage, pageSize, PROT_READ | PROT_WRITE ) == -1 ) {
+                        abort( "(coStack_t *)%p.^?{}() : internal error, mprotect failure, error(%d) %s.", &this, errno, strerror( errno ) );
+                  }
+            );
+            free( this.storage );
+      }
+}
+
+void ?{}( coroutine_desc & this, const char * name, void * storage, size_t storageSize ) with( this ) {
+      (this.stack){storage, storageSize};   // construct the embedded stack descriptor from the given storage and size
+      this.name = name;
+      errno_ = 0;
+      state = Start;                        // initial state; prime() asserts it
+      starter = NULL;                       // __leave_coroutine asserts this is non-null before switching back
+      last = NULL;
+}
+
+void ^?{}(coroutine_desc& this) {}	// empty body: no explicit cleanup is performed here
+
+// Part of the Public API
+// Not inline since only ever called once per coroutine
+forall(dtype T | is_coroutine(T))
+void prime(T& cor) {
+      coroutine_desc* this = get_coroutine(cor);
+      assert(this->state == Start);      // only a coroutine still in its initial state may be primed
+
+      this->state = Primed;              // record the priming before resuming the coroutine
+      resume(cor);
+}
+
+// Wrapper around CtxSwitch: switches from coroutine src to coroutine dst with interrupts disabled
+void CoroutineCtxSwitch(coroutine_desc* src, coroutine_desc* dst) {
+      // Safety note : This could cause some false positives due to preemption
+      verify( TL_GET( preemption_state.enabled ) || TL_GET( this_processor )->do_terminate );
+      disable_interrupts();
+
+      // mark the coroutine being left as inactive, unless it has already halted
+      src->state = src->state == Halted ? Halted : Inactive;
+
+      // record dst as the current coroutine in kernelTLS
+      kernelTLS.this_coroutine = dst;
+
+      // context switch to specified coroutine
+      assert( src->stack.context );
+      CtxSwitch( src->stack.context, dst->stack.context );
+      // when CtxSwitch returns we are back in the src coroutine
+
+      // src has been switched back to, so it is marked active again
+      src->state = Active;
+
+      enable_interrupts( __cfaabi_dbg_ctx );
+      // Safety note : This could cause some false positives due to preemption
+      verify( TL_GET( preemption_state.enabled ) || TL_GET( this_processor )->do_terminate );
+} // CoroutineCtxSwitch
+
+void create_stack( coStack_t* this, unsigned int storageSize ) with( *this ) {
+      //TEMP HACK do this on proper kernel startup
+      if(pageSize == 0ul) pageSize = sysconf( _SC_PAGESIZE );
+
+      size_t cxtSize = libCeiling( sizeof(machine_context_t), 8 ); // minimum alignment
+      // no storage yet: allocate a stack here; otherwise adopt the storage already recorded in the descriptor
+      if ( !storage ) {
+            __cfaabi_dbg_print_safe("Kernel : Creating stack of size %zu for stack obj %p\n", cxtSize + size + 8, this);
+
+            userStack = false;
+            size = libCeiling( storageSize, 16 );
+            // use malloc/memalign because "new" raises an exception for out-of-memory
+
+            // assume malloc has 8 byte alignment so add 8 to allow rounding up to 16 byte alignment
+            __cfaabi_dbg_debug_do( storage = memalign( pageSize, cxtSize + size + pageSize ) );
+            __cfaabi_dbg_no_debug_do( storage = malloc( cxtSize + size + 8 ) );
+            // check for allocation failure first, so a NULL result is reported as out-of-memory and never handed to mprotect
+            if ( (intptr_t)storage == 0 ) {
+                  abort( "Attempt to allocate %zd bytes of storage for coroutine or task execution-state but insufficient memory available.", size );
+            } // if
+
+            // under __cfaabi_dbg_debug_do the first page is made inaccessible (PROT_NONE) and limit is placed just past it
+            __cfaabi_dbg_debug_do(
+                  if ( mprotect( storage, pageSize, PROT_NONE ) == -1 ) {
+                        abort( "(uMachContext &)%p.createContext() : internal error, mprotect failure, error(%d) %s.", this, (int)errno, strerror( (int)errno ) );
+                  } // if
+            );
+            __cfaabi_dbg_debug_do( limit = (char *)storage + pageSize );
+            __cfaabi_dbg_no_debug_do( limit = (char *)libCeiling( (unsigned long)storage, 16 ) ); // minimum alignment
+
+      } else {
+            __cfaabi_dbg_print_safe("Kernel : stack obj %p using user stack %p(%u bytes)\n", this, storage, storageSize);
+
+            assertf( ((size_t)storage & (libAlign() - 1)) == 0ul, "Stack storage %p for task/coroutine must be aligned on %d byte boundary.", storage, (int)libAlign() );
+            userStack = true;
+            size = storageSize - cxtSize;
+            // NOTE(review): a storageSize smaller than cxtSize presumably wraps this subtraction -- confirm callers rule that out
+            if ( size % 16 != 0u ) size -= 8;
+
+            limit = (char *)libCeiling( (unsigned long)storage, 16 ); // minimum alignment
+      } // if
+      assertf( size >= MinStackSize, "Stack size %zd provides less than minimum of %d bytes for a stack.", size, MinStackSize );
+      // the machine context is placed at base (limit + size); top lies cxtSize bytes beyond it
+      base = (char *)limit + size;
+      context = base;
+      top = (char *)context + cxtSize;
+}
+
+// We need to call suspend from invoke.c, so we expose this wrapper that
+// is not inline (We can't inline Cforall in C)
+extern "C" {
+      void __suspend_internal(void) {
+            suspend();       // plain forwarding call, giving C code a non-inline entry point
+      }
+
+      void __leave_coroutine(void) {
+            coroutine_desc * src = TL_GET( this_coroutine ); // optimization: fetch the current coroutine once and reuse it below
+            // switching back requires a starter to exist and that starter must not have halted
+            assertf( src->starter != 0,
+                  "Attempt to suspend/leave coroutine \"%.256s\" (%p) that has never been resumed.\n"
+                  "Possible cause is a suspend executed in a member called by a coroutine user rather than by the coroutine main.",
+                  src->name, src );
+            assertf( src->starter->state != Halted,
+                  "Attempt by coroutine \"%.256s\" (%p) to suspend/leave back to terminated coroutine \"%.256s\" (%p).\n"
+                  "Possible cause is terminated coroutine's main routine has already returned.",
+                  src->name, src, src->starter->name, src->starter );
+            // hand control from this coroutine back to its starter
+            CoroutineCtxSwitch( src, src->starter );
+      }
+}
+
+// Local Variables: //
+// mode: c //
+// tab-width: 4 //
+// End: //
Index: libcfa/src/concurrency/kernel.c
===================================================================
--- libcfa/src/concurrency/kernel.c	(revision bf71cfdb7285490eee552b461158846f626cc52f)
+++ 	(revision )
@@ -1,843 +1,0 @@
-//
-// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
-//
-// The contents of this file are covered under the licence agreement in the
-// file "LICENCE" distributed with Cforall.
-//
-// kernel.c --
-//
-// Author           : Thierry Delisle
-// Created On       : Tue Jan 17 12:27:26 2017
-// Last Modified By : Peter A. Buhr
-// Last Modified On : Mon Apr  9 16:11:46 2018
-// Update Count     : 24
-//
-
-//C Includes
-#include <stddef.h>
-#include <errno.h>
-#include <string.h>
-extern "C" {
-#include <stdio.h>
-#include <fenv.h>
-#include <sys/resource.h>
-#include <signal.h>
-#include <unistd.h>
-}
-
-//CFA Includes
-#include "time"
-#include "kernel_private.h"
-#include "preemption.h"
-#include "startup.h"
-
-//Private includes
-#define __CFA_INVOKE_PRIVATE__
-#include "invoke.h"
-
-//Start and stop routine for the kernel, declared first to make sure they run first
-static void kernel_startup(void)  __attribute__(( constructor( STARTUP_PRIORITY_KERNEL ) ));
-static void kernel_shutdown(void) __attribute__(( destructor ( STARTUP_PRIORITY_KERNEL ) ));
-
-//-----------------------------------------------------------------------------
-// Kernel storage
-KERNEL_STORAGE(cluster,           mainCluster);
-KERNEL_STORAGE(processor,         mainProcessor);
-KERNEL_STORAGE(thread_desc,       mainThread);
-KERNEL_STORAGE(machine_context_t, mainThreadCtx);
-
-cluster     * mainCluster;
-processor   * mainProcessor;
-thread_desc * mainThread;
-
-extern "C" {
-struct { __dllist_t(cluster) list; __spinlock_t lock; } __cfa_dbg_global_clusters;
-}
-
-//-----------------------------------------------------------------------------
-// Global state
-thread_local struct KernelThreadData kernelTLS = {
-	NULL,
-	NULL,
-	NULL,
-	{ 1, false, false }
-};
-
-//-----------------------------------------------------------------------------
-// Struct to steal stack
-struct current_stack_info_t {
-	machine_context_t ctx;
-	unsigned int size;		// size of stack
-	void *base;				// base of stack
-	void *storage;			// pointer to stack
-	void *limit;			// stack grows towards stack limit
-	void *context;			// address of cfa_context_t
-	void *top;				// address of top of storage
-};
-
-void ?{}( current_stack_info_t & this ) {
-	CtxGet( this.ctx );
-	this.base = this.ctx.FP;
-	this.storage = this.ctx.SP;
-
-	rlimit r;
-	getrlimit( RLIMIT_STACK, &r);
-	this.size = r.rlim_cur;
-
-	this.limit = (void *)(((intptr_t)this.base) - this.size);
-	this.context = &storage_mainThreadCtx;
-	this.top = this.base;
-}
-
-//-----------------------------------------------------------------------------
-// Main thread construction
-void ?{}( coStack_t & this, current_stack_info_t * info) with( this ) {
-	size      = info->size;
-	storage   = info->storage;
-	limit     = info->limit;
-	base      = info->base;
-	context   = info->context;
-	top       = info->top;
-	userStack = true;
-}
-
-void ?{}( coroutine_desc & this, current_stack_info_t * info) with( this ) {
-	stack{ info };
-	name = "Main Thread";
-	errno_ = 0;
-	state = Start;
-	starter = NULL;
-}
-
-void ?{}( thread_desc & this, current_stack_info_t * info) with( this ) {
-	self_cor{ info };
-	curr_cor = &self_cor;
-	curr_cluster = mainCluster;
-	self_mon.owner = &this;
-	self_mon.recursion = 1;
-	self_mon_p = &self_mon;
-	next = NULL;
-
-	node.next = NULL;
-	node.prev = NULL;
-	doregister(curr_cluster, this);
-
-	monitors{ &self_mon_p, 1, (fptr_t)0 };
-}
-
-//-----------------------------------------------------------------------------
-// Processor coroutine
-void ?{}(processorCtx_t & this) {
-
-}
-
-// Construct the processor context of non-main processors
-static void ?{}(processorCtx_t & this, processor * proc, current_stack_info_t * info) {
-	(this.__cor){ info };
-	this.proc = proc;
-}
-
-static void start(processor * this);
-void ?{}(processor & this, const char * name, cluster & cltr) with( this ) {
-	this.name = name;
-	this.cltr = &cltr;
-	terminated{ 0 };
-	do_terminate = false;
-	preemption_alarm = NULL;
-	pending_preemption = false;
-	runner.proc = &this;
-
-	idleLock{};
-
-	start( &this );
-}
-
-void ^?{}(processor & this) with( this ){
-	if( ! __atomic_load_n(&do_terminate, __ATOMIC_ACQUIRE) ) {
-		__cfaabi_dbg_print_safe("Kernel : core %p signaling termination\n", &this);
-
-		__atomic_store_n(&do_terminate, true, __ATOMIC_RELAXED);
-		wake( &this );
-
-		P( terminated );
-		verify( kernelTLS.this_processor != &this);
-	}
-
-	pthread_join( kernel_thread, NULL );
-}
-
-void ?{}(cluster & this, const char * name, Duration preemption_rate) with( this ) {
-	this.name = name;
-	this.preemption_rate = preemption_rate;
-	ready_queue{};
-	ready_queue_lock{};
-
-	procs{ __get };
-	idles{ __get };
-	threads{ __get };
-
-	doregister(this);
-}
-
-void ^?{}(cluster & this) {
-	unregister(this);
-}
-
-//=============================================================================================
-// Kernel Scheduling logic
-//=============================================================================================
-static void runThread(processor * this, thread_desc * dst);
-static void finishRunning(processor * this);
-static void halt(processor * this);
-
-//Main of the processor contexts
-void main(processorCtx_t & runner) {
-	processor * this = runner.proc;
-	verify(this);
-
-	__cfaabi_dbg_print_safe("Kernel : core %p starting\n", this);
-
-	doregister(this->cltr, this);
-
-	{
-		// Setup preemption data
-		preemption_scope scope = { this };
-
-		__cfaabi_dbg_print_safe("Kernel : core %p started\n", this);
-
-		thread_desc * readyThread = NULL;
-		for( unsigned int spin_count = 0; ! __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST); spin_count++ )
-		{
-			readyThread = nextThread( this->cltr );
-
-			if(readyThread)
-			{
-				verify( ! kernelTLS.preemption_state.enabled );
-
-				runThread(this, readyThread);
-
-				verify( ! kernelTLS.preemption_state.enabled );
-
-				//Some actions need to be taken from the kernel
-				finishRunning(this);
-
-				spin_count = 0;
-			}
-			else
-			{
-				// spin(this, &spin_count);
-				halt(this);
-			}
-		}
-
-		__cfaabi_dbg_print_safe("Kernel : core %p stopping\n", this);
-	}
-
-	unregister(this->cltr, this);
-
-	V( this->terminated );
-
-	__cfaabi_dbg_print_safe("Kernel : core %p terminated\n", this);
-}
-
-// KERNEL ONLY
-// runThread runs a thread by context switching
-// from the processor coroutine to the target thread
-static void runThread(processor * this, thread_desc * dst) {
-	assert(dst->curr_cor);
-	coroutine_desc * proc_cor = get_coroutine(this->runner);
-	coroutine_desc * thrd_cor = dst->curr_cor;
-
-	// Reset the terminating actions here
-	this->finish.action_code = No_Action;
-
-	// Update global state
-	kernelTLS.this_thread = dst;
-
-	// Context Switch to the thread
-	ThreadCtxSwitch(proc_cor, thrd_cor);
-	// when ThreadCtxSwitch returns we are back in the processor coroutine
-}
-
-// KERNEL_ONLY
-static void returnToKernel() {
-	coroutine_desc * proc_cor = get_coroutine(kernelTLS.this_processor->runner);
-	coroutine_desc * thrd_cor = kernelTLS.this_thread->curr_cor = kernelTLS.this_coroutine;
-	ThreadCtxSwitch(thrd_cor, proc_cor);
-}
-
-// KERNEL_ONLY
-// Once a thread has finished running, some of
-// its final actions must be executed from the kernel
-static void finishRunning(processor * this) with( this->finish ) {
-	verify( ! kernelTLS.preemption_state.enabled );
-	choose( action_code ) {
-	case No_Action:
-		break;
-	case Release:
-		unlock( *lock );
-	case Schedule:
-		ScheduleThread( thrd );
-	case Release_Schedule:
-		unlock( *lock );
-		ScheduleThread( thrd );
-	case Release_Multi:
-		for(int i = 0; i < lock_count; i++) {
-			unlock( *locks[i] );
-		}
-	case Release_Multi_Schedule:
-		for(int i = 0; i < lock_count; i++) {
-			unlock( *locks[i] );
-		}
-		for(int i = 0; i < thrd_count; i++) {
-			ScheduleThread( thrds[i] );
-		}
-	case Callback:
-		callback();
-	default:
-		abort("KERNEL ERROR: Unexpected action to run after thread");
-	}
-}
-
-// KERNEL_ONLY
-// Context invoker for processors
-// This is the entry point for processors (kernel threads)
-// It effectively constructs a coroutine by stealing the pthread stack
-static void * CtxInvokeProcessor(void * arg) {
-	processor * proc = (processor *) arg;
-	kernelTLS.this_processor = proc;
-	kernelTLS.this_coroutine = NULL;
-	kernelTLS.this_thread    = NULL;
-	kernelTLS.preemption_state.[enabled, disable_count] = [false, 1];
-	// SKULLDUGGERY: We want to create a context for the processor coroutine
-	// which is needed for the 2-step context switch. However, there is no reason
-	// to waste the perfectly valid stack create by pthread.
-	current_stack_info_t info;
-	machine_context_t ctx;
-	info.context = &ctx;
-	(proc->runner){ proc, &info };
-
-	__cfaabi_dbg_print_safe("Coroutine : created stack %p\n", get_coroutine(proc->runner)->stack.base);
-
-	//Set global state
-	kernelTLS.this_coroutine = get_coroutine(proc->runner);
-	kernelTLS.this_thread    = NULL;
-
-	//We now have a proper context from which to schedule threads
-	__cfaabi_dbg_print_safe("Kernel : core %p created (%p, %p)\n", proc, &proc->runner, &ctx);
-
-	// SKULLDUGGERY: Since the coroutine doesn't have its own stack, we can't
-	// resume it to start it like it normally would, it will just context switch
-	// back to here. Instead directly call the main since we already are on the
-	// appropriate stack.
-	get_coroutine(proc->runner)->state = Active;
-	main( proc->runner );
-	get_coroutine(proc->runner)->state = Halted;
-
-	// Main routine of the core returned, the core is now fully terminated
-	__cfaabi_dbg_print_safe("Kernel : core %p main ended (%p)\n", proc, &proc->runner);
-
-	return NULL;
-}
-
-static void start(processor * this) {
-	__cfaabi_dbg_print_safe("Kernel : Starting core %p\n", this);
-
-	pthread_create( &this->kernel_thread, NULL, CtxInvokeProcessor, (void*)this );
-
-	__cfaabi_dbg_print_safe("Kernel : core %p started\n", this);
-}
-
-// KERNEL_ONLY
-void kernel_first_resume(processor * this) {
-	coroutine_desc * src = kernelTLS.this_coroutine;
-	coroutine_desc * dst = get_coroutine(this->runner);
-
-	verify( ! kernelTLS.preemption_state.enabled );
-
-	create_stack(&dst->stack, dst->stack.size);
-	CtxStart(&this->runner, CtxInvokeCoroutine);
-
-	verify( ! kernelTLS.preemption_state.enabled );
-
-	dst->last = src;
-	dst->starter = dst->starter ? dst->starter : src;
-
-	// set state of current coroutine to inactive
-	src->state = src->state == Halted ? Halted : Inactive;
-
-	// set new coroutine that task is executing
-	kernelTLS.this_coroutine = dst;
-
-	// SKULLDUGGERY normally interrupts are enable before leaving a coroutine ctxswitch.
-	// Therefore, when first creating a coroutine, interrupts are enable before calling the main.
-	// This is consistent with thread creation. However, when creating the main processor coroutine,
-	// we wan't interrupts to be disabled. Therefore, we double-disable interrupts here so they will
-	// stay disabled.
-	disable_interrupts();
-
-	// context switch to specified coroutine
-	assert( src->stack.context );
-	CtxSwitch( src->stack.context, dst->stack.context );
-	// when CtxSwitch returns we are back in the src coroutine
-
-	// set state of new coroutine to active
-	src->state = Active;
-
-	verify( ! kernelTLS.preemption_state.enabled );
-}
-
-//-----------------------------------------------------------------------------
-// Scheduler routines
-
-// KERNEL ONLY
-void ScheduleThread( thread_desc * thrd ) {
-	verify( thrd );
-	verify( thrd->self_cor.state != Halted );
-
-	verify( ! kernelTLS.preemption_state.enabled );
-
-	verifyf( thrd->next == NULL, "Expected null got %p", thrd->next );
-
-	with( *thrd->curr_cluster ) {
-		lock  ( ready_queue_lock __cfaabi_dbg_ctx2 );
-		bool was_empty = !(ready_queue != 0);
-		append( ready_queue, thrd );
-		unlock( ready_queue_lock );
-
-		if(was_empty) {
-			lock      (proc_list_lock __cfaabi_dbg_ctx2);
-			if(idles) {
-				wake_fast(idles.head);
-			}
-			unlock    (proc_list_lock);
-		}
-		else if( struct processor * idle = idles.head ) {
-			wake_fast(idle);
-		}
-
-	}
-
-	verify( ! kernelTLS.preemption_state.enabled );
-}
-
-// KERNEL ONLY
-thread_desc * nextThread(cluster * this) with( *this ) {
-	verify( ! kernelTLS.preemption_state.enabled );
-	lock( ready_queue_lock __cfaabi_dbg_ctx2 );
-	thread_desc * head = pop_head( ready_queue );
-	unlock( ready_queue_lock );
-	verify( ! kernelTLS.preemption_state.enabled );
-	return head;
-}
-
-void BlockInternal() {
-	disable_interrupts();
-	verify( ! kernelTLS.preemption_state.enabled );
-	returnToKernel();
-	verify( ! kernelTLS.preemption_state.enabled );
-	enable_interrupts( __cfaabi_dbg_ctx );
-}
-
-void BlockInternal( __spinlock_t * lock ) {
-	disable_interrupts();
-	with( *kernelTLS.this_processor ) {
-		finish.action_code = Release;
-		finish.lock        = lock;
-	}
-
-	verify( ! kernelTLS.preemption_state.enabled );
-	returnToKernel();
-	verify( ! kernelTLS.preemption_state.enabled );
-
-	enable_interrupts( __cfaabi_dbg_ctx );
-}
-
-void BlockInternal( thread_desc * thrd ) {
-	disable_interrupts();
-	with( * kernelTLS.this_processor ) {
-		finish.action_code = Schedule;
-		finish.thrd        = thrd;
-	}
-
-	verify( ! kernelTLS.preemption_state.enabled );
-	returnToKernel();
-	verify( ! kernelTLS.preemption_state.enabled );
-
-	enable_interrupts( __cfaabi_dbg_ctx );
-}
-
-void BlockInternal( __spinlock_t * lock, thread_desc * thrd ) {
-	assert(thrd);
-	disable_interrupts();
-	with( * kernelTLS.this_processor ) {
-		finish.action_code = Release_Schedule;
-		finish.lock        = lock;
-		finish.thrd        = thrd;
-	}
-
-	verify( ! kernelTLS.preemption_state.enabled );
-	returnToKernel();
-	verify( ! kernelTLS.preemption_state.enabled );
-
-	enable_interrupts( __cfaabi_dbg_ctx );
-}
-
-void BlockInternal(__spinlock_t * locks [], unsigned short count) {
-	disable_interrupts();
-	with( * kernelTLS.this_processor ) {
-		finish.action_code = Release_Multi;
-		finish.locks       = locks;
-		finish.lock_count  = count;
-	}
-
-	verify( ! kernelTLS.preemption_state.enabled );
-	returnToKernel();
-	verify( ! kernelTLS.preemption_state.enabled );
-
-	enable_interrupts( __cfaabi_dbg_ctx );
-}
-
-void BlockInternal(__spinlock_t * locks [], unsigned short lock_count, thread_desc * thrds [], unsigned short thrd_count) {
-	disable_interrupts();
-	with( *kernelTLS.this_processor ) {
-		finish.action_code = Release_Multi_Schedule;
-		finish.locks       = locks;
-		finish.lock_count  = lock_count;
-		finish.thrds       = thrds;
-		finish.thrd_count  = thrd_count;
-	}
-
-	verify( ! kernelTLS.preemption_state.enabled );
-	returnToKernel();
-	verify( ! kernelTLS.preemption_state.enabled );
-
-	enable_interrupts( __cfaabi_dbg_ctx );
-}
-
-void BlockInternal(__finish_callback_fptr_t callback) {
-	disable_interrupts();
-	with( *kernelTLS.this_processor ) {
-		finish.action_code = Callback;
-		finish.callback    = callback;
-	}
-
-	verify( ! kernelTLS.preemption_state.enabled );
-	returnToKernel();
-	verify( ! kernelTLS.preemption_state.enabled );
-
-	enable_interrupts( __cfaabi_dbg_ctx );
-}
-
-// KERNEL ONLY
-void LeaveThread(__spinlock_t * lock, thread_desc * thrd) {
-	verify( ! kernelTLS.preemption_state.enabled );
-	with( * kernelTLS.this_processor ) {
-		finish.action_code = thrd ? Release_Schedule : Release;
-		finish.lock        = lock;
-		finish.thrd        = thrd;
-	}
-
-	returnToKernel();
-}
-
-//=============================================================================================
-// Kernel Setup logic
-//=============================================================================================
-//-----------------------------------------------------------------------------
-// Kernel boot procedures
-static void kernel_startup(void) {
-	verify( ! kernelTLS.preemption_state.enabled );
-	__cfaabi_dbg_print_safe("Kernel : Starting\n");
-
-	__cfa_dbg_global_clusters.list{ __get };
-	__cfa_dbg_global_clusters.lock{};
-
-	// Initialize the main cluster
-	mainCluster = (cluster *)&storage_mainCluster;
-	(*mainCluster){"Main Cluster"};
-
-	__cfaabi_dbg_print_safe("Kernel : Main cluster ready\n");
-
-	// Start by initializing the main thread
-	// SKULLDUGGERY: the mainThread steals the process main thread
-	// which will then be scheduled by the mainProcessor normally
-	mainThread = (thread_desc *)&storage_mainThread;
-	current_stack_info_t info;
-	(*mainThread){ &info };
-
-	__cfaabi_dbg_print_safe("Kernel : Main thread ready\n");
-
-
-
-	// Construct the processor context of the main processor
-	void ?{}(processorCtx_t & this, processor * proc) {
-		(this.__cor){ "Processor" };
-		this.__cor.starter = NULL;
-		this.proc = proc;
-	}
-
-	void ?{}(processor & this) with( this ) {
-		name = "Main Processor";
-		cltr = mainCluster;
-		terminated{ 0 };
-		do_terminate = false;
-		preemption_alarm = NULL;
-		pending_preemption = false;
-		kernel_thread = pthread_self();
-
-		runner{ &this };
-		__cfaabi_dbg_print_safe("Kernel : constructed main processor context %p\n", &runner);
-	}
-
-	// Initialize the main processor and the main processor ctx
-	// (the coroutine that contains the processing control flow)
-	mainProcessor = (processor *)&storage_mainProcessor;
-	(*mainProcessor){};
-
-	//initialize the global state variables
-	kernelTLS.this_processor = mainProcessor;
-	kernelTLS.this_thread    = mainThread;
-	kernelTLS.this_coroutine = &mainThread->self_cor;
-
-	// Enable preemption
-	kernel_start_preemption();
-
-	// Add the main thread to the ready queue
-	// once resume is called on mainProcessor->runner the mainThread needs to be scheduled like any normal thread
-	ScheduleThread(mainThread);
-
-	// SKULLDUGGERY: Force a context switch to the main processor to set the main thread's context to the current UNIX
-	// context. Hence, the main thread does not begin through CtxInvokeThread, like all other threads. The trick here is that
-	// mainThread is on the ready queue when this call is made.
-	kernel_first_resume( kernelTLS.this_processor );
-
-
-
-	// THE SYSTEM IS NOW COMPLETELY RUNNING
-	__cfaabi_dbg_print_safe("Kernel : Started\n--------------------------------------------------\n\n");
-
-	verify( ! kernelTLS.preemption_state.enabled );
-	enable_interrupts( __cfaabi_dbg_ctx );
-	verify( TL_GET( preemption_state.enabled ) );
-}
-
-static void kernel_shutdown(void) {
-	__cfaabi_dbg_print_safe("\n--------------------------------------------------\nKernel : Shutting down\n");
-
-	verify( TL_GET( preemption_state.enabled ) );
-	disable_interrupts();
-	verify( ! kernelTLS.preemption_state.enabled );
-
-	// SKULLDUGGERY: Notify the mainProcessor it needs to terminates.
-	// When its coroutine terminates, it return control to the mainThread
-	// which is currently here
-	__atomic_store_n(&mainProcessor->do_terminate, true, __ATOMIC_RELEASE);
-	returnToKernel();
-	mainThread->self_cor.state = Halted;
-
-	// THE SYSTEM IS NOW COMPLETELY STOPPED
-
-	// Disable preemption
-	kernel_stop_preemption();
-
-	// Destroy the main processor and its context in reverse order of construction
-	// These were manually constructed so we need manually destroy them
-	^(mainProcessor->runner){};
-	^(mainProcessor){};
-
-	// Final step, destroy the main thread since it is no longer needed
-	// Since we provided a stack to this taxk it will not destroy anything
-	^(mainThread){};
-
-	^(__cfa_dbg_global_clusters.list){};
-	^(__cfa_dbg_global_clusters.lock){};
-
-	__cfaabi_dbg_print_safe("Kernel : Shutdown complete\n");
-}
-
-//=============================================================================================
-// Kernel Quiescing
-//=============================================================================================
-static void halt(processor * this) with( *this ) {
-	// verify( ! __atomic_load_n(&do_terminate, __ATOMIC_SEQ_CST) );
-
-	with( *cltr ) {
-		lock      (proc_list_lock __cfaabi_dbg_ctx2);
-		remove    (procs, *this);
-		push_front(idles, *this);
-		unlock    (proc_list_lock);
-	}
-
-	__cfaabi_dbg_print_safe("Kernel : Processor %p ready to sleep\n", this);
-
-	wait( idleLock );
-
-	__cfaabi_dbg_print_safe("Kernel : Processor %p woke up and ready to run\n", this);
-
-	with( *cltr ) {
-		lock      (proc_list_lock __cfaabi_dbg_ctx2);
-		remove    (idles, *this);
-		push_front(procs, *this);
-		unlock    (proc_list_lock);
-	}
-}
-
-//=============================================================================================
-// Unexpected Terminating logic
-//=============================================================================================
-static __spinlock_t kernel_abort_lock;
-static bool kernel_abort_called = false;
-
-void * kernel_abort(void) __attribute__ ((__nothrow__)) {
-	// abort cannot be recursively entered by the same or different processors because all signal handlers return when
-	// the globalAbort flag is true.
-	lock( kernel_abort_lock __cfaabi_dbg_ctx2 );
-
-	// first task to abort ?
-	if ( kernel_abort_called ) {			// not first task to abort ?
-		unlock( kernel_abort_lock );
-
-		sigset_t mask;
-		sigemptyset( &mask );
-		sigaddset( &mask, SIGALRM );		// block SIGALRM signals
-		sigsuspend( &mask );			// block the processor to prevent further damage during abort
-		_exit( EXIT_FAILURE );			// if processor unblocks before it is killed, terminate it
-	}
-	else {
-		kernel_abort_called = true;
-		unlock( kernel_abort_lock );
-	}
-
-	return kernelTLS.this_thread;
-}
-
-void kernel_abort_msg( void * kernel_data, char * abort_text, int abort_text_size ) {
-	thread_desc * thrd = kernel_data;
-
-	if(thrd) {
-		int len = snprintf( abort_text, abort_text_size, "Error occurred while executing thread %.256s (%p)", thrd->self_cor.name, thrd );
-		__cfaabi_dbg_bits_write( abort_text, len );
-
-		if ( get_coroutine(thrd) != kernelTLS.this_coroutine ) {
-			len = snprintf( abort_text, abort_text_size, " in coroutine %.256s (%p).\n", kernelTLS.this_coroutine->name, kernelTLS.this_coroutine );
-			__cfaabi_dbg_bits_write( abort_text, len );
-		}
-		else {
-			__cfaabi_dbg_bits_write( ".\n", 2 );
-		}
-	}
-	else {
-		int len = snprintf( abort_text, abort_text_size, "Error occurred outside of any thread.\n" );
-		__cfaabi_dbg_bits_write( abort_text, len );
-	}
-}
-
-int kernel_abort_lastframe( void ) __attribute__ ((__nothrow__)) {
-	return get_coroutine(kernelTLS.this_thread) == get_coroutine(mainThread) ? 4 : 2;
-}
-
-static __spinlock_t kernel_debug_lock;
-
-extern "C" {
-	void __cfaabi_dbg_bits_acquire() {
-		lock( kernel_debug_lock __cfaabi_dbg_ctx2 );
-	}
-
-	void __cfaabi_dbg_bits_release() {
-		unlock( kernel_debug_lock );
-	}
-}
-
-//=============================================================================================
-// Kernel Utilities
-//=============================================================================================
-//-----------------------------------------------------------------------------
-// Locks
-void  ?{}( semaphore & this, int count = 1 ) {
-	(this.lock){};
-	this.count = count;
-	(this.waiting){};
-}
-void ^?{}(semaphore & this) {}
-
-void P(semaphore & this) with( this ){
-	lock( lock __cfaabi_dbg_ctx2 );
-	count -= 1;
-	if ( count < 0 ) {
-		// queue current task
-		append( waiting, kernelTLS.this_thread );
-
-		// atomically release spin lock and block
-		BlockInternal( &lock );
-	}
-	else {
-	    unlock( lock );
-	}
-}
-
-void V(semaphore & this) with( this ) {
-	thread_desc * thrd = NULL;
-	lock( lock __cfaabi_dbg_ctx2 );
-	count += 1;
-	if ( count <= 0 ) {
-		// remove task at head of waiting list
-		thrd = pop_head( waiting );
-	}
-
-	unlock( lock );
-
-	// make new owner
-	WakeThread( thrd );
-}
-
-//-----------------------------------------------------------------------------
-// Global Queues
-void doregister( cluster     & cltr ) {
-	lock      ( __cfa_dbg_global_clusters.lock __cfaabi_dbg_ctx2);
-	push_front( __cfa_dbg_global_clusters.list, cltr );
-	unlock    ( __cfa_dbg_global_clusters.lock );
-}
-
-void unregister( cluster     & cltr ) {
-	lock  ( __cfa_dbg_global_clusters.lock __cfaabi_dbg_ctx2);
-	remove( __cfa_dbg_global_clusters.list, cltr );
-	unlock( __cfa_dbg_global_clusters.lock );
-}
-
-void doregister( cluster * cltr, thread_desc & thrd ) {
-	lock      (cltr->thread_list_lock __cfaabi_dbg_ctx2);
-	push_front(cltr->threads, thrd);
-	unlock    (cltr->thread_list_lock);
-}
-
-void unregister( cluster * cltr, thread_desc & thrd ) {
-	lock  (cltr->thread_list_lock __cfaabi_dbg_ctx2);
-	remove(cltr->threads, thrd );
-	unlock(cltr->thread_list_lock);
-}
-
-void doregister( cluster * cltr, processor * proc ) {
-	lock      (cltr->proc_list_lock __cfaabi_dbg_ctx2);
-	push_front(cltr->procs, *proc);
-	unlock    (cltr->proc_list_lock);
-}
-
-void unregister( cluster * cltr, processor * proc ) {
-	lock  (cltr->proc_list_lock __cfaabi_dbg_ctx2);
-	remove(cltr->procs, *proc );
-	unlock(cltr->proc_list_lock);
-}
-
-//-----------------------------------------------------------------------------
-// Debug
-__cfaabi_dbg_debug_do(
-	void __cfaabi_dbg_record(__spinlock_t & this, const char * prev_name) {
-		this.prev_name = prev_name;
-		this.prev_thrd = kernelTLS.this_thread;
-	}
-)
-// Local Variables: //
-// mode: c //
-// tab-width: 4 //
-// End: //
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 4dcaed2c212a42d633824e9b88053258a2d0f969)
+++ libcfa/src/concurrency/kernel.cfa	(revision 4dcaed2c212a42d633824e9b88053258a2d0f969)
@@ -0,0 +1,843 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// kernel.c --
+//
+// Author           : Thierry Delisle
+// Created On       : Tue Jan 17 12:27:26 2017
+// Last Modified By : Peter A. Buhr
+// Last Modified On : Mon Apr  9 16:11:46 2018
+// Update Count     : 24
+//
+
+//C Includes
+#include <stddef.h>
+#include <errno.h>
+#include <string.h>
+extern "C" {
+#include <stdio.h>
+#include <fenv.h>
+#include <sys/resource.h>
+#include <signal.h>
+#include <unistd.h>
+}
+
+//CFA Includes
+#include "time"
+#include "kernel_private.h"
+#include "preemption.h"
+#include "startup.h"
+
+//Private includes
+#define __CFA_INVOKE_PRIVATE__
+#include "invoke.h"
+
+// Start and stop routines for the kernel, declared first to make sure they run first
+static void kernel_startup(void)  __attribute__(( constructor( STARTUP_PRIORITY_KERNEL ) ));
+static void kernel_shutdown(void) __attribute__(( destructor ( STARTUP_PRIORITY_KERNEL ) ));
+
+//-----------------------------------------------------------------------------
+// Kernel storage: presumably KERNEL_STORAGE(T, X) reserves raw storage named storage_X (cf. &storage_mainThreadCtx below) -- TODO confirm macro
+KERNEL_STORAGE(cluster,           mainCluster);
+KERNEL_STORAGE(processor,         mainProcessor);
+KERNEL_STORAGE(thread_desc,       mainThread);
+KERNEL_STORAGE(machine_context_t, mainThreadCtx);
+
+cluster     * mainCluster;	// global handle to the main cluster
+processor   * mainProcessor;	// global handle to the main processor
+thread_desc * mainThread;	// global handle to the main thread
+
+extern "C" {
+struct { __dllist_t(cluster) list; __spinlock_t lock; } __cfa_dbg_global_clusters;	// list of clusters plus the spinlock declared alongside it
+}
+
+//-----------------------------------------------------------------------------
+// Global state: thread_local, so every kernel thread has its own kernelTLS instance
+thread_local struct KernelThreadData kernelTLS = {
+	NULL,
+	NULL,
+	NULL,
+	{ 1, false, false }	// NOTE(review): positional initializer, presumably preemption_state -- confirm field order in KernelThreadData
+};
+
+//-----------------------------------------------------------------------------
+// Struct to steal stack: describes a stack already in use; consumed by the coStack_t constructor taking a current_stack_info_t *
+struct current_stack_info_t {
+	machine_context_t ctx;	// filled by CtxGet in the default constructor
+	unsigned int size;		// size of stack
+	void *base;				// base of stack
+	void *storage;			// pointer to stack
+	void *limit;			// stack grows towards stack limit
+	void *context;			// address of cfa_context_t
+	void *top;				// address of top of storage
+};
+
+void ?{}( current_stack_info_t & this ) {	// default constructor: fills every field from the calling context and RLIMIT_STACK
+	CtxGet( this.ctx );	// presumably captures the current machine context into ctx -- TODO confirm CtxGet
+	this.base = this.ctx.FP;	// FP field of the captured context is used as the stack base
+	this.storage = this.ctx.SP;	// SP field of the captured context is used as the storage pointer
+
+	rlimit r;
+	getrlimit( RLIMIT_STACK, &r);	// NOTE(review): return value is not checked
+	this.size = r.rlim_cur;	// NOTE(review): rlim_cur is stored in an unsigned int; confirm behaviour for large or unlimited limits
+
+	this.limit = (void *)(((intptr_t)this.base) - this.size);	// limit lies size bytes below base
+	this.context = &storage_mainThreadCtx;	// default context storage; CtxInvokeProcessor overwrites this field with its own
+	this.top = this.base;
+}
+
+//-----------------------------------------------------------------------------
+// Main thread construction
+void ?{}( coStack_t & this, current_stack_info_t * info) with( this ) {	// copy an existing stack description into this coStack_t
+	size      = info->size;
+	storage   = info->storage;
+	limit     = info->limit;
+	base      = info->base;
+	context   = info->context;
+	top       = info->top;
+	userStack = true;	// presumably marks the storage as not owned by this coStack_t -- TODO confirm where userStack is read
+}
+
+void ?{}( coroutine_desc & this, current_stack_info_t * info) with( this ) {	// coroutine descriptor on top of an existing stack
+	stack{ info };	// coStack_t constructor taking a current_stack_info_t *
+	name = "Main Thread";	// NOTE(review): the processorCtx_t constructor appears to reach this too, so those coroutines get the same name -- confirm
+	errno_ = 0;
+	state = Start;
+	starter = NULL;
+}
+
+void ?{}( thread_desc & this, current_stack_info_t * info) with( this ) {	// thread descriptor running on an existing stack
+	self_cor{ info };
+	curr_cor = &self_cor;	// the thread starts out executing its own coroutine
+	curr_cluster = mainCluster;	// this constructor always uses the main cluster
+	self_mon.owner = &this;	// the thread's own monitor starts out owned by the thread itself
+	self_mon.recursion = 1;
+	self_mon_p = &self_mon;
+	next = NULL;	// ScheduleThread verifies next == NULL before enqueueing
+
+	node.next = NULL;
+	node.prev = NULL;
+	doregister(curr_cluster, this);	// register the thread with its cluster
+
+	monitors{ &self_mon_p, 1, (fptr_t)0 };	// monitor group holding only self_mon_p (count 1, null function pointer)
+}
+
+//-----------------------------------------------------------------------------
+// Processor coroutine
+void ?{}(processorCtx_t & this) {	// empty default constructor: performs no initialization
+
+}
+
+// Construct the processor context of non-main processors on top of the stack described by info
+static void ?{}(processorCtx_t & this, processor * proc, current_stack_info_t * info) {
+	(this.__cor){ info };	// construct the embedded coroutine from the existing stack description
+	this.proc = proc;	// back-pointer read by main(processorCtx_t &)
+}
+
+static void start(processor * this);
+void ?{}(processor & this, const char * name, cluster & cltr) with( this ) {	// initialise the fields, then launch the kernel thread
+	this.name = name;
+	this.cltr = &cltr;
+	terminated{ 0 };	// initial value 0: main() does V on exit, ^?{} does P to wait for it
+	do_terminate = false;
+	preemption_alarm = NULL;
+	pending_preemption = false;
+	runner.proc = &this;	// runner itself is constructed later, inside CtxInvokeProcessor
+
+	idleLock{};
+
+	start( &this );	// pthread_create with CtxInvokeProcessor as the entry point
+}
+
+void ^?{}(processor & this) with( this ){	// request termination if not already requested, wait for it, then join the kernel thread
+	if( ! __atomic_load_n(&do_terminate, __ATOMIC_ACQUIRE) ) {
+		__cfaabi_dbg_print_safe("Kernel : core %p signaling termination\n", &this);
+
+		__atomic_store_n(&do_terminate, true, __ATOMIC_RELAXED);	// NOTE(review): relaxed store; presumably relies on wake() for ordering -- confirm
+		wake( &this );	// presumably unblocks a processor waiting in halt() so it re-checks do_terminate -- TODO confirm wake()
+
+		P( terminated );	// blocks until the V( this->terminated ) at the end of main(processorCtx_t &)
+		verify( kernelTLS.this_processor != &this);
+	}
+
+	pthread_join( kernel_thread, NULL );	// NOTE(review): return value is not checked
+}
+
+void ?{}(cluster & this, const char * name, Duration preemption_rate) with( this ) {	// set up the cluster's queues and lists, then register it
+	this.name = name;
+	this.preemption_rate = preemption_rate;
+	ready_queue{};
+	ready_queue_lock{};
+
+	procs{ __get };	// __get is passed to each list constructor; presumably the node accessor -- TODO confirm
+	idles{ __get };
+	threads{ __get };
+
+	doregister(this);	// register the cluster (see doregister( cluster & ))
+}
+
+void ^?{}(cluster & this) {	// destructor: only undoes the doregister(this) done by the constructor
+	unregister(this);
+}
+
+//=============================================================================================
+// Kernel Scheduling logic
+//=============================================================================================
+static void runThread(processor * this, thread_desc * dst);
+static void finishRunning(processor * this);
+static void halt(processor * this);
+
+//Main of the processor contexts: fetch threads from the cluster and run them until do_terminate is set
+void main(processorCtx_t & runner) {
+	processor * this = runner.proc;
+	verify(this);
+
+	__cfaabi_dbg_print_safe("Kernel : core %p starting\n", this);
+
+	doregister(this->cltr, this);	// paired with the unregister after the scheduling loop
+
+	{
+		// Setup preemption data
+		preemption_scope scope = { this };
+
+		__cfaabi_dbg_print_safe("Kernel : core %p started\n", this);
+
+		thread_desc * readyThread = NULL;
+		for( unsigned int spin_count = 0; ! __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST); spin_count++ )
+		{
+			readyThread = nextThread( this->cltr );	// may be NULL; handled by the else branch below
+
+			if(readyThread)
+			{
+				verify( ! kernelTLS.preemption_state.enabled );
+
+				runThread(this, readyThread);	// returns once the thread context-switches back to this processor
+
+				verify( ! kernelTLS.preemption_state.enabled );
+
+				//Some actions need to be taken from the kernel
+				finishRunning(this);
+
+				spin_count = 0;	// only consumer of spin_count is the commented-out spin() below
+			}
+			else
+			{
+				// spin(this, &spin_count);
+				halt(this);	// no thread was returned: halt this processor
+			}
+		}
+
+		__cfaabi_dbg_print_safe("Kernel : core %p stopping\n", this);
+	}	// end of the block holding the preemption_scope
+
+	unregister(this->cltr, this);
+
+	V( this->terminated );	// releases the P( terminated ) in ^?{}(processor &)
+
+	__cfaabi_dbg_print_safe("Kernel : core %p terminated\n", this);
+}
+
+// KERNEL ONLY
+// runThread runs a thread by context switching
+// from the processor coroutine to the target thread
+static void runThread(processor * this, thread_desc * dst) {
+	assert(dst->curr_cor);
+	coroutine_desc * proc_cor = get_coroutine(this->runner);
+	coroutine_desc * thrd_cor = dst->curr_cor;	// returnToKernel stores the coroutine that was current when the thread last left
+
+	// Reset the post-run action; the BlockInternal variants set it before switching back and finishRunning consumes it
+	this->finish.action_code = No_Action;
+
+	// Update global state
+	kernelTLS.this_thread = dst;
+
+	// Context Switch to the thread
+	ThreadCtxSwitch(proc_cor, thrd_cor);
+	// when ThreadCtxSwitch returns we are back in the processor coroutine
+}
+
+// KERNEL_ONLY: switch from the current thread back to the coroutine of the processor it is running on
+static void returnToKernel() {
+	coroutine_desc * proc_cor = get_coroutine(kernelTLS.this_processor->runner);
+	coroutine_desc * thrd_cor = kernelTLS.this_thread->curr_cor = kernelTLS.this_coroutine;	// also records the current coroutine in the thread, for runThread to resume
+	ThreadCtxSwitch(thrd_cor, proc_cor);	// mirror image of the switch in runThread
+}
+
+// KERNEL_ONLY
+// Once a thread has finished running, some of its final actions must be executed from the kernel.
+// The action and its operands are read from this->finish, filled in by BlockInternal / LeaveThread.
+static void finishRunning(processor * this) with( this->finish ) {
+	verify( ! kernelTLS.preemption_state.enabled );
+	choose( action_code ) {	// Cforall choose: each case ends with an implicit break, so the cases below do NOT fall through
+	case No_Action:
+		break;
+	case Release:	// set by BlockInternal( lock ) and by LeaveThread when thrd is NULL
+		unlock( *lock );
+	case Schedule:	// set by BlockInternal( thrd )
+		ScheduleThread( thrd );
+	case Release_Schedule:	// set by BlockInternal( lock, thrd ) and by LeaveThread when thrd is non-NULL
+		unlock( *lock );
+		ScheduleThread( thrd );
+	case Release_Multi:	// release lock_count locks
+		for(int i = 0; i < lock_count; i++) {
+			unlock( *locks[i] );
+		}
+	case Release_Multi_Schedule:	// release every lock first, then schedule every thread
+		for(int i = 0; i < lock_count; i++) {
+			unlock( *locks[i] );
+		}
+		for(int i = 0; i < thrd_count; i++) {
+			ScheduleThread( thrds[i] );
+		}
+	case Callback:	// run the stored callback from the processor's context
+		callback();
+	default:	// any other action_code value is treated as a fatal kernel error
+		abort("KERNEL ERROR: Unexpected action to run after thread");
+	}
+}
+
+// KERNEL_ONLY
+// Context invoker for processors
+// This is the entry point for processors (kernel threads); it is the start routine passed to pthread_create in start()
+// It effectively constructs a coroutine by stealing the pthread stack
+static void * CtxInvokeProcessor(void * arg) {
+	processor * proc = (processor *) arg;
+	kernelTLS.this_processor = proc;
+	kernelTLS.this_coroutine = NULL;
+	kernelTLS.this_thread    = NULL;
+	kernelTLS.preemption_state.[enabled, disable_count] = [false, 1];	// tuple assignment: enabled = false, disable_count = 1
+	// SKULLDUGGERY: We want to create a context for the processor coroutine
+	// which is needed for the 2-step context switch. However, there is no reason
+	// to waste the perfectly valid stack created by pthread.
+	current_stack_info_t info;
+	machine_context_t ctx;
+	info.context = &ctx;	// use a local context instead of the storage_mainThreadCtx default set by the constructor
+	(proc->runner){ proc, &info };
+
+	__cfaabi_dbg_print_safe("Coroutine : created stack %p\n", get_coroutine(proc->runner)->stack.base);
+
+	//Set global state
+	kernelTLS.this_coroutine = get_coroutine(proc->runner);
+	kernelTLS.this_thread    = NULL;
+
+	//We now have a proper context from which to schedule threads
+	__cfaabi_dbg_print_safe("Kernel : core %p created (%p, %p)\n", proc, &proc->runner, &ctx);
+
+	// SKULLDUGGERY: Since the coroutine doesn't have its own stack, we can't
+	// resume it to start it like it normally would, it will just context switch
+	// back to here. Instead directly call the main since we already are on the
+	// appropriate stack.
+	get_coroutine(proc->runner)->state = Active;
+	main( proc->runner );	// runs the scheduling loop; returns after do_terminate is observed
+	get_coroutine(proc->runner)->state = Halted;
+
+	// Main routine of the core returned, the core is now fully terminated
+	__cfaabi_dbg_print_safe("Kernel : core %p main ended (%p)\n", proc, &proc->runner);
+
+	return NULL;	// thread exit value; the pthread_join in ^?{}(processor &) passes NULL and ignores it
+}
+
+static void start(processor * this) {	// launch the kernel thread backing this processor
+	__cfaabi_dbg_print_safe("Kernel : Starting core %p\n", this);
+
+	pthread_create( &this->kernel_thread, NULL, CtxInvokeProcessor, (void*)this );	// NOTE(review): return value is not checked
+
+	__cfaabi_dbg_print_safe("Kernel : core %p started\n", this);	// printed unconditionally, even if pthread_create failed
+}
+
+// KERNEL_ONLY: first switch from the current coroutine into this processor's runner coroutine
+void kernel_first_resume(processor * this) {
+	coroutine_desc * src = kernelTLS.this_coroutine;
+	coroutine_desc * dst = get_coroutine(this->runner);
+
+	verify( ! kernelTLS.preemption_state.enabled );
+
+	create_stack(&dst->stack, dst->stack.size);	// presumably allocates a stack for the runner coroutine -- TODO confirm create_stack
+	CtxStart(&this->runner, CtxInvokeCoroutine);	// presumably primes dst so the first switch enters CtxInvokeCoroutine -- TODO confirm CtxStart
+
+	verify( ! kernelTLS.preemption_state.enabled );
+
+	dst->last = src;
+	dst->starter = dst->starter ? dst->starter : src;	// keep an existing starter, otherwise record src
+
+	// set state of current coroutine to inactive
+	src->state = src->state == Halted ? Halted : Inactive;
+
+	// set new coroutine that task is executing
+	kernelTLS.this_coroutine = dst;
+
+	// SKULLDUGGERY: normally interrupts are enabled before leaving a coroutine ctxswitch.
+	// Therefore, when first creating a coroutine, interrupts are enabled before calling the main.
+	// This is consistent with thread creation. However, when creating the main processor coroutine,
+	// we want interrupts to be disabled. Therefore, we double-disable interrupts here so they will
+	// stay disabled.
+	disable_interrupts();
+
+	// context switch to specified coroutine
+	assert( src->stack.context );
+	CtxSwitch( src->stack.context, dst->stack.context );
+	// when CtxSwitch returns we are back in the src coroutine
+
+	// set state of the resumed (src) coroutine back to active
+	src->state = Active;
+
+	verify( ! kernelTLS.preemption_state.enabled );
+}
+
+//-----------------------------------------------------------------------------
+// Scheduler routines
+
+// KERNEL ONLY: append thrd to its cluster's ready queue and wake an idle processor if there is one
+void ScheduleThread( thread_desc * thrd ) {
+	verify( thrd );
+	verify( thrd->self_cor.state != Halted );
+
+	verify( ! kernelTLS.preemption_state.enabled );
+
+	verifyf( thrd->next == NULL, "Expected null got %p", thrd->next );	// a non-NULL next is rejected before enqueueing
+
+	with( *thrd->curr_cluster ) {
+		lock  ( ready_queue_lock __cfaabi_dbg_ctx2 );
+		bool was_empty = !(ready_queue != 0);	// presumably true when the queue held no thread before this append -- TODO confirm operator
+		append( ready_queue, thrd );
+		unlock( ready_queue_lock );
+
+		if(was_empty) {
+			lock      (proc_list_lock __cfaabi_dbg_ctx2);
+			if(idles) {
+				wake_fast(idles.head);	// wake the processor at the head of the idle list
+			}
+			unlock    (proc_list_lock);
+		}
+		else if( struct processor * idle = idles.head ) {	// NOTE(review): idles.head is read without proc_list_lock here, unlike the branch above -- confirm intentional
+			wake_fast(idle);
+		}
+
+	}
+
+	verify( ! kernelTLS.preemption_state.enabled );
+}
+
+// KERNEL ONLY: pops the head of this cluster's ready queue under ready_queue_lock and returns whatever pop_head yields
+thread_desc * nextThread(cluster * this) with( *this ) {
+	verify( ! kernelTLS.preemption_state.enabled );
+	lock( ready_queue_lock __cfaabi_dbg_ctx2 );
+	thread_desc * head = pop_head( ready_queue );
+	unlock( ready_queue_lock );
+	verify( ! kernelTLS.preemption_state.enabled );
+	return head;
+}
+
+void BlockInternal() {	// no finish action is recorded: disable interrupts, call returnToKernel(), re-enable interrupts once it returns
+	disable_interrupts();
+	verify( ! kernelTLS.preemption_state.enabled );
+	returnToKernel();
+	verify( ! kernelTLS.preemption_state.enabled );
+	enable_interrupts( __cfaabi_dbg_ctx );
+}
+
+void BlockInternal( __spinlock_t * lock ) {	// records a Release finish action for lock on the current processor, then calls returnToKernel()
+	disable_interrupts();
+	with( *kernelTLS.this_processor ) {
+		finish.action_code = Release;
+		finish.lock        = lock;
+	}
+
+	verify( ! kernelTLS.preemption_state.enabled );
+	returnToKernel();
+	verify( ! kernelTLS.preemption_state.enabled );
+
+	enable_interrupts( __cfaabi_dbg_ctx );
+}
+
+void BlockInternal( thread_desc * thrd ) {	// records a Schedule finish action for thrd on the current processor, then calls returnToKernel()
+	disable_interrupts();
+	with( * kernelTLS.this_processor ) {
+		finish.action_code = Schedule;
+		finish.thrd        = thrd;
+	}
+
+	verify( ! kernelTLS.preemption_state.enabled );
+	returnToKernel();
+	verify( ! kernelTLS.preemption_state.enabled );
+
+	enable_interrupts( __cfaabi_dbg_ctx );
+}
+
+void BlockInternal( __spinlock_t * lock, thread_desc * thrd ) {	// records a Release_Schedule finish action (lock + thrd); thrd must be non-null
+	assert(thrd);
+	disable_interrupts();
+	with( * kernelTLS.this_processor ) {
+		finish.action_code = Release_Schedule;
+		finish.lock        = lock;
+		finish.thrd        = thrd;
+	}
+
+	verify( ! kernelTLS.preemption_state.enabled );
+	returnToKernel();
+	verify( ! kernelTLS.preemption_state.enabled );
+
+	enable_interrupts( __cfaabi_dbg_ctx );
+}
+
+void BlockInternal(__spinlock_t * locks [], unsigned short count) {	// records a Release_Multi finish action over count locks, then calls returnToKernel()
+	disable_interrupts();
+	with( * kernelTLS.this_processor ) {
+		finish.action_code = Release_Multi;
+		finish.locks       = locks;
+		finish.lock_count  = count;
+	}
+
+	verify( ! kernelTLS.preemption_state.enabled );
+	returnToKernel();
+	verify( ! kernelTLS.preemption_state.enabled );
+
+	enable_interrupts( __cfaabi_dbg_ctx );
+}
+
+void BlockInternal(__spinlock_t * locks [], unsigned short lock_count, thread_desc * thrds [], unsigned short thrd_count) {	// records a Release_Multi_Schedule finish action: lock_count locks plus thrd_count threads
+	disable_interrupts();
+	with( *kernelTLS.this_processor ) {
+		finish.action_code = Release_Multi_Schedule;
+		finish.locks       = locks;
+		finish.lock_count  = lock_count;
+		finish.thrds       = thrds;
+		finish.thrd_count  = thrd_count;
+	}
+
+	verify( ! kernelTLS.preemption_state.enabled );
+	returnToKernel();
+	verify( ! kernelTLS.preemption_state.enabled );
+
+	enable_interrupts( __cfaabi_dbg_ctx );
+}
+
+void BlockInternal(__finish_callback_fptr_t callback) {	// records a Callback finish action holding callback, then calls returnToKernel()
+	disable_interrupts();
+	with( *kernelTLS.this_processor ) {
+		finish.action_code = Callback;
+		finish.callback    = callback;
+	}
+
+	verify( ! kernelTLS.preemption_state.enabled );
+	returnToKernel();
+	verify( ! kernelTLS.preemption_state.enabled );
+
+	enable_interrupts( __cfaabi_dbg_ctx );
+}
+
+// KERNEL ONLY: like BlockInternal( lock, thrd ) but accepts a null thrd (plain Release) and expects interrupts to be disabled already; it never re-enables them
+void LeaveThread(__spinlock_t * lock, thread_desc * thrd) {
+	verify( ! kernelTLS.preemption_state.enabled );
+	with( * kernelTLS.this_processor ) {
+		finish.action_code = thrd ? Release_Schedule : Release;
+		finish.lock        = lock;
+		finish.thrd        = thrd;
+	}
+
+	returnToKernel();
+}
+
+//=============================================================================================
+// Kernel Setup logic
+//=============================================================================================
+//-----------------------------------------------------------------------------
+// Kernel boot procedures
+static void kernel_startup(void) {	// builds the main cluster, main thread and main processor in static storage, starts preemption, then enables interrupts
+	verify( ! kernelTLS.preemption_state.enabled );
+	__cfaabi_dbg_print_safe("Kernel : Starting\n");
+
+	__cfa_dbg_global_clusters.list{ __get };
+	__cfa_dbg_global_clusters.lock{};
+
+	// Initialize the main cluster
+	mainCluster = (cluster *)&storage_mainCluster;
+	(*mainCluster){"Main Cluster"};
+
+	__cfaabi_dbg_print_safe("Kernel : Main cluster ready\n");
+
+	// Start by initializing the main thread
+	// SKULLDUGGERY: the mainThread steals the process main thread
+	// which will then be scheduled by the mainProcessor normally
+	mainThread = (thread_desc *)&storage_mainThread;
+	current_stack_info_t info;
+	(*mainThread){ &info };
+
+	__cfaabi_dbg_print_safe("Kernel : Main thread ready\n");
+
+
+
+	// Construct the processor context of the main processor
+	void ?{}(processorCtx_t & this, processor * proc) {
+		(this.__cor){ "Processor" };
+		this.__cor.starter = NULL;
+		this.proc = proc;
+	}
+
+	void ?{}(processor & this) with( this ) {	// nested no-argument constructor, used below for mainProcessor
+		name = "Main Processor";
+		cltr = mainCluster;
+		terminated{ 0 };
+		do_terminate = false;
+		preemption_alarm = NULL;
+		pending_preemption = false;
+		kernel_thread = pthread_self();	// the main processor uses the pthread that is executing kernel_startup
+
+		runner{ &this };
+		__cfaabi_dbg_print_safe("Kernel : constructed main processor context %p\n", &runner);
+	}
+
+	// Initialize the main processor and the main processor ctx
+	// (the coroutine that contains the processing control flow)
+	mainProcessor = (processor *)&storage_mainProcessor;
+	(*mainProcessor){};
+
+	// Initialize the global (thread-local) state variables
+	kernelTLS.this_processor = mainProcessor;
+	kernelTLS.this_thread    = mainThread;
+	kernelTLS.this_coroutine = &mainThread->self_cor;
+
+	// Enable preemption
+	kernel_start_preemption();
+
+	// Add the main thread to the ready queue
+	// once resume is called on mainProcessor->runner the mainThread needs to be scheduled like any normal thread
+	ScheduleThread(mainThread);
+
+	// SKULLDUGGERY: Force a context switch to the main processor to set the main thread's context to the current UNIX
+	// context. Hence, the main thread does not begin through CtxInvokeThread, like all other threads. The trick here is that
+	// mainThread is on the ready queue when this call is made.
+	kernel_first_resume( kernelTLS.this_processor );
+
+
+
+	// THE SYSTEM IS NOW COMPLETELY RUNNING
+	__cfaabi_dbg_print_safe("Kernel : Started\n--------------------------------------------------\n\n");
+
+	verify( ! kernelTLS.preemption_state.enabled );
+	enable_interrupts( __cfaabi_dbg_ctx );
+	verify( TL_GET( preemption_state.enabled ) );
+}
+
+static void kernel_shutdown(void) {	// stops the main processor, stops preemption, then destroys what kernel_startup constructed
+	__cfaabi_dbg_print_safe("\n--------------------------------------------------\nKernel : Shutting down\n");
+
+	verify( TL_GET( preemption_state.enabled ) );
+	disable_interrupts();
+	verify( ! kernelTLS.preemption_state.enabled );
+
+	// SKULLDUGGERY: Notify the mainProcessor that it needs to terminate.
+	// When its coroutine terminates, it returns control to the mainThread
+	// which is currently here
+	__atomic_store_n(&mainProcessor->do_terminate, true, __ATOMIC_RELEASE);
+	returnToKernel();
+	mainThread->self_cor.state = Halted;
+
+	// THE SYSTEM IS NOW COMPLETELY STOPPED
+
+	// Disable preemption
+	kernel_stop_preemption();
+
+	// Destroy the main processor and its context in reverse order of construction
+	// These were manually constructed so we need to manually destroy them
+	^(mainProcessor->runner){};
+	^(mainProcessor){};
+
+	// Final step, destroy the main thread since it is no longer needed
+	// Since we provided a stack to this task it will not destroy anything
+	^(mainThread){};
+
+	^(__cfa_dbg_global_clusters.list){};
+	^(__cfa_dbg_global_clusters.lock){};
+
+	__cfaabi_dbg_print_safe("Kernel : Shutdown complete\n");
+}
+
+//=============================================================================================
+// Kernel Quiescing
+//=============================================================================================
+static void halt(processor * this) with( *this ) {	// moves this processor from its cluster's procs list to idles, waits on idleLock, then moves it back
+	// verify( ! __atomic_load_n(&do_terminate, __ATOMIC_SEQ_CST) );
+
+	with( *cltr ) {
+		lock      (proc_list_lock __cfaabi_dbg_ctx2);
+		remove    (procs, *this);
+		push_front(idles, *this);
+		unlock    (proc_list_lock);
+	}
+
+	__cfaabi_dbg_print_safe("Kernel : Processor %p ready to sleep\n", this);
+
+	wait( idleLock );
+
+	__cfaabi_dbg_print_safe("Kernel : Processor %p woke up and ready to run\n", this);
+
+	with( *cltr ) {
+		lock      (proc_list_lock __cfaabi_dbg_ctx2);
+		remove    (idles, *this);
+		push_front(procs, *this);
+		unlock    (proc_list_lock);
+	}
+}
+
+//=============================================================================================
+// Unexpected Terminating logic
+//=============================================================================================
+static __spinlock_t kernel_abort_lock;
+static bool kernel_abort_called = false;	// set once by the first caller of kernel_abort, under kernel_abort_lock
+
+void * kernel_abort(void) __attribute__ ((__nothrow__)) {
+	// abort cannot be recursively entered by the same or different processors because all signal handlers return when
+	// the globalAbort flag is true.
+	lock( kernel_abort_lock __cfaabi_dbg_ctx2 );
+
+	// first task to abort ?
+	if ( kernel_abort_called ) {			// not first task to abort ?
+		unlock( kernel_abort_lock );
+
+		sigset_t mask;
+		sigemptyset( &mask );
+		sigaddset( &mask, SIGALRM );		// block SIGALRM signals
+		sigsuspend( &mask );			// block the processor to prevent further damage during abort
+		_exit( EXIT_FAILURE );			// if processor unblocks before it is killed, terminate it
+	}
+	else {
+		kernel_abort_called = true;
+		unlock( kernel_abort_lock );
+	}
+
+	return kernelTLS.this_thread;	// only the first aborting caller gets here; presumably passed on to kernel_abort_msg as kernel_data — confirm with caller
+}
+
+void kernel_abort_msg( void * kernel_data, char * abort_text, int abort_text_size ) {	// formats into abort_text and writes which thread/coroutine was running; kernel_data is treated as a thread_desc * (may be null)
+	thread_desc * thrd = kernel_data;
+
+	if(thrd) {
+		int len = snprintf( abort_text, abort_text_size, "Error occurred while executing thread %.256s (%p)", thrd->self_cor.name, thrd );	// NOTE(review): snprintf returns the untruncated length, so len can exceed abort_text_size for a small buffer — confirm callers size it generously
+		__cfaabi_dbg_bits_write( abort_text, len );
+
+		if ( get_coroutine(thrd) != kernelTLS.this_coroutine ) {	// running inside a coroutine other than the thread's own: name it as well
+			len = snprintf( abort_text, abort_text_size, " in coroutine %.256s (%p).\n", kernelTLS.this_coroutine->name, kernelTLS.this_coroutine );
+			__cfaabi_dbg_bits_write( abort_text, len );
+		}
+		else {
+			__cfaabi_dbg_bits_write( ".\n", 2 );
+		}
+	}
+	else {
+		int len = snprintf( abort_text, abort_text_size, "Error occurred outside of any thread.\n" );
+		__cfaabi_dbg_bits_write( abort_text, len );
+	}
+}
+
+int kernel_abort_lastframe( void ) __attribute__ ((__nothrow__)) {	// 4 when the current thread is mainThread, otherwise 2; presumably a backtrace frame count — confirm with caller
+	return get_coroutine(kernelTLS.this_thread) == get_coroutine(mainThread) ? 4 : 2;
+}
+
+static __spinlock_t kernel_debug_lock;	// held between __cfaabi_dbg_bits_acquire() and __cfaabi_dbg_bits_release()
+
+extern "C" {
+	void __cfaabi_dbg_bits_acquire() {
+		lock( kernel_debug_lock __cfaabi_dbg_ctx2 );
+	}
+
+	void __cfaabi_dbg_bits_release() {
+		unlock( kernel_debug_lock );
+	}
+}
+
+//=============================================================================================
+// Kernel Utilities
+//=============================================================================================
+//-----------------------------------------------------------------------------
+// Locks
+void  ?{}( semaphore & this, int count = 1 ) {	// constructs the lock and waiting list and sets the initial count (default 1)
+	(this.lock){};
+	this.count = count;
+	(this.waiting){};
+}
+void ^?{}(semaphore & this) {}	// destructor body is intentionally empty
+
+void P(semaphore & this) with( this ){	// decrements count; if it becomes negative the calling thread is queued on waiting and blocks
+	lock( lock __cfaabi_dbg_ctx2 );
+	count -= 1;
+	if ( count < 0 ) {
+		// queue current task
+		append( waiting, kernelTLS.this_thread );
+
+		// atomically release spin lock and block
+		BlockInternal( &lock );
+	}
+	else {
+	    unlock( lock );
+	}
+}
+
+void V(semaphore & this) with( this ) {	// increments count; if it is still <= 0 a waiter exists, so pop one and wake it after unlocking
+	thread_desc * thrd = NULL;
+	lock( lock __cfaabi_dbg_ctx2 );
+	count += 1;
+	if ( count <= 0 ) {
+		// remove task at head of waiting list
+		thrd = pop_head( waiting );
+	}
+
+	unlock( lock );
+
+	// make new owner (thrd is still NULL here when no waiter was popped)
+	WakeThread( thrd );
+}
+
+//-----------------------------------------------------------------------------
+// Global Queues
+void doregister( cluster     & cltr ) {	// pushes cltr onto the global debug cluster list under its lock
+	lock      ( __cfa_dbg_global_clusters.lock __cfaabi_dbg_ctx2);
+	push_front( __cfa_dbg_global_clusters.list, cltr );
+	unlock    ( __cfa_dbg_global_clusters.lock );
+}
+
+void unregister( cluster     & cltr ) {	// removes cltr from the global debug cluster list under its lock
+	lock  ( __cfa_dbg_global_clusters.lock __cfaabi_dbg_ctx2);
+	remove( __cfa_dbg_global_clusters.list, cltr );
+	unlock( __cfa_dbg_global_clusters.lock );
+}
+
+void doregister( cluster * cltr, thread_desc & thrd ) {	// pushes thrd onto cltr->threads under thread_list_lock
+	lock      (cltr->thread_list_lock __cfaabi_dbg_ctx2);
+	push_front(cltr->threads, thrd);
+	unlock    (cltr->thread_list_lock);
+}
+
+void unregister( cluster * cltr, thread_desc & thrd ) {	// removes thrd from cltr->threads under thread_list_lock
+	lock  (cltr->thread_list_lock __cfaabi_dbg_ctx2);
+	remove(cltr->threads, thrd );
+	unlock(cltr->thread_list_lock);
+}
+
+void doregister( cluster * cltr, processor * proc ) {	// pushes *proc onto cltr->procs under proc_list_lock
+	lock      (cltr->proc_list_lock __cfaabi_dbg_ctx2);
+	push_front(cltr->procs, *proc);
+	unlock    (cltr->proc_list_lock);
+}
+
+void unregister( cluster * cltr, processor * proc ) {	// removes *proc from cltr->procs under proc_list_lock
+	lock  (cltr->proc_list_lock __cfaabi_dbg_ctx2);
+	remove(cltr->procs, *proc );
+	unlock(cltr->proc_list_lock);
+}
+
+//-----------------------------------------------------------------------------
+// Debug: __cfaabi_dbg_record stores the given name and the current thread on a spinlock (wrapped in __cfaabi_dbg_debug_do, so presumably debug builds only)
+__cfaabi_dbg_debug_do(
+	void __cfaabi_dbg_record(__spinlock_t & this, const char * prev_name) {
+		this.prev_name = prev_name;
+		this.prev_thrd = kernelTLS.this_thread;
+	}
+)
+// Local Variables: //
+// mode: c //
+// tab-width: 4 //
+// End: //
Index: libcfa/src/concurrency/monitor.c
===================================================================
--- libcfa/src/concurrency/monitor.c	(revision bf71cfdb7285490eee552b461158846f626cc52f)
+++ 	(revision )
@@ -1,900 +1,0 @@
-//
-// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
-//
-// The contents of this file are covered under the licence agreement in the
-// file "LICENCE" distributed with Cforall.
-//
-// monitor_desc.c --
-//
-// Author           : Thierry Delisle
-// Created On       : Thd Feb 23 12:27:26 2017
-// Last Modified By : Peter A. Buhr
-// Last Modified On : Fri Mar 30 14:30:26 2018
-// Update Count     : 9
-//
-
-#include "monitor"
-
-#include <stdlib>
-#include <inttypes.h>
-
-#include "kernel_private.h"
-
-#include "bits/algorithms.h"
-
-//-----------------------------------------------------------------------------
-// Forward declarations
-static inline void set_owner ( monitor_desc * this, thread_desc * owner );
-static inline void set_owner ( monitor_desc * storage [], __lock_size_t count, thread_desc * owner );
-static inline void set_mask  ( monitor_desc * storage [], __lock_size_t count, const __waitfor_mask_t & mask );
-static inline void reset_mask( monitor_desc * this );
-
-static inline thread_desc * next_thread( monitor_desc * this );
-static inline bool is_accepted( monitor_desc * this, const __monitor_group_t & monitors );
-
-static inline void lock_all  ( __spinlock_t * locks [], __lock_size_t count );
-static inline void lock_all  ( monitor_desc * source [], __spinlock_t * /*out*/ locks [], __lock_size_t count );
-static inline void unlock_all( __spinlock_t * locks [], __lock_size_t count );
-static inline void unlock_all( monitor_desc * locks [], __lock_size_t count );
-
-static inline void save   ( monitor_desc * ctx [], __lock_size_t count, __spinlock_t * locks [], unsigned int /*out*/ recursions [], __waitfor_mask_t /*out*/ masks [] );
-static inline void restore( monitor_desc * ctx [], __lock_size_t count, __spinlock_t * locks [], unsigned int /*in */ recursions [], __waitfor_mask_t /*in */ masks [] );
-
-static inline void init     ( __lock_size_t count, monitor_desc * monitors [], __condition_node_t & waiter, __condition_criterion_t criteria [] );
-static inline void init_push( __lock_size_t count, monitor_desc * monitors [], __condition_node_t & waiter, __condition_criterion_t criteria [] );
-
-static inline thread_desc *        check_condition   ( __condition_criterion_t * );
-static inline void                 brand_condition   ( condition & );
-static inline [thread_desc *, int] search_entry_queue( const __waitfor_mask_t &, monitor_desc * monitors [], __lock_size_t count );
-
-forall(dtype T | sized( T ))
-static inline __lock_size_t insert_unique( T * array [], __lock_size_t & size, T * val );
-static inline __lock_size_t count_max    ( const __waitfor_mask_t & mask );
-static inline __lock_size_t aggregate    ( monitor_desc * storage [], const __waitfor_mask_t & mask );
-
-//-----------------------------------------------------------------------------
-// Useful defines
-#define wait_ctx(thrd, user_info)                               /* Create the necessary information to use the signaller stack                         */ \
-	__condition_node_t waiter = { thrd, count, user_info };   /* Create the node specific to this wait operation                                     */ \
-	__condition_criterion_t criteria[count];                  /* Create the creteria this wait operation needs to wake up                            */ \
-	init( count, monitors, waiter, criteria );                /* Link everything together                                                            */ \
-
-#define wait_ctx_primed(thrd, user_info)                        /* Create the necessary information to use the signaller stack                         */ \
-	__condition_node_t waiter = { thrd, count, user_info };   /* Create the node specific to this wait operation                                     */ \
-	__condition_criterion_t criteria[count];                  /* Create the creteria this wait operation needs to wake up                            */ \
-	init_push( count, monitors, waiter, criteria );           /* Link everything together and push it to the AS-Stack                                */ \
-
-#define monitor_ctx( mons, cnt )                                /* Define that create the necessary struct for internal/external scheduling operations */ \
-	monitor_desc ** monitors = mons;                          /* Save the targeted monitors                                                          */ \
-	__lock_size_t count = cnt;                                /* Save the count to a local variable                                                  */ \
-	unsigned int recursions[ count ];                         /* Save the current recursion levels to restore them later                             */ \
-	__waitfor_mask_t masks [ count ];                         /* Save the current waitfor masks to restore them later                                */ \
-	__spinlock_t *   locks [ count ];                         /* We need to pass-in an array of locks to BlockInternal                               */ \
-
-#define monitor_save    save   ( monitors, count, locks, recursions, masks )
-#define monitor_restore restore( monitors, count, locks, recursions, masks )
-
-
-//-----------------------------------------------------------------------------
-// Enter/Leave routines
-
-
-extern "C" {
-	// Enter single monitor
-	static void __enter_monitor_desc( monitor_desc * this, const __monitor_group_t & group ) {
-		// Lock the monitor spinlock
-		lock( this->lock __cfaabi_dbg_ctx2 );
-		// Interrupts disable inside critical section
-		thread_desc * thrd = kernelTLS.this_thread;
-
-		__cfaabi_dbg_print_safe( "Kernel : %10p Entering mon %p (%p)\n", thrd, this, this->owner);
-
-		if( !this->owner ) {
-			// No one has the monitor, just take it
-			set_owner( this, thrd );
-
-			__cfaabi_dbg_print_safe( "Kernel :  mon is free \n" );
-		}
-		else if( this->owner == thrd) {
-			// We already have the monitor, just note how many times we took it
-			this->recursion += 1;
-
-			__cfaabi_dbg_print_safe( "Kernel :  mon already owned \n" );
-		}
-		else if( is_accepted( this, group) ) {
-			// Some one was waiting for us, enter
-			set_owner( this, thrd );
-
-			// Reset mask
-			reset_mask( this );
-
-			__cfaabi_dbg_print_safe( "Kernel :  mon accepts \n" );
-		}
-		else {
-			__cfaabi_dbg_print_safe( "Kernel :  blocking \n" );
-
-			// Some one else has the monitor, wait in line for it
-			append( this->entry_queue, thrd );
-
-			BlockInternal( &this->lock );
-
-			__cfaabi_dbg_print_safe( "Kernel : %10p Entered  mon %p\n", thrd, this);
-
-			// BlockInternal will unlock spinlock, no need to unlock ourselves
-			return;
-		}
-
-		__cfaabi_dbg_print_safe( "Kernel : %10p Entered  mon %p\n", thrd, this);
-
-		// Release the lock and leave
-		unlock( this->lock );
-		return;
-	}
-
-	static void __enter_monitor_dtor( monitor_desc * this, fptr_t func ) {
-		// Lock the monitor spinlock
-		lock( this->lock __cfaabi_dbg_ctx2 );
-		// Interrupts disable inside critical section
-		thread_desc * thrd = kernelTLS.this_thread;
-
-		__cfaabi_dbg_print_safe( "Kernel : %10p Entering dtor for mon %p (%p)\n", thrd, this, this->owner);
-
-
-		if( !this->owner ) {
-			__cfaabi_dbg_print_safe( "Kernel : Destroying free mon %p\n", this);
-
-			// No one has the monitor, just take it
-			set_owner( this, thrd );
-
-			unlock( this->lock );
-			return;
-		}
-		else if( this->owner == thrd) {
-			// We already have the monitor... but where about to destroy it so the nesting will fail
-			// Abort!
-			abort( "Attempt to destroy monitor %p by thread \"%.256s\" (%p) in nested mutex.", this, thrd->self_cor.name, thrd );
-		}
-
-		__lock_size_t count = 1;
-		monitor_desc ** monitors = &this;
-		__monitor_group_t group = { &this, 1, func };
-		if( is_accepted( this, group) ) {
-			__cfaabi_dbg_print_safe( "Kernel :  mon accepts dtor, block and signal it \n" );
-
-			// Wake the thread that is waiting for this
-			__condition_criterion_t * urgent = pop( this->signal_stack );
-			verify( urgent );
-
-			// Reset mask
-			reset_mask( this );
-
-			// Create the node specific to this wait operation
-			wait_ctx_primed( thrd, 0 )
-
-			// Some one else has the monitor, wait for him to finish and then run
-			BlockInternal( &this->lock, urgent->owner->waiting_thread );
-
-			// Some one was waiting for us, enter
-			set_owner( this, thrd );
-		}
-		else {
-			__cfaabi_dbg_print_safe( "Kernel :  blocking \n" );
-
-			wait_ctx( thrd, 0 )
-			this->dtor_node = &waiter;
-
-			// Some one else has the monitor, wait in line for it
-			append( this->entry_queue, thrd );
-			BlockInternal( &this->lock );
-
-			// BlockInternal will unlock spinlock, no need to unlock ourselves
-			return;
-		}
-
-		__cfaabi_dbg_print_safe( "Kernel : Destroying %p\n", this);
-
-	}
-
-	// Leave single monitor
-	void __leave_monitor_desc( monitor_desc * this ) {
-		// Lock the monitor spinlock
-		lock( this->lock __cfaabi_dbg_ctx2 );
-
-		__cfaabi_dbg_print_safe( "Kernel : %10p Leaving mon %p (%p)\n", kernelTLS.this_thread, this, this->owner);
-
-		verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
-
-		// Leaving a recursion level, decrement the counter
-		this->recursion -= 1;
-
-		// If we haven't left the last level of recursion
-		// it means we don't need to do anything
-		if( this->recursion != 0) {
-			__cfaabi_dbg_print_safe( "Kernel :  recursion still %d\n", this->recursion);
-			unlock( this->lock );
-			return;
-		}
-
-		// Get the next thread, will be null on low contention monitor
-		thread_desc * new_owner = next_thread( this );
-
-		// We can now let other threads in safely
-		unlock( this->lock );
-
-		//We need to wake-up the thread
-		WakeThread( new_owner );
-	}
-
-	// Leave single monitor for the last time
-	void __leave_dtor_monitor_desc( monitor_desc * this ) {
-		__cfaabi_dbg_debug_do(
-			if( TL_GET( this_thread ) != this->owner ) {
-				abort( "Destroyed monitor %p has inconsistent owner, expected %p got %p.\n", this, TL_GET( this_thread ), this->owner);
-			}
-			if( this->recursion != 1 ) {
-				abort( "Destroyed monitor %p has %d outstanding nested calls.\n", this, this->recursion - 1);
-			}
-		)
-	}
-
-	// Leave the thread monitor
-	// last routine called by a thread.
-	// Should never return
-	void __leave_thread_monitor( thread_desc * thrd ) {
-		monitor_desc * this = &thrd->self_mon;
-
-		// Lock the monitor now
-		lock( this->lock __cfaabi_dbg_ctx2 );
-
-		disable_interrupts();
-
-		thrd->self_cor.state = Halted;
-
-		verifyf( thrd == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", thrd, this->owner, this->recursion, this );
-
-		// Leaving a recursion level, decrement the counter
-		this->recursion -= 1;
-
-		// If we haven't left the last level of recursion
-		// it must mean there is an error
-		if( this->recursion != 0) { abort( "Thread internal monitor has unbalanced recursion" ); }
-
-		// Fetch the next thread, can be null
-		thread_desc * new_owner = next_thread( this );
-
-		// Leave the thread, this will unlock the spinlock
-		// Use leave thread instead of BlockInternal which is
-		// specialized for this case and supports null new_owner
-		LeaveThread( &this->lock, new_owner );
-
-		// Control flow should never reach here!
-	}
-}
-
-// Enter multiple monitor
-// relies on the monitor array being sorted
-static inline void enter( __monitor_group_t monitors ) {
-	for( __lock_size_t i = 0; i < monitors.size; i++) {
-		__enter_monitor_desc( monitors[i], monitors );
-	}
-}
-
-// Leave multiple monitor
-// relies on the monitor array being sorted
-static inline void leave(monitor_desc * monitors [], __lock_size_t count) {
-	for( __lock_size_t i = count - 1; i >= 0; i--) {
-		__leave_monitor_desc( monitors[i] );
-	}
-}
-
-// Ctor for monitor guard
-// Sorts monitors before entering
-void ?{}( monitor_guard_t & this, monitor_desc * m [], __lock_size_t count, fptr_t func ) {
-	thread_desc * thrd = TL_GET( this_thread );
-
-	// Store current array
-	this.m = m;
-	this.count = count;
-
-	// Sort monitors based on address
-	__libcfa_small_sort(this.m, count);
-
-	// Save previous thread context
-	this.prev = thrd->monitors;
-
-	// Update thread context (needed for conditions)
-	(thrd->monitors){m, count, func};
-
-	// __cfaabi_dbg_print_safe( "MGUARD : enter %d\n", count);
-
-	// Enter the monitors in order
-	__monitor_group_t group = {this.m, this.count, func};
-	enter( group );
-
-	// __cfaabi_dbg_print_safe( "MGUARD : entered\n" );
-}
-
-
-// Dtor for monitor guard
-void ^?{}( monitor_guard_t & this ) {
-	// __cfaabi_dbg_print_safe( "MGUARD : leaving %d\n", this.count);
-
-	// Leave the monitors in order
-	leave( this.m, this.count );
-
-	// __cfaabi_dbg_print_safe( "MGUARD : left\n" );
-
-	// Restore thread context
-	TL_GET( this_thread )->monitors = this.prev;
-}
-
-// Ctor for monitor guard
-// Sorts monitors before entering
-void ?{}( monitor_dtor_guard_t & this, monitor_desc * m [], fptr_t func ) {
-	// optimization
-	thread_desc * thrd = TL_GET( this_thread );
-
-	// Store current array
-	this.m = *m;
-
-	// Save previous thread context
-	this.prev = thrd->monitors;
-
-	// Update thread context (needed for conditions)
-	(thrd->monitors){m, 1, func};
-
-	__enter_monitor_dtor( this.m, func );
-}
-
-// Dtor for monitor guard
-void ^?{}( monitor_dtor_guard_t & this ) {
-	// Leave the monitors in order
-	__leave_dtor_monitor_desc( this.m );
-
-	// Restore thread context
-	TL_GET( this_thread )->monitors = this.prev;
-}
-
-//-----------------------------------------------------------------------------
-// Internal scheduling types
-void ?{}(__condition_node_t & this, thread_desc * waiting_thread, __lock_size_t count, uintptr_t user_info ) {
-	this.waiting_thread = waiting_thread;
-	this.count = count;
-	this.next = NULL;
-	this.user_info = user_info;
-}
-
-void ?{}(__condition_criterion_t & this ) with( this ) {
-	ready  = false;
-	target = NULL;
-	owner  = NULL;
-	next   = NULL;
-}
-
-void ?{}(__condition_criterion_t & this, monitor_desc * target, __condition_node_t & owner ) {
-	this.ready  = false;
-	this.target = target;
-	this.owner  = &owner;
-	this.next   = NULL;
-}
-
-//-----------------------------------------------------------------------------
-// Internal scheduling
-void wait( condition & this, uintptr_t user_info = 0 ) {
-	brand_condition( this );
-
-	// Check that everything is as expected
-	assertf( this.monitors != NULL, "Waiting with no monitors (%p)", this.monitors );
-	verifyf( this.monitor_count != 0, "Waiting with 0 monitors (%"PRIiFAST16")", this.monitor_count );
-	verifyf( this.monitor_count < 32u, "Excessive monitor count (%"PRIiFAST16")", this.monitor_count );
-
-	// Create storage for monitor context
-	monitor_ctx( this.monitors, this.monitor_count );
-
-	// Create the node specific to this wait operation
-	wait_ctx( TL_GET( this_thread ), user_info );
-
-	// Append the current wait operation to the ones already queued on the condition
-	// We don't need locks for that since conditions must always be waited on inside monitor mutual exclusion
-	append( this.blocked, &waiter );
-
-	// Lock all monitors (aggregates the locks as well)
-	lock_all( monitors, locks, count );
-
-	// Find the next thread(s) to run
-	__lock_size_t thread_count = 0;
-	thread_desc * threads[ count ];
-	__builtin_memset( threads, 0, sizeof( threads ) );
-
-	// Save monitor states
-	monitor_save;
-
-	// Remove any duplicate threads
-	for( __lock_size_t i = 0; i < count; i++) {
-		thread_desc * new_owner = next_thread( monitors[i] );
-		insert_unique( threads, thread_count, new_owner );
-	}
-
-	// Everything is ready to go to sleep
-	BlockInternal( locks, count, threads, thread_count );
-
-	// We are back, restore the owners and recursions
-	monitor_restore;
-}
-
-bool signal( condition & this ) {
-	if( is_empty( this ) ) { return false; }
-
-	//Check that everything is as expected
-	verify( this.monitors );
-	verify( this.monitor_count != 0 );
-
-	//Some more checking in debug
-	__cfaabi_dbg_debug_do(
-		thread_desc * this_thrd = TL_GET( this_thread );
-		if ( this.monitor_count != this_thrd->monitors.size ) {
-			abort( "Signal on condition %p made with different number of monitor(s), expected %zi got %zi", &this, this.monitor_count, this_thrd->monitors.size );
-		}
-
-		for(int i = 0; i < this.monitor_count; i++) {
-			if ( this.monitors[i] != this_thrd->monitors[i] ) {
-				abort( "Signal on condition %p made with different monitor, expected %p got %p", &this, this.monitors[i], this_thrd->monitors[i] );
-			}
-		}
-	);
-
-	__lock_size_t count = this.monitor_count;
-
-	// Lock all monitors
-	lock_all( this.monitors, NULL, count );
-
-	//Pop the head of the waiting queue
-	__condition_node_t * node = pop_head( this.blocked );
-
-	//Add the thread to the proper AS stack
-	for(int i = 0; i < count; i++) {
-		__condition_criterion_t * crit = &node->criteria[i];
-		assert( !crit->ready );
-		push( crit->target->signal_stack, crit );
-	}
-
-	//Release
-	unlock_all( this.monitors, count );
-
-	return true;
-}
-
-bool signal_block( condition & this ) {
-	if( !this.blocked.head ) { return false; }
-
-	//Check that everything is as expected
-	verifyf( this.monitors != NULL, "Waiting with no monitors (%p)", this.monitors );
-	verifyf( this.monitor_count != 0, "Waiting with 0 monitors (%"PRIiFAST16")", this.monitor_count );
-
-	// Create storage for monitor context
-	monitor_ctx( this.monitors, this.monitor_count );
-
-	// Lock all monitors (aggregates the locks them as well)
-	lock_all( monitors, locks, count );
-
-
-	// Create the node specific to this wait operation
-	wait_ctx_primed( kernelTLS.this_thread, 0 )
-
-	//save contexts
-	monitor_save;
-
-	//Find the thread to run
-	thread_desc * signallee = pop_head( this.blocked )->waiting_thread;
-	set_owner( monitors, count, signallee );
-
-	__cfaabi_dbg_print_buffer_decl( "Kernel : signal_block condition %p (s: %p)\n", &this, signallee );
-
-	//Everything is ready to go to sleep
-	BlockInternal( locks, count, &signallee, 1 );
-
-
-	// WE WOKE UP
-
-
-	__cfaabi_dbg_print_buffer_local( "Kernel :   signal_block returned\n" );
-
-	//We are back, restore the masks and recursions
-	monitor_restore;
-
-	return true;
-}
-
-// Access the user_info of the thread waiting at the front of the queue
-uintptr_t front( condition & this ) {
-	verifyf( !is_empty(this),
-		"Attempt to access user data on an empty condition.\n"
-		"Possible cause is not checking if the condition is empty before reading stored data."
-	);
-	return ((typeof(this.blocked.head))this.blocked.head)->user_info;
-}
-
-//-----------------------------------------------------------------------------
-// External scheduling
-// cases to handle :
-// 	- target already there :
-// 		block and wake
-// 	- dtor already there
-// 		put thread on signaller stack
-// 	- non-blocking
-// 		return else
-// 	- timeout
-// 		return timeout
-// 	- block
-// 		setup mask
-// 		block
-void __waitfor_internal( const __waitfor_mask_t & mask, int duration ) {
-	// This statment doesn't have a contiguous list of monitors...
-	// Create one!
-	__lock_size_t max = count_max( mask );
-	monitor_desc * mon_storage[max];
-	__builtin_memset( mon_storage, 0, sizeof( mon_storage ) );
-	__lock_size_t actual_count = aggregate( mon_storage, mask );
-
-	__cfaabi_dbg_print_buffer_decl( "Kernel : waitfor %"PRIdFAST16" (s: %"PRIdFAST16", m: %"PRIdFAST16")\n", actual_count, mask.size, (__lock_size_t)max);
-
-	if(actual_count == 0) return;
-
-	__cfaabi_dbg_print_buffer_local( "Kernel : waitfor internal proceeding\n" );
-
-	// Create storage for monitor context
-	monitor_ctx( mon_storage, actual_count );
-
-	// Lock all monitors (aggregates the locks as well)
-	lock_all( monitors, locks, count );
-
-	{
-		// Check if the entry queue
-		thread_desc * next; int index;
-		[next, index] = search_entry_queue( mask, monitors, count );
-
-		if( next ) {
-			*mask.accepted = index;
-			__acceptable_t& accepted = mask[index];
-			if( accepted.is_dtor ) {
-				__cfaabi_dbg_print_buffer_local( "Kernel : dtor already there\n" );
-				verifyf( accepted.size == 1,  "ERROR: Accepted dtor has more than 1 mutex parameter." );
-
-				monitor_desc * mon2dtor = accepted[0];
-				verifyf( mon2dtor->dtor_node, "ERROR: Accepted monitor has no dtor_node." );
-
-				__condition_criterion_t * dtor_crit = mon2dtor->dtor_node->criteria;
-				push( mon2dtor->signal_stack, dtor_crit );
-
-				unlock_all( locks, count );
-			}
-			else {
-				__cfaabi_dbg_print_buffer_local( "Kernel : thread present, baton-passing\n" );
-
-				// Create the node specific to this wait operation
-				wait_ctx_primed( kernelTLS.this_thread, 0 );
-
-				// Save monitor states
-				monitor_save;
-
-				__cfaabi_dbg_print_buffer_local( "Kernel :  baton of %"PRIdFAST16" monitors : ", count );
-				#ifdef __CFA_DEBUG_PRINT__
-					for( int i = 0; i < count; i++) {
-						__cfaabi_dbg_print_buffer_local( "%p %p ", monitors[i], monitors[i]->signal_stack.top );
-					}
-				#endif
-				__cfaabi_dbg_print_buffer_local( "\n" );
-
-				// Set the owners to be the next thread
-				set_owner( monitors, count, next );
-
-				// Everything is ready to go to sleep
-				BlockInternal( locks, count, &next, 1 );
-
-				// We are back, restore the owners and recursions
-				monitor_restore;
-
-				__cfaabi_dbg_print_buffer_local( "Kernel : thread present, returned\n" );
-			}
-
-			__cfaabi_dbg_print_buffer_local( "Kernel : accepted %d\n", *mask.accepted);
-			return;
-		}
-	}
-
-
-	if( duration == 0 ) {
-		__cfaabi_dbg_print_buffer_local( "Kernel : non-blocking, exiting\n" );
-
-		unlock_all( locks, count );
-
-		__cfaabi_dbg_print_buffer_local( "Kernel : accepted %d\n", *mask.accepted);
-		return;
-	}
-
-
-	verifyf( duration < 0, "Timeout on waitfor statments not supported yet." );
-
-	__cfaabi_dbg_print_buffer_local( "Kernel : blocking waitfor\n" );
-
-	// Create the node specific to this wait operation
-	wait_ctx_primed( kernelTLS.this_thread, 0 );
-
-	monitor_save;
-	set_mask( monitors, count, mask );
-
-	for( __lock_size_t i = 0; i < count; i++) {
-		verify( monitors[i]->owner == kernelTLS.this_thread );
-	}
-
-	//Everything is ready to go to sleep
-	BlockInternal( locks, count );
-
-
-	// WE WOKE UP
-
-
-	//We are back, restore the masks and recursions
-	monitor_restore;
-
-	__cfaabi_dbg_print_buffer_local( "Kernel : exiting\n" );
-
-	__cfaabi_dbg_print_buffer_local( "Kernel : accepted %d\n", *mask.accepted);
-}
-
-//-----------------------------------------------------------------------------
-// Utilities
-
-static inline void set_owner( monitor_desc * this, thread_desc * owner ) {
-	// __cfaabi_dbg_print_safe( "Kernal :   Setting owner of %p to %p ( was %p)\n", this, owner, this->owner );
-
-	//Pass the monitor appropriately
-	this->owner = owner;
-
-	//We are passing the monitor to someone else, which means recursion level is not 0
-	this->recursion = owner ? 1 : 0;
-}
-
-static inline void set_owner( monitor_desc * monitors [], __lock_size_t count, thread_desc * owner ) {
-	monitors[0]->owner     = owner;
-	monitors[0]->recursion = 1;
-	for( __lock_size_t i = 1; i < count; i++ ) {
-		monitors[i]->owner     = owner;
-		monitors[i]->recursion = 0;
-	}
-}
-
-static inline void set_mask( monitor_desc * storage [], __lock_size_t count, const __waitfor_mask_t & mask ) {
-	for( __lock_size_t i = 0; i < count; i++) {
-		storage[i]->mask = mask;
-	}
-}
-
-static inline void reset_mask( monitor_desc * this ) {
-	this->mask.accepted = NULL;
-	this->mask.data = NULL;
-	this->mask.size = 0;
-}
-
-static inline thread_desc * next_thread( monitor_desc * this ) {
-	//Check the signaller stack
-	__cfaabi_dbg_print_safe( "Kernel :  mon %p AS-stack top %p\n", this, this->signal_stack.top);
-	__condition_criterion_t * urgent = pop( this->signal_stack );
-	if( urgent ) {
-		//The signaller stack is not empty,
-		//regardless of if we are ready to baton pass,
-		//we need to set the monitor as in use
-		set_owner( this,  urgent->owner->waiting_thread );
-
-		return check_condition( urgent );
-	}
-
-	// No signaller thread
-	// Get the next thread in the entry_queue
-	thread_desc * new_owner = pop_head( this->entry_queue );
-	set_owner( this, new_owner );
-
-	return new_owner;
-}
-
-static inline bool is_accepted( monitor_desc * this, const __monitor_group_t & group ) {
-	__acceptable_t * it = this->mask.data; // Optim
-	__lock_size_t count = this->mask.size;
-
-	// Check if there are any acceptable functions
-	if( !it ) return false;
-
-	// If this isn't the first monitor to test this, there is no reason to repeat the test.
-	if( this != group[0] ) return group[0]->mask.accepted >= 0;
-
-	// For all acceptable functions check if this is the current function.
-	for( __lock_size_t i = 0; i < count; i++, it++ ) {
-		if( *it == group ) {
-			*this->mask.accepted = i;
-			return true;
-		}
-	}
-
-	// No function matched
-	return false;
-}
-
-static inline void init( __lock_size_t count, monitor_desc * monitors [], __condition_node_t & waiter, __condition_criterion_t criteria [] ) {
-	for( __lock_size_t i = 0; i < count; i++) {
-		(criteria[i]){ monitors[i], waiter };
-	}
-
-	waiter.criteria = criteria;
-}
-
-static inline void init_push( __lock_size_t count, monitor_desc * monitors [], __condition_node_t & waiter, __condition_criterion_t criteria [] ) {
-	for( __lock_size_t i = 0; i < count; i++) {
-		(criteria[i]){ monitors[i], waiter };
-		__cfaabi_dbg_print_safe( "Kernel :  target %p = %p\n", criteria[i].target, &criteria[i] );
-		push( criteria[i].target->signal_stack, &criteria[i] );
-	}
-
-	waiter.criteria = criteria;
-}
-
-static inline void lock_all( __spinlock_t * locks [], __lock_size_t count ) {
-	for( __lock_size_t i = 0; i < count; i++ ) {
-		lock( *locks[i] __cfaabi_dbg_ctx2 );
-	}
-}
-
-static inline void lock_all( monitor_desc * source [], __spinlock_t * /*out*/ locks [], __lock_size_t count ) {
-	for( __lock_size_t i = 0; i < count; i++ ) {
-		__spinlock_t * l = &source[i]->lock;
-		lock( *l __cfaabi_dbg_ctx2 );
-		if(locks) locks[i] = l;
-	}
-}
-
-static inline void unlock_all( __spinlock_t * locks [], __lock_size_t count ) {
-	for( __lock_size_t i = 0; i < count; i++ ) {
-		unlock( *locks[i] );
-	}
-}
-
-static inline void unlock_all( monitor_desc * locks [], __lock_size_t count ) {
-	for( __lock_size_t i = 0; i < count; i++ ) {
-		unlock( locks[i]->lock );
-	}
-}
-
-static inline void save(
-	monitor_desc * ctx [],
-	__lock_size_t count,
-	__attribute((unused)) __spinlock_t * locks [],
-	unsigned int /*out*/ recursions [],
-	__waitfor_mask_t /*out*/ masks []
-) {
-	for( __lock_size_t i = 0; i < count; i++ ) {
-		recursions[i] = ctx[i]->recursion;
-		masks[i]      = ctx[i]->mask;
-	}
-}
-
-static inline void restore(
-	monitor_desc * ctx [],
-	__lock_size_t count,
-	__spinlock_t * locks [],
-	unsigned int /*out*/ recursions [],
-	__waitfor_mask_t /*out*/ masks []
-) {
-	lock_all( locks, count );
-	for( __lock_size_t i = 0; i < count; i++ ) {
-		ctx[i]->recursion = recursions[i];
-		ctx[i]->mask      = masks[i];
-	}
-	unlock_all( locks, count );
-}
-
-// Function has 2 different behavior
-// 1 - Marks a monitors as being ready to run
-// 2 - Checks if all the monitors are ready to run
-//     if so return the thread to run
-static inline thread_desc * check_condition( __condition_criterion_t * target ) {
-	__condition_node_t * node = target->owner;
-	unsigned short count = node->count;
-	__condition_criterion_t * criteria = node->criteria;
-
-	bool ready2run = true;
-
-	for(	int i = 0; i < count; i++ ) {
-
-		// __cfaabi_dbg_print_safe( "Checking %p for %p\n", &criteria[i], target );
-		if( &criteria[i] == target ) {
-			criteria[i].ready = true;
-			// __cfaabi_dbg_print_safe( "True\n" );
-		}
-
-		ready2run = criteria[i].ready && ready2run;
-	}
-
-	__cfaabi_dbg_print_safe( "Kernel :  Runing %i (%p)\n", ready2run, ready2run ? node->waiting_thread : NULL );
-	return ready2run ? node->waiting_thread : NULL;
-}
-
-static inline void brand_condition( condition & this ) {
-	thread_desc * thrd = TL_GET( this_thread );
-	if( !this.monitors ) {
-		// __cfaabi_dbg_print_safe( "Branding\n" );
-		assertf( thrd->monitors.data != NULL, "No current monitor to brand condition %p", thrd->monitors.data );
-		this.monitor_count = thrd->monitors.size;
-
-		this.monitors = (monitor_desc **)malloc( this.monitor_count * sizeof( *this.monitors ) );
-		for( int i = 0; i < this.monitor_count; i++ ) {
-			this.monitors[i] = thrd->monitors[i];
-		}
-	}
-}
-
-static inline [thread_desc *, int] search_entry_queue( const __waitfor_mask_t & mask, monitor_desc * monitors [], __lock_size_t count ) {
-
-	__queue_t(thread_desc) & entry_queue = monitors[0]->entry_queue;
-
-	// For each thread in the entry-queue
-	for(	thread_desc ** thrd_it = &entry_queue.head;
-		*thrd_it;
-		thrd_it = &(*thrd_it)->next
-	) {
-		// For each acceptable check if it matches
-		int i = 0;
-		__acceptable_t * end   = end  (mask);
-		__acceptable_t * begin = begin(mask);
-		for( __acceptable_t * it = begin; it != end; it++, i++ ) {
-			// Check if we have a match
-			if( *it == (*thrd_it)->monitors ) {
-
-				// If we have a match return it
-				// after removeing it from the entry queue
-				return [remove( entry_queue, thrd_it ), i];
-			}
-		}
-	}
-
-	return [0, -1];
-}
-
-forall(dtype T | sized( T ))
-static inline __lock_size_t insert_unique( T * array [], __lock_size_t & size, T * val ) {
-	if( !val ) return size;
-
-	for( __lock_size_t i = 0; i <= size; i++) {
-		if( array[i] == val ) return size;
-	}
-
-	array[size] = val;
-	size = size + 1;
-	return size;
-}
-
-static inline __lock_size_t count_max( const __waitfor_mask_t & mask ) {
-	__lock_size_t max = 0;
-	for( __lock_size_t i = 0; i < mask.size; i++ ) {
-		__acceptable_t & accepted = mask[i];
-		max += accepted.size;
-	}
-	return max;
-}
-
-static inline __lock_size_t aggregate( monitor_desc * storage [], const __waitfor_mask_t & mask ) {
-	__lock_size_t size = 0;
-	for( __lock_size_t i = 0; i < mask.size; i++ ) {
-		__acceptable_t & accepted = mask[i];
-		__libcfa_small_sort( accepted.data, accepted.size );
-		for( __lock_size_t j = 0; j < accepted.size; j++) {
-			insert_unique( storage, size, accepted[j] );
-		}
-	}
-	// TODO insertion sort instead of this
-	__libcfa_small_sort( storage, size );
-	return size;
-}
-
-// Local Variables: //
-// mode: c //
-// tab-width: 4 //
-// End: //
Index: libcfa/src/concurrency/monitor.cfa
===================================================================
--- libcfa/src/concurrency/monitor.cfa	(revision 4dcaed2c212a42d633824e9b88053258a2d0f969)
+++ libcfa/src/concurrency/monitor.cfa	(revision 4dcaed2c212a42d633824e9b88053258a2d0f969)
@@ -0,0 +1,900 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// monitor.cfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Thu Feb 23 12:27:26 2017
+// Last Modified By : Peter A. Buhr
+// Last Modified On : Fri Mar 30 14:30:26 2018
+// Update Count     : 9
+//
+
+#include "monitor"
+
+#include <stdlib>
+#include <inttypes.h>
+
+#include "kernel_private.h"
+
+#include "bits/algorithms.h"
+
+//-----------------------------------------------------------------------------
+// Forward declarations
+static inline void set_owner ( monitor_desc * this, thread_desc * owner );
+static inline void set_owner ( monitor_desc * storage [], __lock_size_t count, thread_desc * owner );
+static inline void set_mask  ( monitor_desc * storage [], __lock_size_t count, const __waitfor_mask_t & mask );
+static inline void reset_mask( monitor_desc * this );
+
+static inline thread_desc * next_thread( monitor_desc * this );
+static inline bool is_accepted( monitor_desc * this, const __monitor_group_t & monitors );
+
+static inline void lock_all  ( __spinlock_t * locks [], __lock_size_t count );
+static inline void lock_all  ( monitor_desc * source [], __spinlock_t * /*out*/ locks [], __lock_size_t count );
+static inline void unlock_all( __spinlock_t * locks [], __lock_size_t count );
+static inline void unlock_all( monitor_desc * locks [], __lock_size_t count );
+
+static inline void save   ( monitor_desc * ctx [], __lock_size_t count, __spinlock_t * locks [], unsigned int /*out*/ recursions [], __waitfor_mask_t /*out*/ masks [] );
+static inline void restore( monitor_desc * ctx [], __lock_size_t count, __spinlock_t * locks [], unsigned int /*in */ recursions [], __waitfor_mask_t /*in */ masks [] );
+
+static inline void init     ( __lock_size_t count, monitor_desc * monitors [], __condition_node_t & waiter, __condition_criterion_t criteria [] );
+static inline void init_push( __lock_size_t count, monitor_desc * monitors [], __condition_node_t & waiter, __condition_criterion_t criteria [] );
+
+static inline thread_desc *        check_condition   ( __condition_criterion_t * );
+static inline void                 brand_condition   ( condition & );
+static inline [thread_desc *, int] search_entry_queue( const __waitfor_mask_t &, monitor_desc * monitors [], __lock_size_t count );
+
+forall(dtype T | sized( T ))
+static inline __lock_size_t insert_unique( T * array [], __lock_size_t & size, T * val );
+static inline __lock_size_t count_max    ( const __waitfor_mask_t & mask );
+static inline __lock_size_t aggregate    ( monitor_desc * storage [], const __waitfor_mask_t & mask );
+
+//-----------------------------------------------------------------------------
+// Useful defines
+#define wait_ctx(thrd, user_info)                               /* Create the necessary information to use the signaller stack                         */ \
+	__condition_node_t waiter = { thrd, count, user_info };   /* Create the node specific to this wait operation                                     */ \
+	__condition_criterion_t criteria[count];                  /* Create the criteria this wait operation needs to wake up                            */ \
+	init( count, monitors, waiter, criteria );                /* Link everything together                                                            */ \
+
+#define wait_ctx_primed(thrd, user_info)                        /* Create the necessary information to use the signaller stack                         */ \
+	__condition_node_t waiter = { thrd, count, user_info };   /* Create the node specific to this wait operation                                     */ \
+	__condition_criterion_t criteria[count];                  /* Create the criteria this wait operation needs to wake up                            */ \
+	init_push( count, monitors, waiter, criteria );           /* Link everything together and push it to the AS-Stack                                */ \
+
+#define monitor_ctx( mons, cnt )                                /* Define that creates the necessary state for internal/external scheduling operations */ \
+	monitor_desc ** monitors = mons;                          /* Save the targeted monitors                                                          */ \
+	__lock_size_t count = cnt;                                /* Save the count to a local variable                                                  */ \
+	unsigned int recursions[ count ];                         /* Save the current recursion levels to restore them later                             */ \
+	__waitfor_mask_t masks [ count ];                         /* Save the current waitfor masks to restore them later                                */ \
+	__spinlock_t *   locks [ count ];                         /* We need to pass-in an array of locks to BlockInternal                               */ \
+
+#define monitor_save    save   ( monitors, count, locks, recursions, masks )
+#define monitor_restore restore( monitors, count, locks, recursions, masks )
+
+
+//-----------------------------------------------------------------------------
+// Enter/Leave routines
+
+
+extern "C" {
+	// Enter single monitor: take it if free, recurse if already owned, enter if is_accepted() says so, otherwise queue on entry_queue and block
+	static void __enter_monitor_desc( monitor_desc * this, const __monitor_group_t & group ) {
+		// Lock the monitor spinlock
+		lock( this->lock __cfaabi_dbg_ctx2 );
+		// Interrupts are disabled inside the critical section
+		thread_desc * thrd = kernelTLS.this_thread;
+
+		__cfaabi_dbg_print_safe( "Kernel : %10p Entering mon %p (%p)\n", thrd, this, this->owner);
+
+		if( !this->owner ) {
+			// No one has the monitor, just take it
+			set_owner( this, thrd );
+
+			__cfaabi_dbg_print_safe( "Kernel :  mon is free \n" );
+		}
+		else if( this->owner == thrd) {
+			// We already have the monitor, just note how many times we took it
+			this->recursion += 1;
+
+			__cfaabi_dbg_print_safe( "Kernel :  mon already owned \n" );
+		}
+		else if( is_accepted( this, group) ) {
+			// The current owner was waiting for this call, enter
+			set_owner( this, thrd );
+
+			// Reset mask so the acceptance is consumed only once
+			reset_mask( this );
+
+			__cfaabi_dbg_print_safe( "Kernel :  mon accepts \n" );
+		}
+		else {
+			__cfaabi_dbg_print_safe( "Kernel :  blocking \n" );
+
+			// Some one else has the monitor, wait in line for it
+			append( this->entry_queue, thrd );
+
+			BlockInternal( &this->lock );
+
+			__cfaabi_dbg_print_safe( "Kernel : %10p Entered  mon %p\n", thrd, this);
+
+			// BlockInternal will unlock spinlock, no need to unlock ourselves
+			return;
+		}
+
+		__cfaabi_dbg_print_safe( "Kernel : %10p Entered  mon %p\n", thrd, this);
+
+		// Release the lock and leave
+		unlock( this->lock );
+		return;
+	}
+
+	static void __enter_monitor_dtor( monitor_desc * this, fptr_t func ) { // Enter a monitor for its destructor; aborts if the caller already owns it
+		// Lock the monitor spinlock
+		lock( this->lock __cfaabi_dbg_ctx2 );
+		// Interrupts are disabled inside the critical section
+		thread_desc * thrd = kernelTLS.this_thread;
+
+		__cfaabi_dbg_print_safe( "Kernel : %10p Entering dtor for mon %p (%p)\n", thrd, this, this->owner);
+
+
+		if( !this->owner ) {
+			__cfaabi_dbg_print_safe( "Kernel : Destroying free mon %p\n", this);
+
+			// No one has the monitor, just take it
+			set_owner( this, thrd );
+
+			unlock( this->lock );
+			return;
+		}
+		else if( this->owner == thrd) {
+			// We already have the monitor... but we're about to destroy it so the nesting will fail
+			// Abort!
+			abort( "Attempt to destroy monitor %p by thread \"%.256s\" (%p) in nested mutex.", this, thrd->self_cor.name, thrd );
+		}
+
+		__lock_size_t count = 1; // count and monitors are the names the wait_ctx / wait_ctx_primed macros expand against
+		monitor_desc ** monitors = &this;
+		__monitor_group_t group = { &this, 1, func };
+		if( is_accepted( this, group) ) {
+			__cfaabi_dbg_print_safe( "Kernel :  mon accepts dtor, block and signal it \n" );
+
+			// Wake the thread that is waiting for this
+			__condition_criterion_t * urgent = pop( this->signal_stack );
+			verify( urgent );
+
+			// Reset mask
+			reset_mask( this );
+
+			// Create the node specific to this wait operation (also pushed on the signal stack by init_push)
+			wait_ctx_primed( thrd, 0 )
+
+			// Some one else has the monitor, wait for them to finish and then run
+			BlockInternal( &this->lock, urgent->owner->waiting_thread );
+
+			// Some one was waiting for us, enter
+			set_owner( this, thrd );
+		}
+		else {
+			__cfaabi_dbg_print_safe( "Kernel :  blocking \n" );
+
+			wait_ctx( thrd, 0 )
+			this->dtor_node = &waiter; // publishes a pointer to the stack-allocated waiter declared by wait_ctx
+
+			// Some one else has the monitor, wait in line for it
+			append( this->entry_queue, thrd );
+			BlockInternal( &this->lock );
+
+			// BlockInternal will unlock spinlock, no need to unlock ourselves
+			return;
+		}
+
+		__cfaabi_dbg_print_safe( "Kernel : Destroying %p\n", this);
+
+	}
+
+	// Leave single monitor: drop one recursion level and, on the last one, pass ownership to the next thread
+	void __leave_monitor_desc( monitor_desc * this ) {
+		// Lock the monitor spinlock
+		lock( this->lock __cfaabi_dbg_ctx2 );
+
+		__cfaabi_dbg_print_safe( "Kernel : %10p Leaving mon %p (%p)\n", kernelTLS.this_thread, this, this->owner);
+
+		verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+
+		// Leaving a recursion level, decrement the counter
+		this->recursion -= 1;
+
+		// If we haven't left the last level of recursion
+		// it means we don't need to do anything
+		if( this->recursion != 0) {
+			__cfaabi_dbg_print_safe( "Kernel :  recursion still %d\n", this->recursion);
+			unlock( this->lock );
+			return;
+		}
+
+		// Get the next thread, will be null on low contention monitor
+		thread_desc * new_owner = next_thread( this );
+
+		// We can now let other threads in safely
+		unlock( this->lock );
+
+		// Wake the new owner; it may be null (see above) — presumably WakeThread tolerates that, TODO confirm
+		WakeThread( new_owner );
+	}
+
+	// Leave single monitor for the last time: only runs consistency checks (inside __cfaabi_dbg_debug_do), no monitor state is modified
+	void __leave_dtor_monitor_desc( monitor_desc * this ) {
+		__cfaabi_dbg_debug_do(
+			if( TL_GET( this_thread ) != this->owner ) {
+				abort( "Destroyed monitor %p has inconsistent owner, expected %p got %p.\n", this, TL_GET( this_thread ), this->owner);
+			}
+			if( this->recursion != 1 ) {
+				abort( "Destroyed monitor %p has %d outstanding nested calls.\n", this, this->recursion - 1);
+			}
+		)
+	}
+
+	// Leave the thread's own monitor (thrd->self_mon) and mark its coroutine as Halted
+	// last routine called by a thread.
+	// Should never return
+	void __leave_thread_monitor( thread_desc * thrd ) {
+		monitor_desc * this = &thrd->self_mon;
+
+		// Lock the monitor now
+		lock( this->lock __cfaabi_dbg_ctx2 );
+
+		disable_interrupts();
+
+		thrd->self_cor.state = Halted;
+
+		verifyf( thrd == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", thrd, this->owner, this->recursion, this );
+
+		// Leaving a recursion level, decrement the counter
+		this->recursion -= 1;
+
+		// If we haven't left the last level of recursion
+		// it must mean there is an error
+		if( this->recursion != 0) { abort( "Thread internal monitor has unbalanced recursion" ); }
+
+		// Fetch the next thread, can be null
+		thread_desc * new_owner = next_thread( this );
+
+		// Leave the thread, this will unlock the spinlock
+		// Use LeaveThread instead of BlockInternal, since it is
+		// specialized for this case and supports null new_owner
+		LeaveThread( &this->lock, new_owner );
+
+		// Control flow should never reach here!
+	}
+}
+
+// Enter multiple monitors, one at a time in array order
+// relies on the monitor array being sorted
+static inline void enter( __monitor_group_t monitors ) {
+	for( __lock_size_t i = 0; i < monitors.size; i++) {
+		__enter_monitor_desc( monitors[i], monitors );
+	}
+}
+
+// Leave multiple monitors, one at a time in reverse array order
+// relies on the monitor array being sorted
+static inline void leave(monitor_desc * monitors [], __lock_size_t count) {
+	for( __lock_size_t i = count - 1; i >= 0; i--) { // terminates only if __lock_size_t is signed — TODO confirm typedef
+		__leave_monitor_desc( monitors[i] );
+	}
+}
+
+// Ctor for monitor guard: records the monitors, swaps the thread's monitor context and enters them
+// Sorts monitors before entering
+void ?{}( monitor_guard_t & this, monitor_desc * m [], __lock_size_t count, fptr_t func ) {
+	thread_desc * thrd = TL_GET( this_thread );
+
+	// Store current array
+	this.m = m;
+	this.count = count;
+
+	// Sort monitors based on address; done in place, this.m aliases the caller's array m
+	__libcfa_small_sort(this.m, count);
+
+	// Save previous thread context, restored by the guard's destructor
+	this.prev = thrd->monitors;
+
+	// Update thread context (needed for conditions)
+	(thrd->monitors){m, count, func};
+
+	// __cfaabi_dbg_print_safe( "MGUARD : enter %d\n", count);
+
+	// Enter the monitors in order
+	__monitor_group_t group = {this.m, this.count, func};
+	enter( group );
+
+	// __cfaabi_dbg_print_safe( "MGUARD : entered\n" );
+}
+
+
+// Dtor for monitor guard: leaves the monitors, then restores the thread's previous monitor context
+void ^?{}( monitor_guard_t & this ) {
+	// __cfaabi_dbg_print_safe( "MGUARD : leaving %d\n", this.count);
+
+	// Leave the monitors (leave() walks the array from last to first)
+	leave( this.m, this.count );
+
+	// __cfaabi_dbg_print_safe( "MGUARD : left\n" );
+
+	// Restore thread context
+	TL_GET( this_thread )->monitors = this.prev;
+}
+
+// Ctor for monitor dtor guard
+// Handles a single monitor (*m), so no sorting is performed
+void ?{}( monitor_dtor_guard_t & this, monitor_desc * m [], fptr_t func ) {
+	// optimization
+	thread_desc * thrd = TL_GET( this_thread );
+
+	// Store the monitor being destroyed (first element of m)
+	this.m = *m;
+
+	// Save previous thread context, restored by the guard's destructor
+	this.prev = thrd->monitors;
+
+	// Update thread context (needed for conditions)
+	(thrd->monitors){m, 1, func};
+
+	__enter_monitor_dtor( this.m, func );
+}
+
+// Dtor for monitor dtor guard
+void ^?{}( monitor_dtor_guard_t & this ) {
+	// Leave the monitor being destroyed
+	__leave_dtor_monitor_desc( this.m );
+
+	// Restore thread context
+	TL_GET( this_thread )->monitors = this.prev;
+}
+
+//-----------------------------------------------------------------------------
+// Internal scheduling types
+void ?{}(__condition_node_t & this, thread_desc * waiting_thread, __lock_size_t count, uintptr_t user_info ) { // Ctor: unlinked node (next = NULL) for one waiting thread
+	this.waiting_thread = waiting_thread;
+	this.count = count;
+	this.next = NULL;
+	this.user_info = user_info;
+}
+
+void ?{}(__condition_criterion_t & this ) with( this ) { // Default ctor: not ready, no target monitor, no owner node, unlinked
+	ready  = false;
+	target = NULL;
+	owner  = NULL;
+	next   = NULL;
+}
+
+void ?{}(__condition_criterion_t & this, monitor_desc * target, __condition_node_t & owner ) { // Ctor: not-yet-ready criterion tying monitor target to wait node owner
+	this.ready  = false;
+	this.target = target;
+	this.owner  = &owner;
+	this.next   = NULL;
+}
+
+//-----------------------------------------------------------------------------
+// Internal scheduling
+void wait( condition & this, uintptr_t user_info = 0 ) { // Queue the calling thread on the condition, hand its monitors to their next threads and block
+	brand_condition( this );
+
+	// Check that everything is as expected
+	assertf( this.monitors != NULL, "Waiting with no monitors (%p)", this.monitors );
+	verifyf( this.monitor_count != 0, "Waiting with 0 monitors (%"PRIiFAST16")", this.monitor_count );
+	verifyf( this.monitor_count < 32u, "Excessive monitor count (%"PRIiFAST16")", this.monitor_count );
+
+	// Create storage for monitor context
+	monitor_ctx( this.monitors, this.monitor_count );
+
+	// Create the node specific to this wait operation
+	wait_ctx( TL_GET( this_thread ), user_info );
+
+	// Append the current wait operation to the ones already queued on the condition
+	// We don't need locks for that since conditions must always be waited on inside monitor mutual exclusion
+	append( this.blocked, &waiter );
+
+	// Lock all monitors (aggregates the locks as well)
+	lock_all( monitors, locks, count );
+
+	// Find the next thread(s) to run
+	__lock_size_t thread_count = 0;
+	thread_desc * threads[ count ];
+	__builtin_memset( threads, 0, sizeof( threads ) );
+
+	// Save monitor states
+	monitor_save;
+
+	// Pick the next owner of each monitor, recording each thread to wake only once
+	for( __lock_size_t i = 0; i < count; i++) {
+		thread_desc * new_owner = next_thread( monitors[i] );
+		insert_unique( threads, thread_count, new_owner );
+	}
+
+	// Everything is ready to go to sleep
+	BlockInternal( locks, count, threads, thread_count );
+
+	// We are back, restore the owners and recursions
+	monitor_restore;
+}
+
+bool signal( condition & this ) { // Move the front waiter onto its monitors' signal stacks; returns false if nobody is waiting
+	if( is_empty( this ) ) { return false; }
+
+	//Check that everything is as expected
+	verify( this.monitors );
+	verify( this.monitor_count != 0 );
+
+	//Some more checking in debug: the signaller must hold exactly the monitors the condition was branded with
+	__cfaabi_dbg_debug_do(
+		thread_desc * this_thrd = TL_GET( this_thread );
+		if ( this.monitor_count != this_thrd->monitors.size ) {
+			abort( "Signal on condition %p made with different number of monitor(s), expected %"PRIiFAST16" got %"PRIiFAST16, &this, this.monitor_count, this_thrd->monitors.size );
+		}
+
+		for(__lock_size_t i = 0; i < this.monitor_count; i++) {
+			if ( this.monitors[i] != this_thrd->monitors[i] ) {
+				abort( "Signal on condition %p made with different monitor, expected %p got %p", &this, this.monitors[i], this_thrd->monitors[i] );
+			}
+		}
+	);
+
+	__lock_size_t count = this.monitor_count;
+
+	// Lock all monitors (NULL: the individual locks do not need to be collected)
+	lock_all( this.monitors, NULL, count );
+
+	//Pop the head of the waiting queue
+	__condition_node_t * node = pop_head( this.blocked );
+
+	//Add the thread to the proper AS stack, one criterion per monitor
+	for(__lock_size_t i = 0; i < count; i++) {
+		__condition_criterion_t * crit = &node->criteria[i];
+		assert( !crit->ready );
+		push( crit->target->signal_stack, crit );
+	}
+
+	//Release
+	unlock_all( this.monitors, count );
+
+	return true;
+}
+
+bool signal_block( condition & this ) { // Hand the monitors to the front waiter and block the signaller; returns false if nobody is waiting
+	if( !this.blocked.head ) { return false; }
+
+	//Check that everything is as expected (NOTE(review): messages say "Waiting" although this is signal_block — looks copied from wait())
+	verifyf( this.monitors != NULL, "Waiting with no monitors (%p)", this.monitors );
+	verifyf( this.monitor_count != 0, "Waiting with 0 monitors (%"PRIiFAST16")", this.monitor_count );
+
+	// Create storage for monitor context
+	monitor_ctx( this.monitors, this.monitor_count );
+
+	// Lock all monitors (aggregates the locks as well)
+	lock_all( monitors, locks, count );
+
+
+	// Create the node specific to this wait operation (pushed on the signal stacks by init_push)
+	wait_ctx_primed( kernelTLS.this_thread, 0 )
+
+	//save contexts
+	monitor_save;
+
+	//Find the thread to run and make it the owner of every monitor
+	thread_desc * signallee = pop_head( this.blocked )->waiting_thread;
+	set_owner( monitors, count, signallee );
+
+	__cfaabi_dbg_print_buffer_decl( "Kernel : signal_block condition %p (s: %p)\n", &this, signallee );
+
+	//Everything is ready to go to sleep
+	BlockInternal( locks, count, &signallee, 1 );
+
+
+	// WE WOKE UP
+
+
+	__cfaabi_dbg_print_buffer_local( "Kernel :   signal_block returned\n" );
+
+	//We are back, restore the masks and recursions
+	monitor_restore;
+
+	return true;
+}
+
+// Return the user_info stored in the node at the head of the condition's blocked queue; the queue must not be empty (checked by verifyf)
+uintptr_t front( condition & this ) {
+	verifyf( !is_empty(this),
+		"Attempt to access user data on an empty condition.\n"
+		"Possible cause is not checking if the condition is empty before reading stored data."
+	);
+	return ((typeof(this.blocked.head))this.blocked.head)->user_info;
+}
+
+//-----------------------------------------------------------------------------
+// External scheduling
+// cases handled by __waitfor_internal( mask, duration ) :
+// 	- a thread calling an accepted routine is already in the entry queue :
+// 		baton-pass the monitors to it and block until woken
+// 	- the accepted routine found in the entry queue is a destructor :
+// 		push the destructor's criterion on the signaller stack and return
+// 	- non-blocking ( duration == 0 ) and nothing acceptable is queued :
+// 		unlock and return
+// 	- timeout ( duration > 0 ) :
+// 		not supported yet, rejected by verifyf
+// 	- blocking ( duration < 0 ) :
+// 		install the mask on every monitor, then block
+// 		restore the saved masks and recursions once woken
+void __waitfor_internal( const __waitfor_mask_t & mask, int duration ) {
+	// This statement doesn't have a contiguous list of monitors...
+	// Create one!
+	__lock_size_t max = count_max( mask );
+	monitor_desc * mon_storage[max];
+	__builtin_memset( mon_storage, 0, sizeof( mon_storage ) );
+	__lock_size_t actual_count = aggregate( mon_storage, mask );
+
+	__cfaabi_dbg_print_buffer_decl( "Kernel : waitfor %"PRIdFAST16" (s: %"PRIdFAST16", m: %"PRIdFAST16")\n", actual_count, mask.size, (__lock_size_t)max);
+
+	if(actual_count == 0) return;
+
+	__cfaabi_dbg_print_buffer_local( "Kernel : waitfor internal proceeding\n" );
+
+	// Create storage for monitor context (presumably declares the monitors/locks/count names used below -- TODO confirm against the macro)
+	monitor_ctx( mon_storage, actual_count );
+
+	// Lock all monitors (aggregates the locks as well)
+	lock_all( monitors, locks, count );
+
+	{
+		// Check whether the entry queue already holds a thread calling one of the accepted routines
+		thread_desc * next; int index;
+		[next, index] = search_entry_queue( mask, monitors, count );
+
+		if( next ) {
+			*mask.accepted = index;
+			__acceptable_t& accepted = mask[index];
+			if( accepted.is_dtor ) {
+				__cfaabi_dbg_print_buffer_local( "Kernel : dtor already there\n" );
+				verifyf( accepted.size == 1,  "ERROR: Accepted dtor has more than 1 mutex parameter." );
+
+				monitor_desc * mon2dtor = accepted[0];
+				verifyf( mon2dtor->dtor_node, "ERROR: Accepted monitor has no dtor_node." );
+
+				__condition_criterion_t * dtor_crit = mon2dtor->dtor_node->criteria;
+				push( mon2dtor->signal_stack, dtor_crit );
+
+				unlock_all( locks, count );
+			}
+			else {
+				__cfaabi_dbg_print_buffer_local( "Kernel : thread present, baton-passing\n" );
+
+				// Create the node specific to this wait operation
+				wait_ctx_primed( kernelTLS.this_thread, 0 );
+
+				// Save monitor states
+				monitor_save;
+
+				__cfaabi_dbg_print_buffer_local( "Kernel :  baton of %"PRIdFAST16" monitors : ", count );
+				#ifdef __CFA_DEBUG_PRINT__
+					for( int i = 0; i < count; i++) {
+						__cfaabi_dbg_print_buffer_local( "%p %p ", monitors[i], monitors[i]->signal_stack.top );
+					}
+				#endif
+				__cfaabi_dbg_print_buffer_local( "\n" );
+
+				// Set the owners to be the next thread
+				set_owner( monitors, count, next );
+
+				// Everything is ready to go to sleep
+				BlockInternal( locks, count, &next, 1 );
+
+				// We are back, restore the owners and recursions
+				monitor_restore;
+
+				__cfaabi_dbg_print_buffer_local( "Kernel : thread present, returned\n" );
+			}
+
+			__cfaabi_dbg_print_buffer_local( "Kernel : accepted %d\n", *mask.accepted);
+			return;
+		}
+	}
+
+
+	if( duration == 0 ) {
+		__cfaabi_dbg_print_buffer_local( "Kernel : non-blocking, exiting\n" );
+
+		unlock_all( locks, count );
+
+		__cfaabi_dbg_print_buffer_local( "Kernel : accepted %d\n", *mask.accepted);
+		return;
+	}
+
+
+	verifyf( duration < 0, "Timeout on waitfor statments not supported yet." );
+
+	__cfaabi_dbg_print_buffer_local( "Kernel : blocking waitfor\n" );
+
+	// Create the node specific to this wait operation
+	wait_ctx_primed( kernelTLS.this_thread, 0 );
+
+	monitor_save;
+	set_mask( monitors, count, mask );
+
+	for( __lock_size_t i = 0; i < count; i++) {
+		verify( monitors[i]->owner == kernelTLS.this_thread );
+	}
+
+	//Everything is ready to go to sleep
+	BlockInternal( locks, count );
+
+
+	// WE WOKE UP
+
+
+	//We are back, restore the masks and recursions
+	monitor_restore;
+
+	__cfaabi_dbg_print_buffer_local( "Kernel : exiting\n" );
+
+	__cfaabi_dbg_print_buffer_local( "Kernel : accepted %d\n", *mask.accepted);
+}
+
+//-----------------------------------------------------------------------------
+// Utilities
+
+// Hand monitor `this` to `owner`; a NULL owner leaves the monitor unowned
+static inline void set_owner( monitor_desc * this, thread_desc * owner ) {
+	// An owned monitor starts at recursion level 1, an unowned one at 0
+	unsigned int level = 0;
+	if( owner ) level = 1;
+
+	this->owner     = owner;
+	this->recursion = level;
+}
+
+static inline void set_owner( monitor_desc * monitors [], __lock_size_t count, thread_desc * owner ) {
+	// Every monitor gets the new owner; only monitors[0] (written unconditionally) gets recursion 1
+	monitor_desc * first = monitors[0];
+	first->owner = owner; first->recursion = 1;
+	for( __lock_size_t idx = 1; idx < count; idx += 1 ) {
+		monitors[idx]->owner = owner; monitors[idx]->recursion = 0;
+	}
+}
+
+// Install a copy of `mask` on each of the first `count` monitors in `storage`
+static inline void set_mask( monitor_desc * storage [], __lock_size_t count, const __waitfor_mask_t & mask ) {
+	__lock_size_t idx = 0;
+	while( idx < count ) { storage[idx]->mask = mask; idx += 1; }
+}
+
+// Return the waitfor mask of `this` to its empty state: no result slot, no acceptable list, size 0
+static inline void reset_mask( monitor_desc * this ) {
+	this->mask.accepted = NULL; this->mask.data = NULL;
+	this->mask.size = 0;
+}
+
+static inline thread_desc * next_thread( monitor_desc * this ) {	// Choose the next owner of `this`; returns the thread to run, or NULL
+	// Check the signaller stack first: it takes priority over the entry queue
+	__cfaabi_dbg_print_safe( "Kernel :  mon %p AS-stack top %p\n", this, this->signal_stack.top);
+	__condition_criterion_t * urgent = pop( this->signal_stack );
+	if( urgent ) {
+		//The signaller stack is not empty,
+		//regardless of if we are ready to baton pass,
+		//we need to set the monitor as in use
+		set_owner( this,  urgent->owner->waiting_thread );
+		// check_condition marks this criterion ready and yields the thread only when all its criteria are ready
+		return check_condition( urgent );
+	}
+
+	// No signaller thread
+	// Get the next thread in the entry_queue
+	thread_desc * new_owner = pop_head( this->entry_queue );
+	set_owner( this, new_owner );
+
+	return new_owner;
+}
+
+static inline bool is_accepted( monitor_desc * this, const __monitor_group_t & group ) {	// Does the mask of `this` accept the call described by `group`?
+	__acceptable_t * it = this->mask.data; // Optim
+	__lock_size_t count = this->mask.size;
+
+	// Check if there are any acceptable functions
+	if( !it ) return false;
+
+	// If this isn't the first monitor to test this, there is no reason to repeat the test.
+	// NOTE(review): mask.accepted is a pointer (it is dereferenced below), so this compares the pointer with 0, not the stored index -- confirm intent
+	if( this != group[0] ) return group[0]->mask.accepted >= 0;
+	// For all acceptable functions check if this is the current function; on a match record its index through mask.accepted
+	for( __lock_size_t i = 0; i < count; i++, it++ ) {
+		if( *it == group ) {
+			*this->mask.accepted = i;
+			return true;
+		}
+	}
+
+	// No function matched
+	return false;
+}
+
+static inline void init( __lock_size_t count, monitor_desc * monitors [], __condition_node_t & waiter, __condition_criterion_t criteria [] ) {
+	// Construct criteria[k] from monitors[k] and the waiter, for each of the `count` monitors
+	for( __lock_size_t idx = 0; idx < count; idx += 1 ) {
+		(criteria[idx]){ monitors[idx], waiter };
+	}
+	waiter.criteria = criteria;	// finally attach the criteria array to the waiter
+}
+
+static inline void init_push( __lock_size_t count, monitor_desc * monitors [], __condition_node_t & waiter, __condition_criterion_t criteria [] ) {
+	for( __lock_size_t idx = 0; idx < count; idx += 1 ) {
+		// Construct the criterion, then push it on the signal stack of its target monitor
+		(criteria[idx]){ monitors[idx], waiter };
+		__cfaabi_dbg_print_safe( "Kernel :  target %p = %p\n", criteria[idx].target, &criteria[idx] );
+		push( criteria[idx].target->signal_stack, &criteria[idx] );
+	}
+	waiter.criteria = criteria;	// finally attach the criteria array to the waiter
+}
+
+// Acquire each of the `count` spinlocks, in array order
+static inline void lock_all( __spinlock_t * locks [], __lock_size_t count ) {
+	__lock_size_t idx = 0;
+	while( idx < count ) { lock( *locks[idx] __cfaabi_dbg_ctx2 ); idx += 1; }
+}
+
+static inline void lock_all( monitor_desc * source [], __spinlock_t * /*out*/ locks [], __lock_size_t count ) {
+	// Acquire each monitor's spinlock in array order; when `locks` is non-NULL also record the lock addresses
+	for( __lock_size_t idx = 0; idx < count; idx += 1 ) {
+		lock( source[idx]->lock __cfaabi_dbg_ctx2 );
+		if( locks ) locks[idx] = &source[idx]->lock;
+	}
+}
+
+// Release each of the `count` spinlocks, in array order
+static inline void unlock_all( __spinlock_t * locks [], __lock_size_t count ) {
+	__lock_size_t idx = 0;
+	while( idx < count ) { unlock( *locks[idx] ); idx += 1; }
+}
+
+// Release the spinlock embedded in each of the `count` monitors, in array order
+static inline void unlock_all( monitor_desc * locks [], __lock_size_t count ) {
+	__lock_size_t idx = 0;
+	while( idx < count ) { unlock( locks[idx]->lock ); idx += 1; }
+}
+
+static inline void save(
+	monitor_desc * ctx [],
+	__lock_size_t count,
+	__attribute((unused)) __spinlock_t * locks [],
+	unsigned int /*out*/ recursions [],
+	__waitfor_mask_t /*out*/ masks []
+) {
+	// Snapshot the recursion level and mask of every monitor into the output arrays; `locks` is not used
+	for( __lock_size_t idx = 0; idx < count; idx += 1 ) {
+		recursions[idx] = ctx[idx]->recursion; masks[idx] = ctx[idx]->mask;
+	}
+}
+
+static inline void restore(
+	monitor_desc * ctx [],
+	__lock_size_t count,
+	__spinlock_t * locks [],
+	unsigned int /*in*/ recursions [],
+	__waitfor_mask_t /*in*/ masks []
+) {
+	// While holding every lock, write the saved recursion levels and masks back into the monitors
+	lock_all( locks, count );
+	for( __lock_size_t idx = 0; idx < count; idx += 1 ) {
+		ctx[idx]->recursion = recursions[idx]; ctx[idx]->mask = masks[idx];
+	}
+	unlock_all( locks, count );
+}
+
+// Serves two purposes at once:
+// 1 - marks the criterion `target` as ready
+// 2 - checks whether every criterion of the owning node is now ready;
+//     if so returns the node's waiting thread, otherwise NULL
+static inline thread_desc * check_condition( __condition_criterion_t * target ) {
+	__condition_node_t * node = target->owner;
+	unsigned short total = node->count;
+	__condition_criterion_t * crits = node->criteria;
+
+	bool all_ready = true;
+
+	for( int idx = 0; idx < total; idx += 1 ) {
+		__condition_criterion_t * crit = &crits[idx];
+
+		// Flag the criterion we were called for
+		if( crit == target ) crit->ready = true;
+
+		// A single criterion that is not ready keeps the thread blocked
+		if( !crit->ready ) all_ready = false;
+	}
+
+	thread_desc * runnable = all_ready ? node->waiting_thread : NULL;
+	__cfaabi_dbg_print_safe( "Kernel :  Runing %i (%p)\n", all_ready, runnable );
+	return runnable;
+}
+
+static inline void brand_condition( condition & this ) {	// On first use, bind `this` to the calling thread's current monitors
+	thread_desc * thrd = TL_GET( this_thread );
+	if( !this.monitors ) {
+		// Not branded yet: the failure message reports the condition itself (the monitor pointer is NULL whenever this fires)
+		assertf( thrd->monitors.data != NULL, "No current monitor to brand condition %p", &this );
+		this.monitor_count = thrd->monitors.size;
+
+		this.monitors = (monitor_desc **)malloc( this.monitor_count * sizeof( *this.monitors ) );
+		for( int i = 0; i < this.monitor_count; i++ ) {
+			this.monitors[i] = thrd->monitors[i];
+		}
+	}
+}
+
+static inline [thread_desc *, int] search_entry_queue( const __waitfor_mask_t & mask, monitor_desc * monitors [], __lock_size_t count ) {
+	// Find the first queued thread matching an entry of `mask`; returns [thread, mask index] or [0, -1]. Only monitors[0]'s queue is scanned.
+	__queue_t(thread_desc) & entry_queue = monitors[0]->entry_queue;
+
+	// For each thread in the entry-queue (iterating by link address so the match can be unlinked in place)
+	for(	thread_desc ** thrd_it = &entry_queue.head;
+		*thrd_it;
+		thrd_it = &(*thrd_it)->next
+	) {
+		// For each acceptable check if it matches
+		int i = 0;
+		__acceptable_t * end   = end  (mask);
+		__acceptable_t * begin = begin(mask);
+		for( __acceptable_t * it = begin; it != end; it++, i++ ) {
+			// Check if we have a match
+			if( *it == (*thrd_it)->monitors ) {
+
+				// If we have a match return it
+				// after removing it from the entry queue
+				return [remove( entry_queue, thrd_it ), i];
+			}
+		}
+	}
+
+	return [0, -1];
+}
+
+forall(dtype T | sized( T ))
+static inline __lock_size_t insert_unique( T * array [], __lock_size_t & size, T * val ) {
+	if( !val ) return size;
+	// Append `val` unless it is NULL or already present; only the `size` valid elements are scanned
+	for( __lock_size_t i = 0; i < size; i++) {
+		if( array[i] == val ) return size;
+	}
+	// Not found: store it in the first free slot and grow the logical size, which is also returned
+	array[size] = val;
+	size = size + 1;
+	return size;
+}
+
+static inline __lock_size_t count_max( const __waitfor_mask_t & mask ) {
+	// Sum the monitor counts of every entry in the mask (duplicates are counted, so this is an upper bound)
+	__lock_size_t total = 0;
+	for( __lock_size_t idx = 0; idx < mask.size; idx += 1 ) {
+		__acceptable_t & entry = mask[idx]; total += entry.size;
+	}
+	return total;
+}
+
+static inline __lock_size_t aggregate( monitor_desc * storage [], const __waitfor_mask_t & mask ) {	// Gather the distinct monitors of `mask`
+	__lock_size_t used = 0;	// number of distinct monitors stored so far; also the return value
+	for( __lock_size_t a = 0; a < mask.size; a += 1 ) {
+		__acceptable_t & entry = mask[a];
+		__libcfa_small_sort( entry.data, entry.size );	// sorts this entry's monitor array in place
+		for( __lock_size_t m = 0; m < entry.size; m += 1 ) {
+			insert_unique( storage, used, entry[m] );	// NULLs and monitors already stored are skipped
+		}
+	}
+	// TODO insertion sort instead of this
+	__libcfa_small_sort( storage, used );
+	return used;
+}
+
+// Local Variables: //
+// mode: c //
+// tab-width: 4 //
+// End: //
Index: libcfa/src/concurrency/mutex.c
===================================================================
--- libcfa/src/concurrency/mutex.c	(revision bf71cfdb7285490eee552b461158846f626cc52f)
+++ 	(revision )
@@ -1,193 +1,0 @@
-
-//                              -*- Mode: CFA -*-
-//
-// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
-//
-// The contents of this file are covered under the licence agreement in the
-// file "LICENCE" distributed with Cforall.
-//
-// mutex.c --
-//
-// Author           : Thierry Delisle
-// Created On       : Fri May 25 01:37:11 2018
-// Last Modified By : Thierry Delisle
-// Last Modified On : Fri May 25 01:37:51 2018
-// Update Count     : 0
-//
-
-#include "mutex"
-
-#include "kernel_private.h"
-
-//-----------------------------------------------------------------------------
-// Locks
-
-// Exclusive lock - non-recursive
-// ---
-void ?{}(mutex_lock & this) {
-	this.lock{};
-	this.blocked_threads{};
-}
-
-void ^?{}(mutex_lock & this) {
-	// default
-}
-
-void lock(mutex_lock & this) with(this) {
-	lock( lock __cfaabi_dbg_ctx2 );
-	if( is_locked ) {
-		append( blocked_threads, kernelTLS.this_thread );
-		BlockInternal( &lock );
-	}
-	else {
-		is_locked = true;
-		unlock( lock );
-	}
-}
-
-bool try_lock(mutex_lock & this) with(this) {
-	bool ret = false;
-	lock( lock __cfaabi_dbg_ctx2 );
-	if( !is_locked ) {
-		ret = true;
-		is_locked = true;
-	}
-	unlock( lock );
-	return ret;
-}
-
-void unlock(mutex_lock & this) {
-	lock( this.lock __cfaabi_dbg_ctx2 );
-	this.is_locked = (this.blocked_threads != 0);
-	WakeThread(
-		pop_head( this.blocked_threads )
-	);
-	unlock( this.lock );
-}
-
-// Exclusive lock - non-recursive
-// ---
-void ?{}(recursive_mutex_lock & this) {
-	this.lock{};
-	this.blocked_threads{};
-	this.owner = NULL;
-	this.recursion_count = 0;
-}
-
-void ^?{}(recursive_mutex_lock & this) {
-	// default
-}
-
-void lock(recursive_mutex_lock & this) with(this) {
-	lock( lock __cfaabi_dbg_ctx2 );
-	if( owner == NULL ) {
-		owner = kernelTLS.this_thread;
-		recursion_count = 1;
-		unlock( lock );
-	}
-	else if( owner == kernelTLS.this_thread ) {
-		recursion_count++;
-		unlock( lock );
-	}
-	else {
-		append( blocked_threads, kernelTLS.this_thread );
-		BlockInternal( &lock );
-	}
-}
-
-bool try_lock(recursive_mutex_lock & this) with(this) {
-	bool ret = false;
-	lock( lock __cfaabi_dbg_ctx2 );
-	if( owner == NULL ) {
-		owner = kernelTLS.this_thread;
-		recursion_count = 1;
-		ret = true;
-	}
-	else if( owner == kernelTLS.this_thread ) {
-		recursion_count++;
-		ret = true;
-	}
-	unlock( lock );
-	return ret;
-}
-
-void unlock(recursive_mutex_lock & this) with(this) {
-	lock( lock __cfaabi_dbg_ctx2 );
-	recursion_count--;
-	if( recursion_count == 0 ) {
-		thread_desc * thrd = pop_head( blocked_threads );
-		owner = thrd;
-		recursion_count = (thrd ? 1 : 0);
-		WakeThread( thrd );
-	}
-	unlock( lock );
-}
-
-//-----------------------------------------------------------------------------
-// Conditions
-void ?{}(condition_variable & this) {
-	this.blocked_threads{};
-}
-
-void ^?{}(condition_variable & this) {
-	// default
-}
-
-void notify_one(condition_variable & this) with(this) {
-	lock( lock __cfaabi_dbg_ctx2 );
-	WakeThread(
-		pop_head( this.blocked_threads )
-	);
-	unlock( lock );
-}
-
-void notify_all(condition_variable & this) with(this) {
-	lock( lock __cfaabi_dbg_ctx2 );
-	while(this.blocked_threads) {
-		WakeThread(
-			pop_head( this.blocked_threads )
-		);
-	}
-	unlock( lock );
-}
-
-void wait(condition_variable & this) {
-	lock( this.lock __cfaabi_dbg_ctx2 );
-	append( this.blocked_threads, kernelTLS.this_thread );
-	BlockInternal( &this.lock );
-}
-
-forall(dtype L | is_lock(L))
-void wait(condition_variable & this, L & l) {
-	lock( this.lock __cfaabi_dbg_ctx2 );
-	append( this.blocked_threads, kernelTLS.this_thread );
-	void __unlock(void) {
-		unlock(l);
-		unlock(this.lock);
-	}
-	BlockInternal( __unlock );
-	lock(l);
-}
-
-//-----------------------------------------------------------------------------
-// Scopes
-forall(dtype L | is_lock(L))
-void lock_all  ( L * locks[], size_t count) {
-	// Sort locks based on addresses
-	__libcfa_small_sort(locks, count);
-
-	// Lock all
-	for(size_t i = 0; i < count; i++) {
-		L * l = locks[i];
-		lock( *l );
-	}
-}
-
-forall(dtype L | is_lock(L))
-void unlock_all( L * locks[], size_t count) {
-	// Lock all
-	for(size_t i = 0; i < count; i++) {
-		L * l = locks[i];
-		unlock( *l );
-	}
-}
Index: libcfa/src/concurrency/mutex.cfa
===================================================================
--- libcfa/src/concurrency/mutex.cfa	(revision 4dcaed2c212a42d633824e9b88053258a2d0f969)
+++ libcfa/src/concurrency/mutex.cfa	(revision 4dcaed2c212a42d633824e9b88053258a2d0f969)
@@ -0,0 +1,193 @@
+
+//                              -*- Mode: CFA -*-
+//
+// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// mutex.cfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Fri May 25 01:37:11 2018
+// Last Modified By : Thierry Delisle
+// Last Modified On : Fri May 25 01:37:51 2018
+// Update Count     : 0
+//
+
+#include "mutex"
+
+#include "kernel_private.h"
+
+//-----------------------------------------------------------------------------
+// Locks
+
+// Exclusive lock - non-recursive
+// ---
+void ?{}(mutex_lock & this) {
+	this.lock{}; this.blocked_threads{};
+	this.is_locked = false;		// start unlocked: lock() and try_lock() read this flag before anything else writes it
+}
+
+void ^?{}(mutex_lock & this) {
+	// default: the destructor body is intentionally empty
+}
+
+void lock(mutex_lock & this) with(this) {
+	lock( lock __cfaabi_dbg_ctx2 );
+	if( !is_locked ) {	// free: take the mutex and drop the internal spinlock
+		is_locked = true;
+		unlock( lock );
+	}
+	else {			// held: queue the calling thread and block; the spinlock is passed to BlockInternal
+		append( blocked_threads, kernelTLS.this_thread );
+		BlockInternal( &lock );
+	}
+}
+
+bool try_lock(mutex_lock & this) with(this) {
+	lock( lock __cfaabi_dbg_ctx2 );
+	// Never blocks: succeeds only when the mutex is currently free
+	bool acquired = !is_locked;
+	if( acquired ) {
+		is_locked = true;
+	}
+	unlock( lock );
+	return acquired;
+}
+
+void unlock(mutex_lock & this) {
+	lock( this.lock __cfaabi_dbg_ctx2 );
+	// The mutex stays locked exactly when a thread is queued: that thread is popped and woken below
+	this.is_locked = (this.blocked_threads != 0);
+	thread_desc * woken = pop_head( this.blocked_threads );
+	WakeThread( woken );
+	unlock( this.lock );
+}
+
+// Exclusive lock - recursive
+// ---
+void ?{}(recursive_mutex_lock & this) {
+	this.lock{};
+	this.blocked_threads{};
+	// Initially nobody owns the lock and the nesting count is zero
+	this.owner = NULL; this.recursion_count = 0;
+}
+
+void ^?{}(recursive_mutex_lock & this) {
+	// default: the destructor body is intentionally empty
+}
+
+void lock(recursive_mutex_lock & this) with(this) {	// Acquire the lock, nesting if the caller already owns it, blocking if another thread does
+	lock( lock __cfaabi_dbg_ctx2 );
+	if( owner == NULL ) {	// free: the calling thread becomes the owner at nesting count 1
+		owner = kernelTLS.this_thread;
+		recursion_count = 1;
+		unlock( lock );
+	}
+	else if( owner == kernelTLS.this_thread ) {	// re-entry by the owner: one more nesting level
+		recursion_count++;
+		unlock( lock );
+	}
+	else {	// owned by another thread: queue the caller and block; the spinlock is passed to BlockInternal
+		append( blocked_threads, kernelTLS.this_thread );
+		BlockInternal( &lock );
+	}
+}
+
+bool try_lock(recursive_mutex_lock & this) with(this) {
+	// Never blocks: succeeds when the lock is free or already owned by the calling thread
+	lock( lock __cfaabi_dbg_ctx2 );
+	bool acquired = true;
+	if( owner == NULL ) {
+		owner = kernelTLS.this_thread;
+		recursion_count = 1;
+	}
+	else if( owner == kernelTLS.this_thread ) {
+		recursion_count++;
+	}
+	else { acquired = false; }
+	unlock( lock );
+	return acquired;
+}
+
+void unlock(recursive_mutex_lock & this) with(this) {
+	lock( lock __cfaabi_dbg_ctx2 );
+	recursion_count--;
+	if( recursion_count == 0 ) {
+		// Outermost release: ownership goes straight to the next queued thread, or to nobody if the queue is empty
+		thread_desc * heir = pop_head( blocked_threads );
+		owner = heir; recursion_count = (heir ? 1 : 0);
+		WakeThread( heir );
+	}
+	unlock( lock );
+}
+
+//-----------------------------------------------------------------------------
+// Conditions
+void ?{}(condition_variable & this) {
+	this.blocked_threads{};	// default-construct the queue of waiting threads
+}
+
+void ^?{}(condition_variable & this) {
+	// default: the destructor body is intentionally empty
+}
+
+void notify_one(condition_variable & this) with(this) {
+	lock( lock __cfaabi_dbg_ctx2 );
+	// Pop the thread at the head of the wait queue and wake it (the popped value is passed on even if the queue was empty)
+	thread_desc * woken = pop_head( this.blocked_threads );
+	WakeThread( woken );
+	unlock( lock );
+}
+
+void notify_all(condition_variable & this) with(this) {
+	lock( lock __cfaabi_dbg_ctx2 );
+	// Drain the wait queue, waking every thread in queue order
+	while(this.blocked_threads) {
+		thread_desc * woken = pop_head( this.blocked_threads );
+		WakeThread( woken );
+	}
+	unlock( lock );
+}
+
+void wait(condition_variable & this) {	// Block the calling thread on `this` until it is woken by a notify
+	lock( this.lock __cfaabi_dbg_ctx2 );
+	append( this.blocked_threads, kernelTLS.this_thread );	// enqueue before blocking, while holding the spinlock
+	BlockInternal( &this.lock );	// the spinlock is passed to BlockInternal (presumably released there -- TODO confirm)
+}
+
+forall(dtype L | is_lock(L))
+void wait(condition_variable & this, L & l) {	// Release `l`, block on `this`, and re-acquire `l` after being woken
+	lock( this.lock __cfaabi_dbg_ctx2 );
+	append( this.blocked_threads, kernelTLS.this_thread );	// enqueue before blocking, while holding the spinlock
+	void __unlock(void) {	// nested routine passed to BlockInternal: releases the user lock, then the internal spinlock
+		unlock(l);
+		unlock(this.lock);
+	}
+	BlockInternal( __unlock );
+	lock(l);	// woken up: take the user lock again before returning
+}
+
+//-----------------------------------------------------------------------------
+// Scopes
+forall(dtype L | is_lock(L))
+void lock_all  ( L * locks[], size_t count) {
+	// Sort locks based on addresses
+	// (the caller's array is reordered in place)
+	__libcfa_small_sort(locks, count);
+
+	// Acquire every lock, in sorted order
+	for(size_t idx = 0; idx < count; idx += 1) {
+		lock( *locks[idx] );
+	}
+}
+
+forall(dtype L | is_lock(L))
+void unlock_all( L * locks[], size_t count) {
+	// Unlock all
+	// (released in array order; the array is not sorted here)
+	for(size_t idx = 0; idx < count; idx += 1) {
+		unlock( *locks[idx] );
+	}
+}
Index: libcfa/src/concurrency/preemption.c
===================================================================
--- libcfa/src/concurrency/preemption.c	(revision bf71cfdb7285490eee552b461158846f626cc52f)
+++ 	(revision )
@@ -1,491 +1,0 @@
-//
-// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
-//
-// The contents of this file are covered under the licence agreement in the
-// file "LICENCE" distributed with Cforall.
-//
-// signal.c --
-//
-// Author           : Thierry Delisle
-// Created On       : Mon Jun 5 14:20:42 2017
-// Last Modified By : Peter A. Buhr
-// Last Modified On : Tue Jun  5 17:35:49 2018
-// Update Count     : 37
-//
-
-#include "preemption.h"
-#include <assert.h>
-
-extern "C" {
-#include <errno.h>
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-}
-
-#include "bits/signal.h"
-
-#if !defined(__CFA_DEFAULT_PREEMPTION__)
-#define __CFA_DEFAULT_PREEMPTION__ 10`ms
-#endif
-
-Duration default_preemption() __attribute__((weak)) {
-	return __CFA_DEFAULT_PREEMPTION__;
-}
-
-// FwdDeclarations : timeout handlers
-static void preempt( processor   * this );
-static void timeout( thread_desc * this );
-
-// FwdDeclarations : Signal handlers
-static void sigHandler_ctxSwitch( __CFA_SIGPARMS__ );
-static void sigHandler_segv     ( __CFA_SIGPARMS__ );
-static void sigHandler_ill      ( __CFA_SIGPARMS__ );
-static void sigHandler_fpe      ( __CFA_SIGPARMS__ );
-static void sigHandler_abort    ( __CFA_SIGPARMS__ );
-
-// FwdDeclarations : alarm thread main
-static void * alarm_loop( __attribute__((unused)) void * args );
-
-// Machine specific register name
-#if   defined( __i386 )
-#define CFA_REG_IP gregs[REG_EIP]
-#elif defined( __x86_64 )
-#define CFA_REG_IP gregs[REG_RIP]
-#elif defined( __ARM_ARCH )
-#define CFA_REG_IP arm_pc
-#else
-#error unknown hardware architecture
-#endif
-
-KERNEL_STORAGE(event_kernel_t, event_kernel);         // private storage for event kernel
-event_kernel_t * event_kernel;                        // kernel public handle to even kernel
-static pthread_t alarm_thread;                        // pthread handle to alarm thread
-
-static void ?{}(event_kernel_t & this) with( this ) {
-	alarms{};
-	lock{};
-}
-
-enum {
-	PREEMPT_NORMAL    = 0,
-	PREEMPT_TERMINATE = 1,
-};
-
-//=============================================================================================
-// Kernel Preemption logic
-//=============================================================================================
-
-// Get next expired node
-static inline alarm_node_t * get_expired( alarm_list_t * alarms, Time currtime ) {
-	if( !alarms->head ) return NULL;                          // If no alarms return null
-	if( alarms->head->alarm >= currtime ) return NULL;        // If alarms head not expired return null
-	return pop(alarms);                                       // Otherwise just pop head
-}
-
-// Tick one frame of the Discrete Event Simulation for alarms
-static void tick_preemption() {
-	alarm_node_t * node = NULL;                     // Used in the while loop but cannot be declared in the while condition
-	alarm_list_t * alarms = &event_kernel->alarms;  // Local copy for ease of reading
-	Time currtime = __kernel_get_time();			// Check current time once so we everything "happens at once"
-
-	//Loop throught every thing expired
-	while( node = get_expired( alarms, currtime ) ) {
-		// __cfaabi_dbg_print_buffer_decl( " KERNEL: preemption tick.\n" );
-
-		// Check if this is a kernel
-		if( node->kernel_alarm ) {
-			preempt( node->proc );
-		}
-		else {
-			timeout( node->thrd );
-		}
-
-		// Check if this is a periodic alarm
-		Duration period = node->period;
-		if( period > 0 ) {
-			// __cfaabi_dbg_print_buffer_local( " KERNEL: alarm period is %lu.\n", period.tv );
-			node->alarm = currtime + period;    // Alarm is periodic, add currtime to it (used cached current time)
-			insert( alarms, node );             // Reinsert the node for the next time it triggers
-		}
-		else {
-			node->set = false;                  // Node is one-shot, just mark it as not pending
-		}
-	}
-
-	// If there are still alarms pending, reset the timer
-	if( alarms->head ) {
-		__cfaabi_dbg_print_buffer_decl( " KERNEL: @%ju(%ju) resetting alarm to %ju.\n", currtime.tv, __kernel_get_time().tv, (alarms->head->alarm - currtime).tv);
-		Duration delta = alarms->head->alarm - currtime;
-		Duration caped = max(delta, 50`us);
-		// itimerval tim  = { caped };
-		// __cfaabi_dbg_print_buffer_local( "    Values are %lu, %lu, %lu %lu.\n", delta.tv, caped.tv, tim.it_value.tv_sec, tim.it_value.tv_usec);
-
-		__kernel_set_timer( caped );
-	}
-}
-
-// Update the preemption of a processor and notify interested parties
-void update_preemption( processor * this, Duration duration ) {
-	alarm_node_t * alarm = this->preemption_alarm;
-
-	// Alarms need to be enabled
-	if ( duration > 0 && ! alarm->set ) {
-		alarm->alarm = __kernel_get_time() + duration;
-		alarm->period = duration;
-		register_self( alarm );
-	}
-	// Zero duration but alarm is set
-	else if ( duration == 0 && alarm->set ) {
-		unregister_self( alarm );
-		alarm->alarm = 0;
-		alarm->period = 0;
-	}
-	// If alarm is different from previous, change it
-	else if ( duration > 0 && alarm->period != duration ) {
-		unregister_self( alarm );
-		alarm->alarm = __kernel_get_time() + duration;
-		alarm->period = duration;
-		register_self( alarm );
-	}
-}
-
-//=============================================================================================
-// Kernel Signal Tools
-//=============================================================================================
-
-__cfaabi_dbg_debug_do( static thread_local void * last_interrupt = 0; )
-
-extern "C" {
-	// Disable interrupts by incrementing the counter
-	void disable_interrupts() {
-		with( kernelTLS.preemption_state ) {
-			#if GCC_VERSION > 50000
-			static_assert(__atomic_always_lock_free(sizeof(enabled), &enabled), "Must be lock-free");
-			#endif
-
-			// Set enabled flag to false
-			// should be atomic to avoid preemption in the middle of the operation.
-			// use memory order RELAXED since there is no inter-thread on this variable requirements
-			__atomic_store_n(&enabled, false, __ATOMIC_RELAXED);
-
-			// Signal the compiler that a fence is needed but only for signal handlers
-			__atomic_signal_fence(__ATOMIC_ACQUIRE);
-
-			__attribute__((unused)) unsigned short new_val = disable_count + 1;
-			disable_count = new_val;
-			verify( new_val < 65_000u );              // If this triggers someone is disabling interrupts without enabling them
-		}
-	}
-
-	// Enable interrupts by decrementing the counter
-	// If counter reaches 0, execute any pending CtxSwitch
-	void enable_interrupts( __cfaabi_dbg_ctx_param ) {
-		processor   * proc = kernelTLS.this_processor; // Cache the processor now since interrupts can start happening after the atomic store
-		thread_desc * thrd = kernelTLS.this_thread;	  // Cache the thread now since interrupts can start happening after the atomic store
-
-		with( kernelTLS.preemption_state ){
-			unsigned short prev = disable_count;
-			disable_count -= 1;
-			verify( prev != 0u );                     // If this triggers someone is enabled already enabled interruptsverify( prev != 0u );
-
-			// Check if we need to prempt the thread because an interrupt was missed
-			if( prev == 1 ) {
-				#if GCC_VERSION > 50000
-				static_assert(__atomic_always_lock_free(sizeof(enabled), &enabled), "Must be lock-free");
-				#endif
-
-				// Set enabled flag to true
-				// should be atomic to avoid preemption in the middle of the operation.
-				// use memory order RELAXED since there is no inter-thread on this variable requirements
-				__atomic_store_n(&enabled, true, __ATOMIC_RELAXED);
-
-				// Signal the compiler that a fence is needed but only for signal handlers
-				__atomic_signal_fence(__ATOMIC_RELEASE);
-				if( proc->pending_preemption ) {
-					proc->pending_preemption = false;
-					BlockInternal( thrd );
-				}
-			}
-		}
-
-		// For debugging purposes : keep track of the last person to enable the interrupts
-		__cfaabi_dbg_debug_do( proc->last_enable = caller; )
-	}
-
-	// Disable interrupts by incrementint the counter
-	// Don't execute any pending CtxSwitch even if counter reaches 0
-	void enable_interrupts_noPoll() {
-		unsigned short prev = kernelTLS.preemption_state.disable_count;
-		kernelTLS.preemption_state.disable_count -= 1;
-		verifyf( prev != 0u, "Incremented from %u\n", prev );                     // If this triggers someone is enabled already enabled interrupts
-		if( prev == 1 ) {
-			#if GCC_VERSION > 50000
-			static_assert(__atomic_always_lock_free(sizeof(kernelTLS.preemption_state.enabled), &kernelTLS.preemption_state.enabled), "Must be lock-free");
-			#endif
-			// Set enabled flag to true
-			// should be atomic to avoid preemption in the middle of the operation.
-			// use memory order RELAXED since there is no inter-thread on this variable requirements
-			__atomic_store_n(&kernelTLS.preemption_state.enabled, true, __ATOMIC_RELAXED);
-
-			// Signal the compiler that a fence is needed but only for signal handlers
-			__atomic_signal_fence(__ATOMIC_RELEASE);
-		}
-	}
-}
-
-// sigprocmask wrapper : unblock a single signal
-static inline void signal_unblock( int sig ) {
-	sigset_t mask;
-	sigemptyset( &mask );
-	sigaddset( &mask, sig );
-
-	if ( pthread_sigmask( SIG_UNBLOCK, &mask, NULL ) == -1 ) {
-	    abort( "internal error, pthread_sigmask" );
-	}
-}
-
-// sigprocmask wrapper : block a single signal
-static inline void signal_block( int sig ) {
-	sigset_t mask;
-	sigemptyset( &mask );
-	sigaddset( &mask, sig );
-
-	if ( pthread_sigmask( SIG_BLOCK, &mask, NULL ) == -1 ) {
-	    abort( "internal error, pthread_sigmask" );
-	}
-}
-
-// kill wrapper : signal a processor
-static void preempt( processor * this ) {
-	sigval_t value = { PREEMPT_NORMAL };
-	pthread_sigqueue( this->kernel_thread, SIGUSR1, value );
-}
-
-// reserved for future use
-static void timeout( thread_desc * this ) {
-	//TODO : implement waking threads
-}
-
-// KERNEL ONLY
-// Check if a CtxSwitch signal handler shoud defer
-// If true  : preemption is safe
-// If false : preemption is unsafe and marked as pending
-static inline bool preemption_ready() {
-	// Check if preemption is safe
-	bool ready = kernelTLS.preemption_state.enabled && ! kernelTLS.preemption_state.in_progress;
-
-	// Adjust the pending flag accordingly
-	kernelTLS.this_processor->pending_preemption = !ready;
-	return ready;
-}
-
-//=============================================================================================
-// Kernel Signal Startup/Shutdown logic
-//=============================================================================================
-
-// Startup routine to activate preemption
-// Called from kernel_startup
-void kernel_start_preemption() {
-	__cfaabi_dbg_print_safe( "Kernel : Starting preemption\n" );
-
-	// Start with preemption disabled until ready
-	kernelTLS.preemption_state.enabled = false;
-	kernelTLS.preemption_state.disable_count = 1;
-
-	// Initialize the event kernel
-	event_kernel = (event_kernel_t *)&storage_event_kernel;
-	(*event_kernel){};
-
-	// Setup proper signal handlers
-	__cfaabi_sigaction( SIGUSR1, sigHandler_ctxSwitch, SA_SIGINFO | SA_RESTART );         // CtxSwitch handler
-
-	signal_block( SIGALRM );
-
-	pthread_create( &alarm_thread, NULL, alarm_loop, NULL );
-}
-
-// Shutdown routine to deactivate preemption
-// Called from kernel_shutdown
-void kernel_stop_preemption() {
-	__cfaabi_dbg_print_safe( "Kernel : Preemption stopping\n" );
-
-	// Block all signals since we are already shutting down
-	sigset_t mask;
-	sigfillset( &mask );
-	sigprocmask( SIG_BLOCK, &mask, NULL );
-
-	// Notify the alarm thread of the shutdown
-	sigval val = { 1 };
-	pthread_sigqueue( alarm_thread, SIGALRM, val );
-
-	// Wait for the preemption thread to finish
-	pthread_join( alarm_thread, NULL );
-
-	// Preemption is now fully stopped
-
-	__cfaabi_dbg_print_safe( "Kernel : Preemption stopped\n" );
-}
-
-// Raii ctor/dtor for the preemption_scope
-// Used by thread to control when they want to receive preemption signals
-void ?{}( preemption_scope & this, processor * proc ) {
-	(this.alarm){ proc, (Time){ 0 }, 0`s };
-	this.proc = proc;
-	this.proc->preemption_alarm = &this.alarm;
-
-	update_preemption( this.proc, this.proc->cltr->preemption_rate );
-}
-
-void ^?{}( preemption_scope & this ) {
-	disable_interrupts();
-
-	update_preemption( this.proc, 0`s );
-}
-
-//=============================================================================================
-// Kernel Signal Handlers
-//=============================================================================================
-
-// Context switch signal handler
-// Receives SIGUSR1 signal and causes the current thread to yield
-static void sigHandler_ctxSwitch( __CFA_SIGPARMS__ ) {
-	__cfaabi_dbg_debug_do( last_interrupt = (void *)(cxt->uc_mcontext.CFA_REG_IP); )
-
-	// SKULLDUGGERY: if a thread creates a processor and the immediately deletes it,
-	// the interrupt that is supposed to force the kernel thread to preempt might arrive
-	// before the kernel thread has even started running. When that happens an iterrupt
-	// we a null 'this_processor' will be caught, just ignore it.
-	if(! kernelTLS.this_processor ) return;
-
-	choose(sfp->si_value.sival_int) {
-		case PREEMPT_NORMAL   : ;// Normal case, nothing to do here
-		case PREEMPT_TERMINATE: verify( __atomic_load_n( &kernelTLS.this_processor->do_terminate, __ATOMIC_SEQ_CST ) );
-		default:
-			abort( "internal error, signal value is %d", sfp->si_value.sival_int );
-	}
-
-	// Check if it is safe to preempt here
-	if( !preemption_ready() ) { return; }
-
-	__cfaabi_dbg_print_buffer_decl( " KERNEL: preempting core %p (%p @ %p).\n", kernelTLS.this_processor, kernelTLS.this_thread, (void *)(cxt->uc_mcontext.CFA_REG_IP) );
-
-	// Sync flag : prevent recursive calls to the signal handler
-	kernelTLS.preemption_state.in_progress = true;
-
-	// Clear sighandler mask before context switching.
-	#if GCC_VERSION > 50000
-	static_assert( sizeof( sigset_t ) == sizeof( cxt->uc_sigmask ), "Expected cxt->uc_sigmask to be of sigset_t" );
-	#endif
-	if ( pthread_sigmask( SIG_SETMASK, (sigset_t *)&(cxt->uc_sigmask), NULL ) == -1 ) {
-		abort( "internal error, sigprocmask" );
-	}
-
-	// TODO: this should go in finish action
-	// Clear the in progress flag
-	kernelTLS.preemption_state.in_progress = false;
-
-	// Preemption can occur here
-
-	BlockInternal( kernelTLS.this_thread ); // Do the actual CtxSwitch
-}
-
-// Main of the alarm thread
-// Waits on SIGALRM and send SIGUSR1 to whom ever needs it
-static void * alarm_loop( __attribute__((unused)) void * args ) {
-	// Block sigalrms to control when they arrive
-	sigset_t mask;
-	sigfillset(&mask);
-	if ( pthread_sigmask( SIG_BLOCK, &mask, NULL ) == -1 ) {
-	    abort( "internal error, pthread_sigmask" );
-	}
-
-	sigemptyset( &mask );
-	sigaddset( &mask, SIGALRM );
-
-	// Main loop
-	while( true ) {
-		// Wait for a sigalrm
-		siginfo_t info;
-		int sig = sigwaitinfo( &mask, &info );
-
-		if( sig < 0 ) {
-			//Error!
-			int err = errno;
-			switch( err ) {
-				case EAGAIN :
-				case EINTR :
-					{__cfaabi_dbg_print_buffer_decl( " KERNEL: Spurious wakeup %d.\n", err );}
-					continue;
-       			case EINVAL :
-				 	abort( "Timeout was invalid." );
-				default:
-				 	abort( "Unhandled error %d", err);
-			}
-		}
-
-		// If another signal arrived something went wrong
-		assertf(sig == SIGALRM, "Kernel Internal Error, sigwait: Unexpected signal %d (%d : %d)\n", sig, info.si_code, info.si_value.sival_int);
-
-		// __cfaabi_dbg_print_safe( "Kernel : Caught alarm from %d with %d\n", info.si_code, info.si_value.sival_int );
-		// Switch on the code (a.k.a. the sender) to
-		switch( info.si_code )
-		{
-		// Timers can apparently be marked as sent for the kernel
-		// In either case, tick preemption
-		case SI_TIMER:
-		case SI_KERNEL:
-			// __cfaabi_dbg_print_safe( "Kernel : Preemption thread tick\n" );
-			lock( event_kernel->lock __cfaabi_dbg_ctx2 );
-			tick_preemption();
-			unlock( event_kernel->lock );
-			break;
-		// Signal was not sent by the kernel but by an other thread
-		case SI_QUEUE:
-			// For now, other thread only signal the alarm thread to shut it down
-			// If this needs to change use info.si_value and handle the case here
-			goto EXIT;
-		}
-	}
-
-EXIT:
-	__cfaabi_dbg_print_safe( "Kernel : Preemption thread stopping\n" );
-	return NULL;
-}
-
-//=============================================================================================
-// Kernel Signal Debug
-//=============================================================================================
-
-void __cfaabi_check_preemption() {
-	bool ready = kernelTLS.preemption_state.enabled;
-	if(!ready) { abort("Preemption should be ready"); }
-
-	sigset_t oldset;
-	int ret;
-	ret = pthread_sigmask(0, NULL, &oldset);
-	if(ret != 0) { abort("ERROR sigprocmask returned %d", ret); }
-
-	ret = sigismember(&oldset, SIGUSR1);
-	if(ret <  0) { abort("ERROR sigismember returned %d", ret); }
-	if(ret == 1) { abort("ERROR SIGUSR1 is disabled"); }
-
-	ret = sigismember(&oldset, SIGALRM);
-	if(ret <  0) { abort("ERROR sigismember returned %d", ret); }
-	if(ret == 0) { abort("ERROR SIGALRM is enabled"); }
-
-	ret = sigismember(&oldset, SIGTERM);
-	if(ret <  0) { abort("ERROR sigismember returned %d", ret); }
-	if(ret == 1) { abort("ERROR SIGTERM is disabled"); }
-}
-
-#ifdef __CFA_WITH_VERIFY__
-bool __cfaabi_dbg_in_kernel() {
-	return !kernelTLS.preemption_state.enabled;
-}
-#endif
-
-// Local Variables: //
-// mode: c //
-// tab-width: 4 //
-// End: //
Index: libcfa/src/concurrency/preemption.cfa
===================================================================
--- libcfa/src/concurrency/preemption.cfa	(revision 4dcaed2c212a42d633824e9b88053258a2d0f969)
+++ libcfa/src/concurrency/preemption.cfa	(revision 4dcaed2c212a42d633824e9b88053258a2d0f969)
@@ -0,0 +1,491 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// preemption.cfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Mon Jun 5 14:20:42 2017
+// Last Modified By : Peter A. Buhr
+// Last Modified On : Tue Jun  5 17:35:49 2018
+// Update Count     : 37
+//
+
+#include "preemption.h"
+#include <assert.h>
+
+extern "C" {
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+}
+
+#include "bits/signal.h"
+
+#if !defined(__CFA_DEFAULT_PREEMPTION__)
+#define __CFA_DEFAULT_PREEMPTION__ 10`ms
+#endif
+
+Duration default_preemption() __attribute__((weak)) {
+	return __CFA_DEFAULT_PREEMPTION__;
+}
+
+// FwdDeclarations : timeout handlers
+static void preempt( processor   * this );
+static void timeout( thread_desc * this );
+
+// FwdDeclarations : Signal handlers
+static void sigHandler_ctxSwitch( __CFA_SIGPARMS__ );
+static void sigHandler_segv     ( __CFA_SIGPARMS__ );
+static void sigHandler_ill      ( __CFA_SIGPARMS__ );
+static void sigHandler_fpe      ( __CFA_SIGPARMS__ );
+static void sigHandler_abort    ( __CFA_SIGPARMS__ );
+
+// FwdDeclarations : alarm thread main
+static void * alarm_loop( __attribute__((unused)) void * args );
+
+// Machine specific register name
+#if   defined( __i386 )
+#define CFA_REG_IP gregs[REG_EIP]
+#elif defined( __x86_64 )
+#define CFA_REG_IP gregs[REG_RIP]
+#elif defined( __ARM_ARCH )
+#define CFA_REG_IP arm_pc
+#else
+#error unknown hardware architecture
+#endif
+
+KERNEL_STORAGE(event_kernel_t, event_kernel);         // private storage for event kernel
+event_kernel_t * event_kernel;                        // kernel public handle to even kernel
+static pthread_t alarm_thread;                        // pthread handle to alarm thread
+
+static void ?{}(event_kernel_t & this) with( this ) {
+	alarms{};
+	lock{};
+}
+
+enum {
+	PREEMPT_NORMAL    = 0,
+	PREEMPT_TERMINATE = 1,
+};
+
+//=============================================================================================
+// Kernel Preemption logic
+//=============================================================================================
+
+// Get next expired node
+static inline alarm_node_t * get_expired( alarm_list_t * alarms, Time currtime ) {
+	if( !alarms->head ) return NULL;                          // If no alarms return null
+	if( alarms->head->alarm >= currtime ) return NULL;        // If alarms head not expired return null
+	return pop(alarms);                                       // Otherwise just pop head
+}
+
+// Tick one frame of the Discrete Event Simulation for alarms
+static void tick_preemption() {
+	alarm_node_t * node = NULL;                     // Used in the while loop but cannot be declared in the while condition
+	alarm_list_t * alarms = &event_kernel->alarms;  // Local copy for ease of reading
+	Time currtime = __kernel_get_time();			// Read the current time once so that everything "happens at once"
+
+	// Loop through every alarm that has expired
+	while( node = get_expired( alarms, currtime ) ) {
+		// __cfaabi_dbg_print_buffer_decl( " KERNEL: preemption tick.\n" );
+
+		// Kernel alarms preempt a processor, all other alarms time out a thread
+		if( node->kernel_alarm ) {
+			preempt( node->proc );
+		}
+		else {
+			timeout( node->thrd );
+		}
+
+		// Check if this is a periodic alarm
+		Duration period = node->period;
+		if( period > 0 ) {
+			// __cfaabi_dbg_print_buffer_local( " KERNEL: alarm period is %lu.\n", period.tv );
+			node->alarm = currtime + period;    // Alarm is periodic, next trigger is the cached current time plus the period
+			insert( alarms, node );             // Reinsert the node for the next time it triggers
+		}
+		else {
+			node->set = false;                  // Node is one-shot, just mark it as not pending
+		}
+	}
+
+	// If there are still alarms pending, reset the timer
+	if( alarms->head ) {
+		__cfaabi_dbg_print_buffer_decl( " KERNEL: @%ju(%ju) resetting alarm to %ju.\n", currtime.tv, __kernel_get_time().tv, (alarms->head->alarm - currtime).tv);
+		Duration delta = alarms->head->alarm - currtime;
+		Duration caped = max(delta, 50`us);     // Never arm the timer for less than 50us
+		// itimerval tim  = { caped };
+		// __cfaabi_dbg_print_buffer_local( "    Values are %lu, %lu, %lu %lu.\n", delta.tv, caped.tv, tim.it_value.tv_sec, tim.it_value.tv_usec);
+
+		__kernel_set_timer( caped );
+	}
+}
+
+// Update the preemption alarm of a processor so that it fires every 'duration' (0 turns it off)
+void update_preemption( processor * this, Duration duration ) {
+	alarm_node_t * alarm = this->preemption_alarm;
+
+	// Non-zero duration and the alarm is not set yet : arm it
+	if ( duration > 0 && ! alarm->set ) {
+		alarm->alarm = __kernel_get_time() + duration;
+		alarm->period = duration;
+		register_self( alarm );
+	}
+	// Zero duration but alarm is set : disarm it
+	else if ( duration == 0 && alarm->set ) {
+		unregister_self( alarm );
+		alarm->alarm = 0;
+		alarm->period = 0;
+	}
+	// Alarm is set with a different period : re-register it with the new period
+	else if ( duration > 0 && alarm->period != duration ) {
+		unregister_self( alarm );
+		alarm->alarm = __kernel_get_time() + duration;
+		alarm->period = duration;
+		register_self( alarm );
+	}
+}
+
+//=============================================================================================
+// Kernel Signal Tools
+//=============================================================================================
+
+__cfaabi_dbg_debug_do( static thread_local void * last_interrupt = 0; )
+
+extern "C" {
+	// Disable interrupts by incrementing the counter
+	void disable_interrupts() {
+		with( kernelTLS.preemption_state ) {
+			#if GCC_VERSION > 50000
+			static_assert(__atomic_always_lock_free(sizeof(enabled), &enabled), "Must be lock-free");
+			#endif
+
+			// Set enabled flag to false
+			// should be atomic to avoid preemption in the middle of the operation.
+			// use memory order RELAXED since there are no inter-thread requirements on this variable
+			__atomic_store_n(&enabled, false, __ATOMIC_RELAXED);
+
+			// Signal the compiler that a fence is needed but only for signal handlers
+			__atomic_signal_fence(__ATOMIC_ACQUIRE);
+
+			__attribute__((unused)) unsigned short new_val = disable_count + 1;
+			disable_count = new_val;
+			verify( new_val < 65_000u );              // If this triggers someone is disabling interrupts without enabling them
+		}
+	}
+
+	// Enable interrupts by decrementing the counter
+	// If counter reaches 0, execute any pending CtxSwitch
+	void enable_interrupts( __cfaabi_dbg_ctx_param ) {
+		processor   * proc = kernelTLS.this_processor; // Cache the processor now since interrupts can start happening after the atomic store
+		thread_desc * thrd = kernelTLS.this_thread;	  // Cache the thread now since interrupts can start happening after the atomic store
+
+		with( kernelTLS.preemption_state ){
+			unsigned short prev = disable_count;
+			disable_count -= 1;
+			verify( prev != 0u );                     // If this triggers someone enabled interrupts that were already enabled
+
+			// Check if we need to preempt the thread because an interrupt was missed
+			if( prev == 1 ) {
+				#if GCC_VERSION > 50000
+				static_assert(__atomic_always_lock_free(sizeof(enabled), &enabled), "Must be lock-free");
+				#endif
+
+				// Set enabled flag to true
+				// should be atomic to avoid preemption in the middle of the operation.
+				// use memory order RELAXED since there are no inter-thread requirements on this variable
+				__atomic_store_n(&enabled, true, __ATOMIC_RELAXED);
+
+				// Signal the compiler that a fence is needed but only for signal handlers
+				__atomic_signal_fence(__ATOMIC_RELEASE);
+				if( proc->pending_preemption ) {
+					proc->pending_preemption = false;
+					BlockInternal( thrd );
+				}
+			}
+		}
+
+		// For debugging purposes : keep track of the last person to enable the interrupts
+		__cfaabi_dbg_debug_do( proc->last_enable = caller; )
+	}
+
+	// Enable interrupts by decrementing the counter
+	// Don't execute any pending CtxSwitch even if counter reaches 0
+	void enable_interrupts_noPoll() {
+		unsigned short prev = kernelTLS.preemption_state.disable_count;
+		kernelTLS.preemption_state.disable_count -= 1;
+		verifyf( prev != 0u, "Incremented from %u\n", prev );                     // If this triggers someone enabled interrupts that were already enabled
+		if( prev == 1 ) {
+			#if GCC_VERSION > 50000
+			static_assert(__atomic_always_lock_free(sizeof(kernelTLS.preemption_state.enabled), &kernelTLS.preemption_state.enabled), "Must be lock-free");
+			#endif
+			// Set enabled flag to true
+			// should be atomic to avoid preemption in the middle of the operation.
+			// use memory order RELAXED since there are no inter-thread requirements on this variable
+			__atomic_store_n(&kernelTLS.preemption_state.enabled, true, __ATOMIC_RELAXED);
+
+			// Signal the compiler that a fence is needed but only for signal handlers
+			__atomic_signal_fence(__ATOMIC_RELEASE);
+		}
+	}
+}
+
+// pthread_sigmask wrapper : unblock a single signal for the calling thread, abort on failure
+static inline void signal_unblock( int sig ) {
+	sigset_t mask;
+	sigemptyset( &mask );
+	sigaddset( &mask, sig );
+
+	if ( pthread_sigmask( SIG_UNBLOCK, &mask, NULL ) != 0 ) {   // pthread_sigmask returns an error number on failure, never -1
+		abort( "internal error, pthread_sigmask" );
+	}
+}
+
+// pthread_sigmask wrapper : block a single signal for the calling thread, abort on failure
+static inline void signal_block( int sig ) {
+	sigset_t mask;
+	sigemptyset( &mask );
+	sigaddset( &mask, sig );
+
+	if ( pthread_sigmask( SIG_BLOCK, &mask, NULL ) != 0 ) {   // pthread_sigmask returns an error number on failure, never -1
+		abort( "internal error, pthread_sigmask" );
+	}
+}
+
+// pthread_sigqueue wrapper : queue SIGUSR1 carrying PREEMPT_NORMAL to a processor's kernel thread
+static void preempt( processor * this ) {
+	sigval_t value = { PREEMPT_NORMAL };
+	pthread_sigqueue( this->kernel_thread, SIGUSR1, value );
+}
+
+// reserved for future use
+static void timeout( thread_desc * this ) {
+	//TODO : implement waking threads
+}
+
+// KERNEL ONLY
+// Check if a CtxSwitch signal handler should defer
+// If true  : preemption is safe
+// If false : preemption is unsafe and marked as pending
+static inline bool preemption_ready() {
+	// Preemption is safe only when interrupts are enabled and no preemption is already in progress
+	bool ready = kernelTLS.preemption_state.enabled && ! kernelTLS.preemption_state.in_progress;
+
+	// Adjust the pending flag accordingly
+	kernelTLS.this_processor->pending_preemption = !ready;
+	return ready;
+}
+
+//=============================================================================================
+// Kernel Signal Startup/Shutdown logic
+//=============================================================================================
+
+// Startup routine to activate preemption
+// Called from kernel_startup
+void kernel_start_preemption() {
+	__cfaabi_dbg_print_safe( "Kernel : Starting preemption\n" );
+
+	// Start with preemption disabled until ready
+	kernelTLS.preemption_state.enabled = false;
+	kernelTLS.preemption_state.disable_count = 1;
+
+	// Initialize the event kernel
+	event_kernel = (event_kernel_t *)&storage_event_kernel;
+	(*event_kernel){};
+
+	// Setup proper signal handlers
+	__cfaabi_sigaction( SIGUSR1, sigHandler_ctxSwitch, SA_SIGINFO | SA_RESTART );         // CtxSwitch handler
+
+	signal_block( SIGALRM );                          // SIGALRM is only consumed by the alarm thread
+
+	// Without the alarm thread 'alarm_thread' is never initialized, so failing to create it is fatal
+	if ( pthread_create( &alarm_thread, NULL, alarm_loop, NULL ) != 0 ) { abort( "internal error, pthread_create" ); }
+}
+
+// Shutdown routine to deactivate preemption
+// Called from kernel_shutdown
+void kernel_stop_preemption() {
+	__cfaabi_dbg_print_safe( "Kernel : Preemption stopping\n" );
+
+	// Block all signals in this thread since we are already shutting down
+	sigset_t mask;
+	sigfillset( &mask );
+	pthread_sigmask( SIG_BLOCK, &mask, NULL );        // not sigprocmask : its behaviour is unspecified in a multi-threaded process
+
+	// Notify the alarm thread of the shutdown
+	sigval val = { 1 };
+	pthread_sigqueue( alarm_thread, SIGALRM, val );
+
+	// Wait for the preemption thread to finish
+	pthread_join( alarm_thread, NULL );
+
+	// Preemption is now fully stopped
+
+	__cfaabi_dbg_print_safe( "Kernel : Preemption stopped\n" );
+}
+
+// Raii ctor/dtor for the preemption_scope
+// Used by thread to control when they want to receive preemption signals
+void ?{}( preemption_scope & this, processor * proc ) {
+	(this.alarm){ proc, (Time){ 0 }, 0`s };
+	this.proc = proc;
+	this.proc->preemption_alarm = &this.alarm;
+
+	update_preemption( this.proc, this.proc->cltr->preemption_rate );
+}
+
+void ^?{}( preemption_scope & this ) {
+	disable_interrupts();
+
+	update_preemption( this.proc, 0`s );
+}
+
+//=============================================================================================
+// Kernel Signal Handlers
+//=============================================================================================
+
+// Context switch signal handler
+// Receives SIGUSR1 signal and causes the current thread to yield
+static void sigHandler_ctxSwitch( __CFA_SIGPARMS__ ) {
+	__cfaabi_dbg_debug_do( last_interrupt = (void *)(cxt->uc_mcontext.CFA_REG_IP); )
+
+	// SKULLDUGGERY: if a thread creates a processor and then immediately deletes it,
+	// the interrupt that is supposed to force the kernel thread to preempt might arrive
+	// before the kernel thread has even started running. When that happens an interrupt
+	// with a null 'this_processor' will be caught, just ignore it.
+	if(! kernelTLS.this_processor ) return;
+
+	choose(sfp->si_value.sival_int) {
+		case PREEMPT_NORMAL   : ;// Normal case, nothing to do here
+		case PREEMPT_TERMINATE: verify( __atomic_load_n( &kernelTLS.this_processor->do_terminate, __ATOMIC_SEQ_CST ) );
+		default:
+			abort( "internal error, signal value is %d", sfp->si_value.sival_int );
+	}
+
+	// Check if it is safe to preempt here, otherwise the preemption is left pending
+	if( !preemption_ready() ) { return; }
+
+	__cfaabi_dbg_print_buffer_decl( " KERNEL: preempting core %p (%p @ %p).\n", kernelTLS.this_processor, kernelTLS.this_thread, (void *)(cxt->uc_mcontext.CFA_REG_IP) );
+
+	// Sync flag : prevent recursive calls to the signal handler
+	kernelTLS.preemption_state.in_progress = true;
+
+	// Restore the signal mask of the interrupted context before context switching.
+	#if GCC_VERSION > 50000
+	static_assert( sizeof( sigset_t ) == sizeof( cxt->uc_sigmask ), "Expected cxt->uc_sigmask to be of sigset_t" );
+	#endif
+	if ( pthread_sigmask( SIG_SETMASK, (sigset_t *)&(cxt->uc_sigmask), NULL ) != 0 ) {   // returns an error number on failure, never -1
+		abort( "internal error, sigprocmask" );
+	}
+
+	// TODO: this should go in finish action
+	// Clear the in progress flag
+	kernelTLS.preemption_state.in_progress = false;
+
+	// Preemption can occur here
+
+	BlockInternal( kernelTLS.this_thread ); // Do the actual CtxSwitch
+}
+
+// Main of the alarm thread
+// Waits on SIGALRM and sends SIGUSR1 to whoever needs it
+static void * alarm_loop( __attribute__((unused)) void * args ) {
+	// Block every signal in this thread : SIGALRM is collected synchronously with sigwaitinfo below
+	sigset_t mask;
+	sigfillset(&mask);
+	if ( pthread_sigmask( SIG_BLOCK, &mask, NULL ) != 0 ) {   // pthread_sigmask returns an error number on failure, never -1
+		abort( "internal error, pthread_sigmask" );
+	}
+
+	sigemptyset( &mask );
+	sigaddset( &mask, SIGALRM );
+
+	// Main loop
+	while( true ) {
+		// Wait for a sigalrm
+		siginfo_t info;
+		int sig = sigwaitinfo( &mask, &info );
+
+		if( sig < 0 ) {
+			// Error! Save errno before anything else can overwrite it
+			int err = errno;
+			switch( err ) {
+				case EAGAIN :
+				case EINTR :
+					{__cfaabi_dbg_print_buffer_decl( " KERNEL: Spurious wakeup %d.\n", err );}
+					continue;
+				case EINVAL :
+					abort( "Timeout was invalid." );
+				default:
+					abort( "Unhandled error %d", err);
+			}
+		}
+
+		// If another signal arrived something went wrong
+		assertf(sig == SIGALRM, "Kernel Internal Error, sigwait: Unexpected signal %d (%d : %d)\n", sig, info.si_code, info.si_value.sival_int);
+
+		// __cfaabi_dbg_print_safe( "Kernel : Caught alarm from %d with %d\n", info.si_code, info.si_value.sival_int );
+		// Switch on the code (a.k.a. the sender) to decide what to do
+		switch( info.si_code )
+		{
+		// Timers can apparently be marked as sent for the kernel
+		// In either case, tick preemption
+		case SI_TIMER:
+		case SI_KERNEL:
+			// __cfaabi_dbg_print_safe( "Kernel : Preemption thread tick\n" );
+			lock( event_kernel->lock __cfaabi_dbg_ctx2 );
+			tick_preemption();
+			unlock( event_kernel->lock );
+			break;
+		// Signal was not sent by the kernel but by another thread
+		case SI_QUEUE:
+			// For now, other threads only signal the alarm thread to shut it down
+			// If this needs to change use info.si_value and handle the case here
+			goto EXIT;
+		}
+	}
+
+EXIT:
+	__cfaabi_dbg_print_safe( "Kernel : Preemption thread stopping\n" );
+	return NULL;
+}
+
+//=============================================================================================
+// Kernel Signal Debug
+//=============================================================================================
+
+void __cfaabi_check_preemption() {
+	bool ready = kernelTLS.preemption_state.enabled;
+	if(!ready) { abort("Preemption should be ready"); }
+
+	sigset_t oldset;
+	int ret;
+	ret = pthread_sigmask(0, NULL, &oldset);
+	if(ret != 0) { abort("ERROR sigprocmask returned %d", ret); }
+
+	ret = sigismember(&oldset, SIGUSR1);
+	if(ret <  0) { abort("ERROR sigismember returned %d", ret); }
+	if(ret == 1) { abort("ERROR SIGUSR1 is disabled"); }
+
+	ret = sigismember(&oldset, SIGALRM);
+	if(ret <  0) { abort("ERROR sigismember returned %d", ret); }
+	if(ret == 0) { abort("ERROR SIGALRM is enabled"); }
+
+	ret = sigismember(&oldset, SIGTERM);
+	if(ret <  0) { abort("ERROR sigismember returned %d", ret); }
+	if(ret == 1) { abort("ERROR SIGTERM is disabled"); }
+}
+
+#ifdef __CFA_WITH_VERIFY__
+bool __cfaabi_dbg_in_kernel() {
+	return !kernelTLS.preemption_state.enabled;
+}
+#endif
+
+// Local Variables: //
+// mode: c //
+// tab-width: 4 //
+// End: //
Index: libcfa/src/concurrency/thread.c
===================================================================
--- libcfa/src/concurrency/thread.c	(revision bf71cfdb7285490eee552b461158846f626cc52f)
+++ 	(revision )
@@ -1,135 +1,0 @@
-//
-// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
-//
-// The contents of this file are covered under the licence agreement in the
-// file "LICENCE" distributed with Cforall.
-//
-// thread.c --
-//
-// Author           : Thierry Delisle
-// Created On       : Tue Jan 17 12:27:26 2017
-// Last Modified By : Peter A. Buhr
-// Last Modified On : Fri Mar 30 17:19:52 2018
-// Update Count     : 8
-//
-
-#include "thread"
-
-#include "kernel_private.h"
-
-#define __CFA_INVOKE_PRIVATE__
-#include "invoke.h"
-
-extern "C" {
-	#include <fenv.h>
-	#include <stddef.h>
-}
-
-//extern volatile thread_local processor * this_processor;
-
-//-----------------------------------------------------------------------------
-// Thread ctors and dtors
-void ?{}(thread_desc & this, const char * const name, cluster & cl, void * storage, size_t storageSize ) with( this ) {
-	self_cor{ name, storage, storageSize };
-	verify(&self_cor);
-	curr_cor = &self_cor;
-	self_mon.owner = &this;
-	self_mon.recursion = 1;
-	self_mon_p = &self_mon;
-	curr_cluster = &cl;
-	next = NULL;
-
-	node.next = NULL;
-	node.prev = NULL;
-	doregister(curr_cluster, this);
-
-	monitors{ &self_mon_p, 1, (fptr_t)0 };
-}
-
-void ^?{}(thread_desc& this) with( this ) {
-	unregister(curr_cluster, this);
-	^self_cor{};
-}
-
-forall( dtype T | sized(T) | is_thread(T) | { void ?{}(T&); } )
-void ?{}( scoped(T)& this ) with( this ) {
-	handle{};
-	__thrd_start(handle);
-}
-
-forall( dtype T, ttype P | sized(T) | is_thread(T) | { void ?{}(T&, P); } )
-void ?{}( scoped(T)& this, P params ) with( this ) {
-	handle{ params };
-	__thrd_start(handle);
-}
-
-forall( dtype T | sized(T) | is_thread(T) )
-void ^?{}( scoped(T)& this ) with( this ) {
-	^handle{};
-}
-
-//-----------------------------------------------------------------------------
-// Starting and stopping threads
-forall( dtype T | is_thread(T) )
-void __thrd_start( T& this ) {
-	coroutine_desc* thrd_c = get_coroutine(this);
-	thread_desc   * thrd_h = get_thread   (this);
-	thrd_c->last = TL_GET( this_coroutine );
-
-	// __cfaabi_dbg_print_safe("Thread start : %p (t %p, c %p)\n", this, thrd_c, thrd_h);
-
-	disable_interrupts();
-	create_stack(&thrd_c->stack, thrd_c->stack.size);
-	kernelTLS.this_coroutine = thrd_c;
-	CtxStart(&this, CtxInvokeThread);
-	assert( thrd_c->last->stack.context );
-	CtxSwitch( thrd_c->last->stack.context, thrd_c->stack.context );
-
-	ScheduleThread(thrd_h);
-	enable_interrupts( __cfaabi_dbg_ctx );
-}
-
-extern "C" {
-	// KERNEL ONLY
-	void __finish_creation(void) {
-		coroutine_desc* thrd_c = kernelTLS.this_coroutine;
-		ThreadCtxSwitch( thrd_c, thrd_c->last );
-	}
-}
-
-void yield( void ) {
-	// Safety note : This could cause some false positives due to preemption
-      verify( TL_GET( preemption_state.enabled ) );
-	BlockInternal( TL_GET( this_thread ) );
-	// Safety note : This could cause some false positives due to preemption
-      verify( TL_GET( preemption_state.enabled ) );
-}
-
-void yield( unsigned times ) {
-	for( unsigned i = 0; i < times; i++ ) {
-		yield();
-	}
-}
-
-// KERNEL ONLY
-void ThreadCtxSwitch(coroutine_desc* src, coroutine_desc* dst) {
-	// set state of current coroutine to inactive
-	src->state = src->state == Halted ? Halted : Inactive;
-	dst->state = Active;
-
-	// set new coroutine that the processor is executing
-	// and context switch to it
-	kernelTLS.this_coroutine = dst;
-	assert( src->stack.context );
-	CtxSwitch( src->stack.context, dst->stack.context );
-	kernelTLS.this_coroutine = src;
-
-	// set state of new coroutine to active
-	dst->state = dst->state == Halted ? Halted : Inactive;
-	src->state = Active;
-}
-
-// Local Variables: //
-// mode: c //
-// tab-width: 4 //
-// End: //
Index: libcfa/src/concurrency/thread.cfa
===================================================================
--- libcfa/src/concurrency/thread.cfa	(revision 4dcaed2c212a42d633824e9b88053258a2d0f969)
+++ libcfa/src/concurrency/thread.cfa	(revision 4dcaed2c212a42d633824e9b88053258a2d0f969)
@@ -0,0 +1,135 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// thread.cfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Tue Jan 17 12:27:26 2017
+// Last Modified By : Peter A. Buhr
+// Last Modified On : Fri Mar 30 17:19:52 2018
+// Update Count     : 8
+//
+
+#include "thread"
+
+#include "kernel_private.h"
+
+#define __CFA_INVOKE_PRIVATE__
+#include "invoke.h"
+
+extern "C" {
+	#include <fenv.h>
+	#include <stddef.h>
+}
+
+//extern volatile thread_local processor * this_processor;
+
+//-----------------------------------------------------------------------------
+// Thread ctors and dtors
+void ?{}(thread_desc & this, const char * const name, cluster & cl, void * storage, size_t storageSize ) with( this ) {	// constructor: fills in the descriptor's fields and calls doregister on cluster cl
+	self_cor{ name, storage, storageSize };	// construct the thread's own coroutine member from the caller's name and storage arguments
+	verify(&self_cor);
+	curr_cor = &self_cor;	// curr_cor starts out pointing at the thread's own coroutine
+	self_mon.owner = &this;	// the thread's own monitor starts with this thread as owner ...
+	self_mon.recursion = 1;	// ... and a recursion count of 1
+	self_mon_p = &self_mon;
+	curr_cluster = &cl;
+	next = NULL;
+
+	node.next = NULL;	// node links are cleared before doregister is called
+	node.prev = NULL;
+	doregister(curr_cluster, this);	// the destructor makes the matching unregister call
+
+	monitors{ &self_mon_p, 1, (fptr_t)0 };	// presumably (monitor array, count, function), i.e. a single entry self_mon_p - TODO confirm
+}
+
+void ^?{}(thread_desc& this) with( this ) {	// destructor: unregisters from curr_cluster, then destroys the coroutine member
+	unregister(curr_cluster, this);	// counterpart of the doregister call made by the constructor
+	^self_cor{};
+}
+
+forall( dtype T | sized(T) | is_thread(T) | { void ?{}(T&); } )
+void ?{}( scoped(T)& this ) with( this ) {	// constructor: default-constructs the wrapped handle and immediately starts it
+	handle{};
+	__thrd_start(handle);
+}
+
+forall( dtype T, ttype P | sized(T) | is_thread(T) | { void ?{}(T&, P); } )
+void ?{}( scoped(T)& this, P params ) with( this ) {	// constructor: forwards params to the handle's constructor, then starts it
+	handle{ params };
+	__thrd_start(handle);
+}
+
+forall( dtype T | sized(T) | is_thread(T) )
+void ^?{}( scoped(T)& this ) with( this ) {	// destructor: runs the destructor of the wrapped handle
+	^handle{};
+}
+
+//-----------------------------------------------------------------------------
+// Starting and stopping threads
+forall( dtype T | is_thread(T) )
+void __thrd_start( T& this ) {	// creates the stack for this, switches onto it once, then calls ScheduleThread on its thread descriptor
+	coroutine_desc* thrd_c = get_coroutine(this);
+	thread_desc   * thrd_h = get_thread   (this);
+	thrd_c->last = TL_GET( this_coroutine );	// record the caller's coroutine; __finish_creation switches back to thrd_c->last
+
+	// __cfaabi_dbg_print_safe("Thread start : %p (t %p, c %p)\n", this, thrd_c, thrd_h);
+
+	disable_interrupts();	// interrupts stay disabled until the enable_interrupts call at the end of this function
+	create_stack(&thrd_c->stack, thrd_c->stack.size);
+	kernelTLS.this_coroutine = thrd_c;	// the new coroutine is recorded as current before the switch below
+	CtxStart(&this, CtxInvokeThread);
+	assert( thrd_c->last->stack.context );
+	CtxSwitch( thrd_c->last->stack.context, thrd_c->stack.context );	// presumably resumes here once __finish_creation switches back to thrd_c->last - TODO confirm in the invoke code
+
+	ScheduleThread(thrd_h);
+	enable_interrupts( __cfaabi_dbg_ctx );
+}
+
+extern "C" {
+	// KERNEL ONLY: switches from the current coroutine to the one stored in its last field
+	void __finish_creation(void) {
+		coroutine_desc* thrd_c = kernelTLS.this_coroutine;
+		ThreadCtxSwitch( thrd_c, thrd_c->last );	// last is set by __thrd_start to the coroutine that started this thread
+	}
+}
+
+void yield( void ) {	// calls BlockInternal on the current thread, checking before and after that preemption is enabled
+	// Safety note : preemption is expected to be enabled here; this check could cause some false positives due to preemption
+      verify( TL_GET( preemption_state.enabled ) );
+	BlockInternal( TL_GET( this_thread ) );	// presumably gives up the processor and makes this thread ready again - TODO confirm in the kernel
+	// Safety note : same check after resuming; it could also cause some false positives due to preemption
+      verify( TL_GET( preemption_state.enabled ) );
+}
+
+void yield( unsigned times ) {	// calls yield() exactly times times; does nothing when times is 0
+	for( unsigned i = 0; i < times; i++ ) {
+		yield();
+	}
+}
+
+// KERNEL ONLY: context switches from coroutine src to coroutine dst, updating both states and kernelTLS.this_coroutine
+void ThreadCtxSwitch(coroutine_desc* src, coroutine_desc* dst) {
+	// mark src inactive (a Halted src stays Halted) and dst active
+	src->state = src->state == Halted ? Halted : Inactive;
+	dst->state = Active;
+
+	// set new coroutine that the processor is executing
+	// and context switch to it
+	kernelTLS.this_coroutine = dst;
+	assert( src->stack.context );
+	CtxSwitch( src->stack.context, dst->stack.context );
+	kernelTLS.this_coroutine = src;	// runs after CtxSwitch returns: src is recorded as the current coroutine again
+
+	// after the switch has returned: mark dst inactive (a Halted dst stays Halted) and src active again
+	dst->state = dst->state == Halted ? Halted : Inactive;
+	src->state = Active;
+}
+
+// Local Variables: //
+// mode: c //
+// tab-width: 4 //
+// End: //