Index: libcfa/src/concurrency/coroutine.cfa
===================================================================
--- libcfa/src/concurrency/coroutine.cfa	(revision 2fab24e3bfb7b64299dc6bb1b6e45ef4248410cc)
+++ libcfa/src/concurrency/coroutine.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -46,10 +46,10 @@
 
 //-----------------------------------------------------------------------------
-FORALL_DATA_INSTANCE(CoroutineCancelled, (dtype coroutine_t), (coroutine_t))
-
-forall(dtype T)
+FORALL_DATA_INSTANCE(CoroutineCancelled, (coroutine_t &), (coroutine_t))
+
+forall(T &)
 void mark_exception(CoroutineCancelled(T) *) {}
 
-forall(dtype T)
+forall(T &)
 void copy(CoroutineCancelled(T) * dst, CoroutineCancelled(T) * src) {
 	dst->virtual_table = src->virtual_table;
@@ -58,5 +58,5 @@
 }
 
-forall(dtype T)
+forall(T &)
 const char * msg(CoroutineCancelled(T) *) {
 	return "CoroutineCancelled(...)";
@@ -64,5 +64,5 @@
 
 // This code should not be inlined. It is the error path on resume.
-forall(dtype T | is_coroutine(T))
+forall(T & | is_coroutine(T))
 void __cfaehm_cancelled_coroutine( T & cor, $coroutine * desc ) {
 	verify( desc->cancellation );
@@ -148,5 +148,5 @@
 // Part of the Public API
 // Not inline since only ever called once per coroutine
-forall(dtype T | is_coroutine(T))
+forall(T & | is_coroutine(T))
 void prime(T& cor) {
 	$coroutine* this = get_coroutine(cor);
Index: libcfa/src/concurrency/coroutine.hfa
===================================================================
--- libcfa/src/concurrency/coroutine.hfa	(revision 2fab24e3bfb7b64299dc6bb1b6e45ef4248410cc)
+++ libcfa/src/concurrency/coroutine.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -22,13 +22,13 @@
 //-----------------------------------------------------------------------------
 // Exception thrown from resume when a coroutine stack is cancelled.
-FORALL_DATA_EXCEPTION(CoroutineCancelled, (dtype coroutine_t), (coroutine_t)) (
+FORALL_DATA_EXCEPTION(CoroutineCancelled, (coroutine_t &), (coroutine_t)) (
 	coroutine_t * the_coroutine;
 	exception_t * the_exception;
 );
 
-forall(dtype T)
+forall(T &)
 void copy(CoroutineCancelled(T) * dst, CoroutineCancelled(T) * src);
 
-forall(dtype T)
+forall(T &)
 const char * msg(CoroutineCancelled(T) *);
 
@@ -37,5 +37,5 @@
 // Anything that implements this trait can be resumed.
 // Anything that is resumed is a coroutine.
-trait is_coroutine(dtype T | IS_RESUMPTION_EXCEPTION(CoroutineCancelled, (T))) {
+trait is_coroutine(T & | IS_RESUMPTION_EXCEPTION(CoroutineCancelled, (T))) {
 	void main(T & this);
 	$coroutine * get_coroutine(T & this);
@@ -60,5 +60,5 @@
 //-----------------------------------------------------------------------------
 // Public coroutine API
-forall(dtype T | is_coroutine(T))
+forall(T & | is_coroutine(T))
 void prime(T & cor);
 
@@ -72,5 +72,5 @@
 	void __cfactx_invoke_coroutine(void (*main)(void *), void * this);
 
-	forall(dtype T)
+	forall(T &)
 	void __cfactx_start(void (*main)(T &), struct $coroutine * cor, T & this, void (*invoke)(void (*main)(void *), void *));
 
@@ -129,9 +129,9 @@
 }
 
-forall(dtype T | is_coroutine(T))
+forall(T & | is_coroutine(T))
 void __cfaehm_cancelled_coroutine( T & cor, $coroutine * desc );
 
 // Resume implementation inlined for performance
-forall(dtype T | is_coroutine(T))
+forall(T & | is_coroutine(T))
 static inline T & resume(T & cor) {
 	// optimization : read TLS once and reuse it
Index: libcfa/src/concurrency/future.hfa
===================================================================
--- libcfa/src/concurrency/future.hfa	(revision 2fab24e3bfb7b64299dc6bb1b6e45ef4248410cc)
+++ libcfa/src/concurrency/future.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -19,5 +19,5 @@
 #include "monitor.hfa"
 
-forall( otype T ) {
+forall( T ) {
 	struct future {
 		inline future_t;
@@ -58,5 +58,5 @@
 }
 
-forall( otype T ) {
+forall( T ) {
 	monitor multi_future {
 		inline future_t;
Index: libcfa/src/concurrency/io/types.hfa
===================================================================
--- libcfa/src/concurrency/io/types.hfa	(revision 2fab24e3bfb7b64299dc6bb1b6e45ef4248410cc)
+++ libcfa/src/concurrency/io/types.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -5,5 +5,6 @@
 // file "LICENCE" distributed with Cforall.
 //
-// io/types.hfa --
+// io/types.hfa -- PRIVATE
+// Types used by the I/O subsystem
 //
 // Author           : Thierry Delisle
@@ -21,4 +22,5 @@
 
 #include "bits/locks.hfa"
+#include "kernel/fwd.hfa"
 
 #if defined(CFA_HAVE_LINUX_IO_URING_H)
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 2fab24e3bfb7b64299dc6bb1b6e45ef4248410cc)
+++ libcfa/src/concurrency/kernel.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -224,5 +224,5 @@
 	}
 
-	V( this->terminated );
+	post( this->terminated );
 
 	if(this == mainProcessor) {
@@ -624,30 +624,6 @@
 // Unexpected Terminating logic
 //=============================================================================================
-
-extern "C" {
-	extern void __cfaabi_real_abort(void);
-}
-static volatile bool kernel_abort_called = false;
-
-void * kernel_abort(void) __attribute__ ((__nothrow__)) {
-	// abort cannot be recursively entered by the same or different processors because all signal handlers return when
-	// the globalAbort flag is true.
-	bool first = !__atomic_test_and_set( &kernel_abort_called, __ATOMIC_SEQ_CST);
-
-	// first task to abort ?
-	if ( !first ) {
-		// We aren't the first to abort.
-		// I give up, just let C handle it
-		__cfaabi_real_abort();
-	}
-
-	// disable interrupts, it no longer makes sense to try to interrupt this processor
-	disable_interrupts();
-
-	return __cfaabi_tls.this_thread;
-}
-
-void kernel_abort_msg( void * kernel_data, char * abort_text, int abort_text_size ) {
-	$thread * thrd = ( $thread * ) kernel_data;
+void __kernel_abort_msg( char * abort_text, int abort_text_size ) {
+	$thread * thrd = __cfaabi_tls.this_thread;
 
 	if(thrd) {
@@ -669,6 +645,6 @@
 }
 
-int kernel_abort_lastframe( void ) __attribute__ ((__nothrow__)) {
-	return get_coroutine(kernelTLS().this_thread) == get_coroutine(mainThread) ? 4 : 2;
+int __kernel_abort_lastframe( void ) __attribute__ ((__nothrow__)) {
+	return get_coroutine(__cfaabi_tls.this_thread) == get_coroutine(mainThread) ? 4 : 2;
 }
 
@@ -688,62 +664,4 @@
 // Kernel Utilities
 //=============================================================================================
-//-----------------------------------------------------------------------------
-// Locks
-void  ?{}( semaphore & this, int count = 1 ) {
-	(this.lock){};
-	this.count = count;
-	(this.waiting){};
-}
-void ^?{}(semaphore & this) {}
-
-bool P(semaphore & this) with( this ){
-	lock( lock __cfaabi_dbg_ctx2 );
-	count -= 1;
-	if ( count < 0 ) {
-		// queue current task
-		append( waiting, active_thread() );
-
-		// atomically release spin lock and block
-		unlock( lock );
-		park();
-		return true;
-	}
-	else {
-	    unlock( lock );
-	    return false;
-	}
-}
-
-bool V(semaphore & this) with( this ) {
-	$thread * thrd = 0p;
-	lock( lock __cfaabi_dbg_ctx2 );
-	count += 1;
-	if ( count <= 0 ) {
-		// remove task at head of waiting list
-		thrd = pop_head( waiting );
-	}
-
-	unlock( lock );
-
-	// make new owner
-	unpark( thrd );
-
-	return thrd != 0p;
-}
-
-bool V(semaphore & this, unsigned diff) with( this ) {
-	$thread * thrd = 0p;
-	lock( lock __cfaabi_dbg_ctx2 );
-	int release = max(-count, (int)diff);
-	count += diff;
-	for(release) {
-		unpark( pop_head( waiting ) );
-	}
-
-	unlock( lock );
-
-	return thrd != 0p;
-}
-
 //-----------------------------------------------------------------------------
 // Debug
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 2fab24e3bfb7b64299dc6bb1b6e45ef4248410cc)
+++ libcfa/src/concurrency/kernel.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -5,5 +5,5 @@
 // file "LICENCE" distributed with Cforall.
 //
-// kernel --
+// kernel -- Header containing the core of the kernel API
 //
 // Author           : Thierry Delisle
@@ -24,20 +24,66 @@
 extern "C" {
 	#include <bits/pthreadtypes.h>
+	#include <pthread.h>
 	#include <linux/types.h>
 }
 
 //-----------------------------------------------------------------------------
-// Locks
-struct semaphore {
-	__spinlock_t lock;
-	int count;
-	__queue_t($thread) waiting;
-};
-
-void  ?{}(semaphore & this, int count = 1);
-void ^?{}(semaphore & this);
-bool   P (semaphore & this);
-bool   V (semaphore & this);
-bool   V (semaphore & this, unsigned count);
+// Underlying Locks
+#ifdef __CFA_WITH_VERIFY__
+	extern bool __cfaabi_dbg_in_kernel();
+#endif
+
+extern "C" {
+	char * strerror(int);
+}
+#define CHECKED(x) { int err = x; if( err != 0 ) abort("KERNEL ERROR: Operation \"" #x "\" return error %d - %s\n", err, strerror(err)); }
+
+struct __bin_sem_t {
+	pthread_mutex_t 	lock;
+	pthread_cond_t  	cond;
+	int     		val;
+};
+
+static inline void ?{}(__bin_sem_t & this) with( this ) {
+	// Create the mutex with error checking
+	pthread_mutexattr_t mattr;
+	pthread_mutexattr_init( &mattr );
+	pthread_mutexattr_settype( &mattr, PTHREAD_MUTEX_ERRORCHECK_NP);
+	pthread_mutex_init(&lock, &mattr);
+
+	pthread_cond_init (&cond, (const pthread_condattr_t *)0p);  // workaround trac#208: cast should not be required
+	val = 0;
+}
+
+static inline void ^?{}(__bin_sem_t & this) with( this ) {
+	CHECKED( pthread_mutex_destroy(&lock) );
+	CHECKED( pthread_cond_destroy (&cond) );
+}
+
+static inline void wait(__bin_sem_t & this) with( this ) {
+	verify(__cfaabi_dbg_in_kernel());
+	CHECKED( pthread_mutex_lock(&lock) );
+		while(val < 1) {
+			pthread_cond_wait(&cond, &lock);
+		}
+		val -= 1;
+	CHECKED( pthread_mutex_unlock(&lock) );
+}
+
+static inline bool post(__bin_sem_t & this) with( this ) {
+	bool needs_signal = false;
+
+	CHECKED( pthread_mutex_lock(&lock) );
+		if(val < 1) {
+			val += 1;
+			pthread_cond_signal(&cond);
+			needs_signal = true;
+		}
+	CHECKED( pthread_mutex_unlock(&lock) );
+
+	return needs_signal;
+}
+
+#undef CHECKED
 
 
@@ -91,5 +137,5 @@
 
 	// Termination synchronisation (user semaphore)
-	semaphore terminated;
+	oneshot terminated;
 
 	// pthread Stack
Index: libcfa/src/concurrency/kernel/fwd.hfa
===================================================================
--- libcfa/src/concurrency/kernel/fwd.hfa	(revision 2fab24e3bfb7b64299dc6bb1b6e45ef4248410cc)
+++ libcfa/src/concurrency/kernel/fwd.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -5,5 +5,6 @@
 // file "LICENCE" distributed with Cforall.
 //
-// kernel/fwd.hfa --
+// kernel/fwd.hfa -- PUBLIC
+// Fundamental code needed to implement threading M.E.S. algorithms.
 //
 // Author           : Thierry Delisle
@@ -134,4 +135,258 @@
 		extern uint64_t thread_rand();
 
+		// Semaphore which only supports a single thread
+		struct single_sem {
+			struct $thread * volatile ptr;
+		};
+
+		static inline {
+			void  ?{}(single_sem & this) {
+				this.ptr = 0p;
+			}
+
+			void ^?{}(single_sem &) {}
+
+			bool wait(single_sem & this) {
+				for() {
+					struct $thread * expected = this.ptr;
+					if(expected == 1p) {
+						if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+							return false;
+						}
+					}
+					else {
+						/* paranoid */ verify( expected == 0p );
+						if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+							park();
+							return true;
+						}
+					}
+
+				}
+			}
+
+			bool post(single_sem & this) {
+				for() {
+					struct $thread * expected = this.ptr;
+					if(expected == 1p) return false;
+					if(expected == 0p) {
+						if(__atomic_compare_exchange_n(&this.ptr, &expected, 1p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+							return false;
+						}
+					}
+					else {
+						if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+							unpark( expected );
+							return true;
+						}
+					}
+				}
+			}
+		}
+
+		// Synchronozation primitive which only supports a single thread and one post
+		// Similar to a binary semaphore with a 'one shot' semantic
+		// is expected to be discarded after each party call their side
+		struct oneshot {
+			// Internal state :
+			//     0p     : is initial state (wait will block)
+			//     1p     : fulfilled (wait won't block)
+			// any thread : a thread is currently waiting
+			struct $thread * volatile ptr;
+		};
+
+		static inline {
+			void  ?{}(oneshot & this) {
+				this.ptr = 0p;
+			}
+
+			void ^?{}(oneshot &) {}
+
+			// Wait for the post, return immidiately if it already happened.
+			// return true if the thread was parked
+			bool wait(oneshot & this) {
+				for() {
+					struct $thread * expected = this.ptr;
+					if(expected == 1p) return false;
+					/* paranoid */ verify( expected == 0p );
+					if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+						park();
+						/* paranoid */ verify( this.ptr == 1p );
+						return true;
+					}
+				}
+			}
+
+			// Mark as fulfilled, wake thread if needed
+			// return true if a thread was unparked
+			bool post(oneshot & this) {
+				struct $thread * got = __atomic_exchange_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
+				if( got == 0p ) return false;
+				unpark( got );
+				return true;
+			}
+		}
+
+		// base types for future to build upon
+		// It is based on the 'oneshot' type to allow multiple futures
+		// to block on the same instance, permitting users to block a single
+		// thread on "any of" [a given set of] futures.
+		// does not support multiple threads waiting on the same future
+		struct future_t {
+			// Internal state :
+			//     0p      : is initial state (wait will block)
+			//     1p      : fulfilled (wait won't block)
+			//     2p      : in progress ()
+			//     3p      : abandoned, server should delete
+			// any oneshot : a context has been setup to wait, a thread could wait on it
+			struct oneshot * volatile ptr;
+		};
+
+		static inline {
+			void  ?{}(future_t & this) {
+				this.ptr = 0p;
+			}
+
+			void ^?{}(future_t &) {}
+
+			void reset(future_t & this) {
+				// needs to be in 0p or 1p
+				__atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);
+			}
+
+			// check if the future is available
+			bool available( future_t & this ) {
+				return this.ptr == 1p;
+			}
+
+			// Prepare the future to be waited on
+			// intented to be use by wait, wait_any, waitfor, etc. rather than used directly
+			bool setup( future_t & this, oneshot & wait_ctx ) {
+				/* paranoid */ verify( wait_ctx.ptr == 0p );
+				// The future needs to set the wait context
+				for() {
+					struct oneshot * expected = this.ptr;
+					// Is the future already fulfilled?
+					if(expected == 1p) return false; // Yes, just return false (didn't block)
+
+					// The future is not fulfilled, try to setup the wait context
+					/* paranoid */ verify( expected == 0p );
+					if(__atomic_compare_exchange_n(&this.ptr, &expected, &wait_ctx, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+						return true;
+					}
+				}
+			}
+
+			// Stop waiting on a future
+			// When multiple futures are waited for together in "any of" pattern
+			// futures that weren't fulfilled before the thread woke up
+			// should retract the wait ctx
+			// intented to be use by wait, wait_any, waitfor, etc. rather than used directly
+			void retract( future_t & this, oneshot & wait_ctx ) {
+				// Remove the wait context
+				struct oneshot * got = __atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);
+
+				// got == 0p: future was never actually setup, just return
+				if( got == 0p ) return;
+
+				// got == wait_ctx: since fulfil does an atomic_swap,
+				// if we got back the original then no one else saw context
+				// It is safe to delete (which could happen after the return)
+				if( got == &wait_ctx ) return;
+
+				// got == 1p: the future is ready and the context was fully consumed
+				// the server won't use the pointer again
+				// It is safe to delete (which could happen after the return)
+				if( got == 1p ) return;
+
+				// got == 2p: the future is ready but the context hasn't fully been consumed
+				// spin until it is safe to move on
+				if( got == 2p ) {
+					while( this.ptr != 1p ) Pause();
+					return;
+				}
+
+				// got == any thing else, something wen't wrong here, abort
+				abort("Future in unexpected state");
+			}
+
+			// Mark the future as abandoned, meaning it will be deleted by the server
+			bool abandon( future_t & this ) {
+				/* paranoid */ verify( this.ptr != 3p );
+
+				// Mark the future as abandonned
+				struct oneshot * got = __atomic_exchange_n( &this.ptr, 3p, __ATOMIC_SEQ_CST);
+
+				// If the future isn't already fulfilled, let the server delete it
+				if( got == 0p ) return false;
+
+				// got == 2p: the future is ready but the context hasn't fully been consumed
+				// spin until it is safe to move on
+				if( got == 2p ) {
+					while( this.ptr != 1p ) Pause();
+					got = 1p;
+				}
+
+				// The future is completed delete it now
+				/* paranoid */ verify( this.ptr != 1p );
+				free( &this );
+				return true;
+			}
+
+			// from the server side, mark the future as fulfilled
+			// delete it if needed
+			bool fulfil( future_t & this ) {
+				for() {
+					struct oneshot * expected = this.ptr;
+					// was this abandoned?
+					#if defined(__GNUC__) && __GNUC__ >= 7
+						#pragma GCC diagnostic push
+						#pragma GCC diagnostic ignored "-Wfree-nonheap-object"
+					#endif
+						if( expected == 3p ) { free( &this ); return false; }
+					#if defined(__GNUC__) && __GNUC__ >= 7
+						#pragma GCC diagnostic pop
+					#endif
+
+					/* paranoid */ verify( expected != 1p ); // Future is already fulfilled, should not happen
+					/* paranoid */ verify( expected != 2p ); // Future is bein fulfilled by someone else, this is even less supported then the previous case.
+
+					// If there is a wait context, we need to consume it and mark it as consumed after
+					// If there is no context then we can skip the in progress phase
+					struct oneshot * want = expected == 0p ? 1p : 2p;
+					if(__atomic_compare_exchange_n(&this.ptr, &expected, want, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+						if( expected == 0p ) { /* paranoid */ verify( this.ptr == 1p); return false; }
+						bool ret = post( *expected );
+						__atomic_store_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
+						return ret;
+					}
+				}
+
+			}
+
+			// Wait for the future to be fulfilled
+			bool wait( future_t & this ) {
+				oneshot temp;
+				if( !setup(this, temp) ) return false;
+
+				// Wait context is setup, just wait on it
+				bool ret = wait( temp );
+
+				// Wait for the future to tru
+				while( this.ptr == 2p ) Pause();
+				// Make sure the state makes sense
+				// Should be fulfilled, could be in progress but it's out of date if so
+				// since if that is the case, the oneshot was fulfilled (unparking this thread)
+				// and the oneshot should not be needed any more
+				__attribute__((unused)) struct oneshot * was = this.ptr;
+				/* paranoid */ verifyf( was == 1p, "Expected this.ptr to be 1p, was %p\n", was );
+
+				// Mark the future as fulfilled, to be consistent
+				// with potential calls to avail
+				// this.ptr = 1p;
+				return ret;
+			}
+		}
+
 		//-----------------------------------------------------------------------
 		// Statics call at the end of each thread to register statistics
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision 2fab24e3bfb7b64299dc6bb1b6e45ef4248410cc)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -199,5 +199,5 @@
 	void ?{}(processor & this) with( this ) {
 		( this.idle ){};
-		( this.terminated ){ 0 };
+		( this.terminated ){};
 		( this.runner ){};
 		init( this, "Main Processor", *mainCluster );
@@ -528,5 +528,5 @@
 void ?{}(processor & this, const char name[], cluster & _cltr) {
 	( this.idle ){};
-	( this.terminated ){ 0 };
+	( this.terminated ){};
 	( this.runner ){};
 
@@ -549,5 +549,5 @@
 		__wake_proc( &this );
 
-		P( terminated );
+		wait( terminated );
 		/* paranoid */ verify( active_processor() != &this);
 	}
Index: libcfa/src/concurrency/locks.cfa
===================================================================
--- libcfa/src/concurrency/locks.cfa	(revision 2fab24e3bfb7b64299dc6bb1b6e45ef4248410cc)
+++ libcfa/src/concurrency/locks.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -7,5 +7,5 @@
 //-----------------------------------------------------------------------------
 // info_thread
-forall(dtype L | is_blocking_lock(L)) {
+forall(L & | is_blocking_lock(L)) {
 	struct info_thread {
 		// used to put info_thread on a dl queue (aka sequence)
@@ -195,5 +195,5 @@
 //-----------------------------------------------------------------------------
 // alarm node wrapper
-forall(dtype L | is_blocking_lock(L)) {
+forall(L & | is_blocking_lock(L)) {
 	struct alarm_node_wrap {
 		alarm_node_t alarm_node;
@@ -239,5 +239,5 @@
 //-----------------------------------------------------------------------------
 // condition variable
-forall(dtype L | is_blocking_lock(L)) {
+forall(L & | is_blocking_lock(L)) {
 
 	void ?{}( condition_variable(L) & this ){
@@ -356,2 +356,60 @@
 	bool wait( condition_variable(L) & this, L & l, uintptr_t info, Time time         ) with(this) { WAIT_TIME( info, &l , time ) }
 }
+
+//-----------------------------------------------------------------------------
+// Semaphore
+void  ?{}( semaphore & this, int count = 1 ) {
+	(this.lock){};
+	this.count = count;
+	(this.waiting){};
+}
+void ^?{}(semaphore & this) {}
+
+bool P(semaphore & this) with( this ){
+	lock( lock __cfaabi_dbg_ctx2 );
+	count -= 1;
+	if ( count < 0 ) {
+		// queue current task
+		append( waiting, active_thread() );
+
+		// atomically release spin lock and block
+		unlock( lock );
+		park();
+		return true;
+	}
+	else {
+	    unlock( lock );
+	    return false;
+	}
+}
+
+bool V(semaphore & this) with( this ) {
+	$thread * thrd = 0p;
+	lock( lock __cfaabi_dbg_ctx2 );
+	count += 1;
+	if ( count <= 0 ) {
+		// remove task at head of waiting list
+		thrd = pop_head( waiting );
+	}
+
+	unlock( lock );
+
+	// make new owner
+	unpark( thrd );
+
+	return thrd != 0p;
+}
+
+bool V(semaphore & this, unsigned diff) with( this ) {
+	$thread * thrd = 0p;
+	lock( lock __cfaabi_dbg_ctx2 );
+	int release = max(-count, (int)diff);
+	count += diff;
+	for(release) {
+		unpark( pop_head( waiting ) );
+	}
+
+	unlock( lock );
+
+	return thrd != 0p;
+}
Index: libcfa/src/concurrency/locks.hfa
===================================================================
--- libcfa/src/concurrency/locks.hfa	(revision 2fab24e3bfb7b64299dc6bb1b6e45ef4248410cc)
+++ libcfa/src/concurrency/locks.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -13,5 +13,5 @@
 //-----------------------------------------------------------------------------
 // is_blocking_lock
-trait is_blocking_lock(dtype L | sized(L)) {
+trait is_blocking_lock(L & | sized(L)) {
 	// For synchronization locks to use when acquiring
 	void on_notify( L &, struct $thread * );
@@ -31,5 +31,5 @@
 // the info thread is a wrapper around a thread used
 // to store extra data for use in the condition variable
-forall(dtype L | is_blocking_lock(L)) {
+forall(L & | is_blocking_lock(L)) {
 	struct info_thread;
 
@@ -120,5 +120,5 @@
 //-----------------------------------------------------------------------------
 // Synchronization Locks
-forall(dtype L | is_blocking_lock(L)) {
+forall(L & | is_blocking_lock(L)) {
 	struct condition_variable {
 		// Spin lock used for mutual exclusion
@@ -157,2 +157,16 @@
 	bool wait( condition_variable(L) & this, L & l, uintptr_t info, Time time );
 }
+
+//-----------------------------------------------------------------------------
+// Semaphore
+struct semaphore {
+	__spinlock_t lock;
+	int count;
+	__queue_t($thread) waiting;
+};
+
+void  ?{}(semaphore & this, int count = 1);
+void ^?{}(semaphore & this);
+bool   P (semaphore & this);
+bool   V (semaphore & this);
+bool   V (semaphore & this, unsigned count);
Index: libcfa/src/concurrency/monitor.cfa
===================================================================
--- libcfa/src/concurrency/monitor.cfa	(revision 2fab24e3bfb7b64299dc6bb1b6e45ef4248410cc)
+++ libcfa/src/concurrency/monitor.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -50,5 +50,5 @@
 static inline [$thread *, int] search_entry_queue( const __waitfor_mask_t &, $monitor * monitors [], __lock_size_t count );
 
-forall(dtype T | sized( T ))
+forall(T & | sized( T ))
 static inline __lock_size_t insert_unique( T * array [], __lock_size_t & size, T * val );
 static inline __lock_size_t count_max    ( const __waitfor_mask_t & mask );
@@ -949,5 +949,5 @@
 }
 
-forall(dtype T | sized( T ))
+forall(T & | sized( T ))
 static inline __lock_size_t insert_unique( T * array [], __lock_size_t & size, T * val ) {
 	if( !val ) return size;
Index: libcfa/src/concurrency/monitor.hfa
===================================================================
--- libcfa/src/concurrency/monitor.hfa	(revision 2fab24e3bfb7b64299dc6bb1b6e45ef4248410cc)
+++ libcfa/src/concurrency/monitor.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -22,5 +22,5 @@
 #include "stdlib.hfa"
 
-trait is_monitor(dtype T) {
+trait is_monitor(T &) {
 	$monitor * get_monitor( T & );
 	void ^?{}( T & mutex );
@@ -59,5 +59,5 @@
 void ^?{}( monitor_dtor_guard_t & this );
 
-static inline forall( dtype T | sized(T) | { void ^?{}( T & mutex ); } )
+static inline forall( T & | sized(T) | { void ^?{}( T & mutex ); } )
 void delete( T * th ) {
 	^(*th){};
Index: libcfa/src/concurrency/mutex.cfa
===================================================================
--- libcfa/src/concurrency/mutex.cfa	(revision 2fab24e3bfb7b64299dc6bb1b6e45ef4248410cc)
+++ libcfa/src/concurrency/mutex.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -164,5 +164,5 @@
 }
 
-forall(dtype L | is_lock(L))
+forall(L & | is_lock(L))
 void wait(condition_variable & this, L & l) {
 	lock( this.lock __cfaabi_dbg_ctx2 );
@@ -176,5 +176,5 @@
 //-----------------------------------------------------------------------------
 // Scopes
-forall(dtype L | is_lock(L))
+forall(L & | is_lock(L))
 void lock_all  ( L * locks[], size_t count) {
 	// Sort locks based on addresses
@@ -188,5 +188,5 @@
 }
 
-forall(dtype L | is_lock(L))
+forall(L & | is_lock(L))
 void unlock_all( L * locks[], size_t count) {
 	// Lock all
Index: libcfa/src/concurrency/mutex.hfa
===================================================================
--- libcfa/src/concurrency/mutex.hfa	(revision 2fab24e3bfb7b64299dc6bb1b6e45ef4248410cc)
+++ libcfa/src/concurrency/mutex.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -70,5 +70,5 @@
 void unlock(recursive_mutex_lock & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
 
-trait is_lock(dtype L | sized(L)) {
+trait is_lock(L & | sized(L)) {
 	void lock  (L &);
 	void unlock(L &);
@@ -94,10 +94,10 @@
 void wait(condition_variable & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
 
-forall(dtype L | is_lock(L))
+forall(L & | is_lock(L))
 void wait(condition_variable & this, L & l) __attribute__((deprecated("use concurrency/locks.hfa instead")));
 
 //-----------------------------------------------------------------------------
 // Scopes
-forall(dtype L | is_lock(L)) {
+forall(L & | is_lock(L)) {
 	#if !defined( __TUPLE_ARRAYS_EXIST__ )
 	void lock  ( L * locks [], size_t count);
Index: libcfa/src/concurrency/preemption.cfa
===================================================================
--- libcfa/src/concurrency/preemption.cfa	(revision 2fab24e3bfb7b64299dc6bb1b6e45ef4248410cc)
+++ libcfa/src/concurrency/preemption.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -616,4 +616,9 @@
 }
 
+// Prevent preemption since we are about to start terminating things
+void __kernel_abort_lock(void) {
+	signal_block( SIGUSR1 );
+}
+
 // Raii ctor/dtor for the preemption_scope
 // Used by thread to control when they want to receive preemption signals
Index: libcfa/src/concurrency/thread.cfa
===================================================================
--- libcfa/src/concurrency/thread.cfa	(revision 2fab24e3bfb7b64299dc6bb1b6e45ef4248410cc)
+++ libcfa/src/concurrency/thread.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -62,7 +62,7 @@
 }
 
-FORALL_DATA_INSTANCE(ThreadCancelled, (dtype thread_t), (thread_t))
+FORALL_DATA_INSTANCE(ThreadCancelled, (thread_t &), (thread_t))
 
-forall(dtype T)
+forall(T &)
 void copy(ThreadCancelled(T) * dst, ThreadCancelled(T) * src) {
 	dst->virtual_table = src->virtual_table;
@@ -71,23 +71,23 @@
 }
 
-forall(dtype T)
+forall(T &)
 const char * msg(ThreadCancelled(T) *) {
 	return "ThreadCancelled";
 }
 
-forall(dtype T)
+forall(T &)
 static void default_thread_cancel_handler(ThreadCancelled(T) & ) {
 	abort( "Unhandled thread cancellation.\n" );
 }
 
-forall(dtype T | is_thread(T) | IS_EXCEPTION(ThreadCancelled, (T)))
+forall(T & | is_thread(T) | IS_EXCEPTION(ThreadCancelled, (T)))
 void ?{}( thread_dtor_guard_t & this,
-		T & thrd, void(*defaultResumptionHandler)(ThreadCancelled(T) &)) {
-	$monitor * m = get_monitor(thrd);
+		T & thrd, void(*cancelHandler)(ThreadCancelled(T) &)) {
+ 	$monitor * m = get_monitor(thrd);
 	$thread * desc = get_thread(thrd);
 
 	// Setup the monitor guard
 	void (*dtor)(T& mutex this) = ^?{};
-	bool join = defaultResumptionHandler != (void(*)(ThreadCancelled(T)&))0;
+	bool join = cancelHandler != (void(*)(ThreadCancelled(T)&))0;
 	(this.mg){&m, (void(*)())dtor, join};
 
@@ -103,7 +103,6 @@
 	}
 	desc->state = Cancelled;
-	if (!join) {
-		defaultResumptionHandler = default_thread_cancel_handler;
-	}
+	void(*defaultResumptionHandler)(ThreadCancelled(T) &) = 
+		join ? cancelHandler : default_thread_cancel_handler;
 
 	ThreadCancelled(T) except;
@@ -125,5 +124,5 @@
 //-----------------------------------------------------------------------------
 // Starting and stopping threads
-forall( dtype T | is_thread(T) )
+forall( T & | is_thread(T) )
 void __thrd_start( T & this, void (*main_p)(T &) ) {
 	$thread * this_thrd = get_thread(this);
@@ -141,5 +140,5 @@
 //-----------------------------------------------------------------------------
 // Support for threads that don't ues the thread keyword
-forall( dtype T | sized(T) | is_thread(T) | { void ?{}(T&); } )
+forall( T & | sized(T) | is_thread(T) | { void ?{}(T&); } )
 void ?{}( scoped(T)& this ) with( this ) {
 	handle{};
@@ -147,5 +146,5 @@
 }
 
-forall( dtype T, ttype P | sized(T) | is_thread(T) | { void ?{}(T&, P); } )
+forall( T &, P... | sized(T) | is_thread(T) | { void ?{}(T&, P); } )
 void ?{}( scoped(T)& this, P params ) with( this ) {
 	handle{ params };
@@ -153,5 +152,5 @@
 }
 
-forall( dtype T | sized(T) | is_thread(T) )
+forall( T & | sized(T) | is_thread(T) )
 void ^?{}( scoped(T)& this ) with( this ) {
 	^handle{};
@@ -159,5 +158,5 @@
 
 //-----------------------------------------------------------------------------
-forall(dtype T | is_thread(T) | IS_RESUMPTION_EXCEPTION(ThreadCancelled, (T)))
+forall(T & | is_thread(T) | IS_RESUMPTION_EXCEPTION(ThreadCancelled, (T)))
 T & join( T & this ) {
 	thread_dtor_guard_t guard = { this, defaultResumptionHandler };
Index: libcfa/src/concurrency/thread.hfa
===================================================================
--- libcfa/src/concurrency/thread.hfa	(revision 2fab24e3bfb7b64299dc6bb1b6e45ef4248410cc)
+++ libcfa/src/concurrency/thread.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -26,5 +26,5 @@
 //-----------------------------------------------------------------------------
 // thread trait
-trait is_thread(dtype T) {
+trait is_thread(T &) {
 	void ^?{}(T& mutex this);
 	void main(T& this);
@@ -32,13 +32,13 @@
 };
 
-FORALL_DATA_EXCEPTION(ThreadCancelled, (dtype thread_t), (thread_t)) (
+FORALL_DATA_EXCEPTION(ThreadCancelled, (thread_t &), (thread_t)) (
 	thread_t * the_thread;
 	exception_t * the_exception;
 );
 
-forall(dtype T)
+forall(T &)
 void copy(ThreadCancelled(T) * dst, ThreadCancelled(T) * src);
 
-forall(dtype T)
+forall(T &)
 const char * msg(ThreadCancelled(T) *);
 
@@ -47,8 +47,8 @@
 
 // Inline getters for threads/coroutines/monitors
-forall( dtype T | is_thread(T) )
+forall( T & | is_thread(T) )
 static inline $coroutine* get_coroutine(T & this) __attribute__((const)) { return &get_thread(this)->self_cor; }
 
-forall( dtype T | is_thread(T) )
+forall( T & | is_thread(T) )
 static inline $monitor  * get_monitor  (T & this) __attribute__((const)) { return &get_thread(this)->self_mon; }
 
@@ -60,5 +60,5 @@
 extern struct cluster * mainCluster;
 
-forall( dtype T | is_thread(T) )
+forall( T & | is_thread(T) )
 void __thrd_start( T & this, void (*)(T &) );
 
@@ -82,5 +82,5 @@
 };
 
-forall( dtype T | is_thread(T) | IS_EXCEPTION(ThreadCancelled, (T)) )
+forall( T & | is_thread(T) | IS_EXCEPTION(ThreadCancelled, (T)) )
 void ?{}( thread_dtor_guard_t & this, T & thrd, void(*)(ThreadCancelled(T) &) );
 void ^?{}( thread_dtor_guard_t & this );
@@ -89,16 +89,16 @@
 // thread runner
 // Structure that actually start and stop threads
-forall( dtype T | sized(T) | is_thread(T) )
+forall( T & | sized(T) | is_thread(T) )
 struct scoped {
 	T handle;
 };
 
-forall( dtype T | sized(T) | is_thread(T) | { void ?{}(T&); } )
+forall( T & | sized(T) | is_thread(T) | { void ?{}(T&); } )
 void ?{}( scoped(T)& this );
 
-forall( dtype T, ttype P | sized(T) | is_thread(T) | { void ?{}(T&, P); } )
+forall( T &, P... | sized(T) | is_thread(T) | { void ?{}(T&, P); } )
 void ?{}( scoped(T)& this, P params );
 
-forall( dtype T | sized(T) | is_thread(T) )
+forall( T & | sized(T) | is_thread(T) )
 void ^?{}( scoped(T)& this );
 
@@ -115,5 +115,5 @@
 void unpark( $thread * this );
 
-forall( dtype T | is_thread(T) )
+forall( T & | is_thread(T) )
 static inline void unpark( T & this ) { if(!&this) return; unpark( get_thread( this ) );}
 
@@ -128,5 +128,5 @@
 //----------
 // join
-forall( dtype T | is_thread(T) | IS_RESUMPTION_EXCEPTION(ThreadCancelled, (T)) )
+forall( T & | is_thread(T) | IS_RESUMPTION_EXCEPTION(ThreadCancelled, (T)) )
 T & join( T & this );