Index: libcfa/src/concurrency/kernel/fwd.hfa
===================================================================
--- libcfa/src/concurrency/kernel/fwd.hfa	(revision 28c35e2df7bbf22ffd7561aa771276e2a4d55b0e)
+++ libcfa/src/concurrency/kernel/fwd.hfa	(revision 9db2c920685ec71921fa077d452ed8cea1850f4e)
@@ -5,5 +5,6 @@
 // file "LICENCE" distributed with Cforall.
 //
-// kernel/fwd.hfa --
+// kernel/fwd.hfa -- PUBLIC
+// Fundamental code needed to implement threading M.E.S. algorithms.
 //
 // Author           : Thierry Delisle
@@ -134,4 +135,258 @@
 		extern uint64_t thread_rand();
 
+		// Semaphore which only supports a single thread
+		struct single_sem {
+			struct $thread * volatile ptr;
+		};
+
+		static inline {
+			void  ?{}(single_sem & this) {
+				this.ptr = 0p;
+			}
+
+			void ^?{}(single_sem &) {}
+
+			bool wait(single_sem & this) {
+				for() {
+					struct $thread * expected = this.ptr;
+					if(expected == 1p) {
+						if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+							return false;
+						}
+					}
+					else {
+						/* paranoid */ verify( expected == 0p );
+						if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+							park();
+							return true;
+						}
+					}
+
+				}
+			}
+
+			bool post(single_sem & this) {
+				for() {
+					struct $thread * expected = this.ptr;
+					if(expected == 1p) return false;
+					if(expected == 0p) {
+						if(__atomic_compare_exchange_n(&this.ptr, &expected, 1p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+							return false;
+						}
+					}
+					else {
+						if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+							unpark( expected );
+							return true;
+						}
+					}
+				}
+			}
+		}
+
+		// Synchronozation primitive which only supports a single thread and one post
+		// Similar to a binary semaphore with a 'one shot' semantic
+		// is expected to be discarded after each party call their side
+		struct oneshot {
+			// Internal state :
+			//     0p     : is initial state (wait will block)
+			//     1p     : fulfilled (wait won't block)
+			// any thread : a thread is currently waiting
+			struct $thread * volatile ptr;
+		};
+
+		static inline {
+			void  ?{}(oneshot & this) {
+				this.ptr = 0p;
+			}
+
+			void ^?{}(oneshot &) {}
+
+			// Wait for the post, return immidiately if it already happened.
+			// return true if the thread was parked
+			bool wait(oneshot & this) {
+				for() {
+					struct $thread * expected = this.ptr;
+					if(expected == 1p) return false;
+					/* paranoid */ verify( expected == 0p );
+					if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+						park();
+						/* paranoid */ verify( this.ptr == 1p );
+						return true;
+					}
+				}
+			}
+
+			// Mark as fulfilled, wake thread if needed
+			// return true if a thread was unparked
+			bool post(oneshot & this) {
+				struct $thread * got = __atomic_exchange_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
+				if( got == 0p ) return false;
+				unpark( got );
+				return true;
+			}
+		}
+
+		// base types for future to build upon
+		// It is based on the 'oneshot' type to allow multiple futures
+		// to block on the same instance, permitting users to block a single
+		// thread on "any of" [a given set of] futures.
+		// does not support multiple threads waiting on the same future
+		struct future_t {
+			// Internal state :
+			//     0p      : is initial state (wait will block)
+			//     1p      : fulfilled (wait won't block)
+			//     2p      : in progress ()
+			//     3p      : abandoned, server should delete
+			// any oneshot : a context has been setup to wait, a thread could wait on it
+			struct oneshot * volatile ptr;
+		};
+
+		static inline {
+			void  ?{}(future_t & this) {
+				this.ptr = 0p;
+			}
+
+			void ^?{}(future_t &) {}
+
+			void reset(future_t & this) {
+				// needs to be in 0p or 1p
+				__atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);
+			}
+
+			// check if the future is available
+			bool available( future_t & this ) {
+				return this.ptr == 1p;
+			}
+
+			// Prepare the future to be waited on
+			// intented to be use by wait, wait_any, waitfor, etc. rather than used directly
+			bool setup( future_t & this, oneshot & wait_ctx ) {
+				/* paranoid */ verify( wait_ctx.ptr == 0p );
+				// The future needs to set the wait context
+				for() {
+					struct oneshot * expected = this.ptr;
+					// Is the future already fulfilled?
+					if(expected == 1p) return false; // Yes, just return false (didn't block)
+
+					// The future is not fulfilled, try to setup the wait context
+					/* paranoid */ verify( expected == 0p );
+					if(__atomic_compare_exchange_n(&this.ptr, &expected, &wait_ctx, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+						return true;
+					}
+				}
+			}
+
+			// Stop waiting on a future
+			// When multiple futures are waited for together in "any of" pattern
+			// futures that weren't fulfilled before the thread woke up
+			// should retract the wait ctx
+			// intented to be use by wait, wait_any, waitfor, etc. rather than used directly
+			void retract( future_t & this, oneshot & wait_ctx ) {
+				// Remove the wait context
+				struct oneshot * got = __atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);
+
+				// got == 0p: future was never actually setup, just return
+				if( got == 0p ) return;
+
+				// got == wait_ctx: since fulfil does an atomic_swap,
+				// if we got back the original then no one else saw context
+				// It is safe to delete (which could happen after the return)
+				if( got == &wait_ctx ) return;
+
+				// got == 1p: the future is ready and the context was fully consumed
+				// the server won't use the pointer again
+				// It is safe to delete (which could happen after the return)
+				if( got == 1p ) return;
+
+				// got == 2p: the future is ready but the context hasn't fully been consumed
+				// spin until it is safe to move on
+				if( got == 2p ) {
+					while( this.ptr != 1p ) Pause();
+					return;
+				}
+
+				// got == any thing else, something wen't wrong here, abort
+				abort("Future in unexpected state");
+			}
+
+			// Mark the future as abandoned, meaning it will be deleted by the server
+			bool abandon( future_t & this ) {
+				/* paranoid */ verify( this.ptr != 3p );
+
+				// Mark the future as abandonned
+				struct oneshot * got = __atomic_exchange_n( &this.ptr, 3p, __ATOMIC_SEQ_CST);
+
+				// If the future isn't already fulfilled, let the server delete it
+				if( got == 0p ) return false;
+
+				// got == 2p: the future is ready but the context hasn't fully been consumed
+				// spin until it is safe to move on
+				if( got == 2p ) {
+					while( this.ptr != 1p ) Pause();
+					got = 1p;
+				}
+
+				// The future is completed delete it now
+				/* paranoid */ verify( this.ptr != 1p );
+				free( &this );
+				return true;
+			}
+
+			// from the server side, mark the future as fulfilled
+			// delete it if needed
+			bool fulfil( future_t & this ) {
+				for() {
+					struct oneshot * expected = this.ptr;
+					// was this abandoned?
+					#if defined(__GNUC__) && __GNUC__ >= 7
+						#pragma GCC diagnostic push
+						#pragma GCC diagnostic ignored "-Wfree-nonheap-object"
+					#endif
+						if( expected == 3p ) { free( &this ); return false; }
+					#if defined(__GNUC__) && __GNUC__ >= 7
+						#pragma GCC diagnostic pop
+					#endif
+
+					/* paranoid */ verify( expected != 1p ); // Future is already fulfilled, should not happen
+					/* paranoid */ verify( expected != 2p ); // Future is bein fulfilled by someone else, this is even less supported then the previous case.
+
+					// If there is a wait context, we need to consume it and mark it as consumed after
+					// If there is no context then we can skip the in progress phase
+					struct oneshot * want = expected == 0p ? 1p : 2p;
+					if(__atomic_compare_exchange_n(&this.ptr, &expected, want, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+						if( expected == 0p ) { /* paranoid */ verify( this.ptr == 1p); return false; }
+						bool ret = post( *expected );
+						__atomic_store_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
+						return ret;
+					}
+				}
+
+			}
+
+			// Wait for the future to be fulfilled
+			bool wait( future_t & this ) {
+				oneshot temp;
+				if( !setup(this, temp) ) return false;
+
+				// Wait context is setup, just wait on it
+				bool ret = wait( temp );
+
+				// Wait for the future to tru
+				while( this.ptr == 2p ) Pause();
+				// Make sure the state makes sense
+				// Should be fulfilled, could be in progress but it's out of date if so
+				// since if that is the case, the oneshot was fulfilled (unparking this thread)
+				// and the oneshot should not be needed any more
+				__attribute__((unused)) struct oneshot * was = this.ptr;
+				/* paranoid */ verifyf( was == 1p, "Expected this.ptr to be 1p, was %p\n", was );
+
+				// Mark the future as fulfilled, to be consistent
+				// with potential calls to avail
+				// this.ptr = 1p;
+				return ret;
+			}
+		}
+
 		//-----------------------------------------------------------------------
 		// Statics call at the end of each thread to register statistics
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision 28c35e2df7bbf22ffd7561aa771276e2a4d55b0e)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision 9db2c920685ec71921fa077d452ed8cea1850f4e)
@@ -199,5 +199,5 @@
 	void ?{}(processor & this) with( this ) {
 		( this.idle ){};
-		( this.terminated ){ 0 };
+		( this.terminated ){};
 		( this.runner ){};
 		init( this, "Main Processor", *mainCluster );
@@ -528,5 +528,5 @@
 void ?{}(processor & this, const char name[], cluster & _cltr) {
 	( this.idle ){};
-	( this.terminated ){ 0 };
+	( this.terminated ){};
 	( this.runner ){};
 
@@ -549,5 +549,5 @@
 		__wake_proc( &this );
 
-		P( terminated );
+		wait( terminated );
 		/* paranoid */ verify( active_processor() != &this);
 	}
