Index: src/Common/Heap.cc
===================================================================
--- src/Common/Heap.cc	(revision 3d60c08ac51fee59e5e460399d8f60bfa15e20c6)
+++ src/Common/Heap.cc	(revision b4a835dd16cbb368bdb18250055a0e85c62e213c)
@@ -130,5 +130,5 @@
 				abort();
 			}
-#endif
+#endif // RTLD_NEXT
 		} // if
 
@@ -144,4 +144,13 @@
 		fptr = (generic_fptr_t)dlsym( library, symbol );
 #endif // _GNU_SOURCE
+
+		error = dlerror();
+		if ( error ) {
+			std::cerr << "interpose_symbol : internal error, " << error << std::endl;
+			abort();
+		}
+
+		return fptr;
+	}
 
 	extern "C" {
Index: src/Common/PassVisitor.proto.h
===================================================================
--- src/Common/PassVisitor.proto.h	(revision 3d60c08ac51fee59e5e460399d8f60bfa15e20c6)
+++ src/Common/PassVisitor.proto.h	(revision b4a835dd16cbb368bdb18250055a0e85c62e213c)
@@ -47,5 +47,5 @@
 
 	operator bool() { return m_ref ? *m_ref : true; }
-	bool operator=( bool val ) { return *m_ref = val; }
+	bool operator=( bool val ) { assert(m_ref); return *m_ref = val; }
 
 private:
@@ -53,7 +53,7 @@
 	friend class ChildrenGuard;
 
-	bool * set( bool & val ) {
+	bool * set( bool * val ) {
 		bool * prev = m_ref;
-		m_ref = &val;
+		m_ref = val;
 		return prev;
 	}
@@ -67,5 +67,5 @@
 	ChildrenGuard( bool_ref * ref )
 		: m_val ( true )
-		, m_prev( ref ? ref->set( m_val ) : nullptr )
+		, m_prev( ref ? ref->set( &m_val ) : nullptr )
 		, m_ref ( ref )
 	{}
@@ -73,5 +73,5 @@
 	~ChildrenGuard() {
 		if( m_ref ) {
-			m_ref->set( *m_prev );
+			m_ref->set( m_prev );
 		}
 	}
Index: src/libcfa/bits/containers.h
===================================================================
--- src/libcfa/bits/containers.h	(revision 3d60c08ac51fee59e5e460399d8f60bfa15e20c6)
+++ src/libcfa/bits/containers.h	(revision b4a835dd16cbb368bdb18250055a0e85c62e213c)
@@ -186,4 +186,78 @@
 #endif
 
+
+//-----------------------------------------------------------------------------
+// Doubly Linked List
+//-----------------------------------------------------------------------------
+#ifdef __cforall
+	trait is_db_node(dtype T) {
+		T*& get_next( T& );
+		T*& get_prev( T& );
+	};
+#endif
+
+#ifdef __cforall
+	forall(dtype TYPE | is_db_node(TYPE))
+	#define T TYPE
+#else
+	#define T void
+#endif
+struct __dllist {
+	T * head;
+};
+#undef T
+
+#ifdef __cforall
+#define __dllist_t(T) __dllist(T)
+#else
+#define __dllist_t(T) struct __dllist
+#endif
+
+#ifdef __cforall
+
+	forall(dtype T | is_db_node(T))
+	static inline void ?{}( __dllist(T) & this ) with( this ) {
+		head{ NULL };
+	}
+
+	// forall(dtype T | is_db_node(T) | sized(T))
+	// static inline void push_front( __dllist(T) & this, T & node ) with( this ) {
+	// 	if ( head ) {
+	// 		get_next( node ) = head;
+	// 		get_prev( node ) = get_prev( *head );
+	// 		// inserted node must be consistent before it is seen
+	// 		// prevent code movement across barrier
+	// 		asm( "" : : : "memory" );
+	// 		get_prev( *head ) = node;
+	// 		T & prev = *get_prev( node );
+	// 		get_next( prev ) = node;
+	// 	}
+	// 	else {
+	// 		get_next( node ) = &node;
+	// 		get_prev( node ) = &node;
+	// 	}
+
+	// 	// prevent code movement across barrier
+	// 	asm( "" : : : "memory" );
+	// 	head = val;
+	// }
+
+	// forall(dtype T | is_db_node(T) | sized(T))
+	// static inline T * remove( __dllist(T) & this, T & node ) with( this ) {
+	// 	if ( &node == head ) {
+	// 		if ( get_next( *head ) == head ) {
+	// 			head = NULL;
+	// 		}
+	// 		else {
+	// 			head = get_next( *head );
+	// 		}
+	// 	}
+	// 	get_prev( *get_next( node ) ) = get_prev( node );
+	// 	get_next( *get_prev( node ) ) = get_next( node );
+	// 	get_next( node ) = NULL;
+	// 	get_prev( node ) = NULL;
+	// }
+#endif
+
 //-----------------------------------------------------------------------------
 // Tools
Index: src/libcfa/concurrency/coroutine
===================================================================
--- src/libcfa/concurrency/coroutine	(revision 3d60c08ac51fee59e5e460399d8f60bfa15e20c6)
+++ src/libcfa/concurrency/coroutine	(revision b4a835dd16cbb368bdb18250055a0e85c62e213c)
@@ -72,5 +72,10 @@
 // Suspend implementation inlined for performance
 static inline void suspend() {
-	coroutine_desc * src = TL_GET( this_coroutine );			// optimization
+	// optimization : read TLS once and reuse it
+	// Safety note: this is preemption safe since if
+	// preemption occurs after this line, the pointer
+	// will also migrate which means this value will
+	// stay in sync with the TLS
+	coroutine_desc * src = TL_GET( this_coroutine );
 
 	assertf( src->last != 0,
@@ -89,5 +94,10 @@
 forall(dtype T | is_coroutine(T))
 static inline void resume(T & cor) {
-	coroutine_desc * src = TL_GET( this_coroutine );			// optimization
+	// optimization : read TLS once and reuse it
+	// Safety note: this is preemption safe since if
+	// preemption occurs after this line, the pointer
+	// will also migrate which means this value will
+	// stay in sync with the TLS
+	coroutine_desc * src = TL_GET( this_coroutine );
 	coroutine_desc * dst = get_coroutine(cor);
 
@@ -107,5 +117,5 @@
 		dst->last = src;
 		dst->starter = dst->starter ? dst->starter : src;
-	} // if
+	}
 
 	// always done for performance testing
@@ -114,5 +124,10 @@
 
 static inline void resume(coroutine_desc * dst) {
-	coroutine_desc * src = TL_GET( this_coroutine );			// optimization
+	// optimization : read TLS once and reuse it
+	// Safety note: this is preemption safe since if
+	// preemption occurs after this line, the pointer
+	// will also migrate which means this value will
+	// stay in sync with the TLS
+	coroutine_desc * src = TL_GET( this_coroutine );
 
 	// not resuming self ?
@@ -125,5 +140,5 @@
 		// set last resumer
 		dst->last = src;
-	} // if
+	}
 
 	// always done for performance testing
Index: src/libcfa/concurrency/coroutine.c
===================================================================
--- src/libcfa/concurrency/coroutine.c	(revision 3d60c08ac51fee59e5e460399d8f60bfa15e20c6)
+++ src/libcfa/concurrency/coroutine.c	(revision b4a835dd16cbb368bdb18250055a0e85c62e213c)
@@ -84,5 +84,6 @@
 // Wrapper for co
 void CoroutineCtxSwitch(coroutine_desc* src, coroutine_desc* dst) {
-      verify( TL_GET( preemption_state ).enabled || TL_GET( this_processor )->do_terminate );
+      // Safety note : This could cause some false positives due to preemption
+      verify( TL_GET( preemption_state.enabled ) || TL_GET( this_processor )->do_terminate );
       disable_interrupts();
 
@@ -91,5 +92,5 @@
 
       // set new coroutine that task is executing
-      TL_SET( this_coroutine, dst );
+      kernelTLS.this_coroutine = dst;
 
       // context switch to specified coroutine
@@ -102,5 +103,6 @@
 
       enable_interrupts( __cfaabi_dbg_ctx );
-      verify( TL_GET( preemption_state ).enabled || TL_GET( this_processor )->do_terminate );
+      // Safety note : This could cause some false positives due to preemption
+      verify( TL_GET( preemption_state.enabled ) || TL_GET( this_processor )->do_terminate );
 } //ctxSwitchDirect
 
Index: src/libcfa/concurrency/invoke.c
===================================================================
--- src/libcfa/concurrency/invoke.c	(revision 3d60c08ac51fee59e5e460399d8f60bfa15e20c6)
+++ src/libcfa/concurrency/invoke.c	(revision b4a835dd16cbb368bdb18250055a0e85c62e213c)
@@ -69,4 +69,5 @@
 	// Fetch the thread handle from the user defined thread structure
 	struct thread_desc* thrd = get_thread( this );
+	thrd->self_cor.last = NULL;
 
 	// Officially start the thread by enabling preemption
Index: src/libcfa/concurrency/invoke.h
===================================================================
--- src/libcfa/concurrency/invoke.h	(revision 3d60c08ac51fee59e5e460399d8f60bfa15e20c6)
+++ src/libcfa/concurrency/invoke.h	(revision b4a835dd16cbb368bdb18250055a0e85c62e213c)
@@ -18,6 +18,6 @@
 #include "bits/locks.h"
 
-#define TL_GET( member ) kernelThreadData.member
-#define TL_SET( member, value ) kernelThreadData.member = value;
+#define TL_GET( member ) kernelTLS.member
+#define TL_SET( member, value ) kernelTLS.member = value;
 
 #ifdef __cforall
@@ -44,10 +44,10 @@
 				volatile bool in_progress;
 			} preemption_state;
-		} kernelThreadData;
+		} kernelTLS;
 	}
 
 	static inline struct coroutine_desc * volatile active_coroutine() { return TL_GET( this_coroutine ); }
-	static inline struct thread_desc * volatile active_thread() { return TL_GET( this_thread ); }
-	static inline struct processor * volatile active_processor() { return TL_GET( this_processor ); }
+	static inline struct thread_desc    * volatile active_thread   () { return TL_GET( this_thread    ); }
+	static inline struct processor      * volatile active_processor() { return TL_GET( this_processor ); } // UNSAFE
 	#endif
 
Index: src/libcfa/concurrency/kernel
===================================================================
--- src/libcfa/concurrency/kernel	(revision 3d60c08ac51fee59e5e460399d8f60bfa15e20c6)
+++ src/libcfa/concurrency/kernel	(revision b4a835dd16cbb368bdb18250055a0e85c62e213c)
@@ -53,4 +53,7 @@
 	// Preemption rate on this cluster
 	Duration preemption_rate;
+
+	// List of idle processors
+	// __dllist_t(struct processor) idles;
 };
 
@@ -124,4 +127,9 @@
 	bool pending_preemption;
 
+	struct {
+		pthread_mutex_t lock;
+		pthread_cond_t  cond;
+	} idle;
+
 #ifdef __CFA_DEBUG__
 	// Last function to enable preemption on this processor
Index: src/libcfa/concurrency/kernel.c
===================================================================
--- src/libcfa/concurrency/kernel.c	(revision 3d60c08ac51fee59e5e460399d8f60bfa15e20c6)
+++ src/libcfa/concurrency/kernel.c	(revision b4a835dd16cbb368bdb18250055a0e85c62e213c)
@@ -56,5 +56,5 @@
 // volatile thread_local unsigned short disable_preempt_count = 1;
 
-thread_local struct KernelThreadData kernelThreadData = {
+thread_local struct KernelThreadData kernelTLS = {
 	NULL,
 	NULL,
@@ -155,7 +155,7 @@
 		terminate(&this);
 		verify(this.do_terminate);
-		verify(TL_GET( this_processor ) != &this);
+		verify( kernelTLS.this_processor != &this);
 		P( terminated );
-		verify(TL_GET( this_processor ) != &this);
+		verify( kernelTLS.this_processor != &this);
 		pthread_join( kernel_thread, NULL );
 	}
@@ -196,9 +196,9 @@
 			if(readyThread)
 			{
-				verify( ! TL_GET( preemption_state ).enabled );
+				verify( ! kernelTLS.preemption_state.enabled );
 
 				runThread(this, readyThread);
 
-				verify( ! TL_GET( preemption_state ).enabled );
+				verify( ! kernelTLS.preemption_state.enabled );
 
 				//Some actions need to be taken from the kernel
@@ -221,4 +221,5 @@
 }
 
+// KERNEL ONLY
 // runThread runs a thread by context switching
 // from the processor coroutine to the target thread
@@ -228,9 +229,9 @@
 	coroutine_desc * thrd_cor = dst->curr_cor;
 
-	//Reset the terminating actions here
+	// Reset the terminating actions here
 	this->finish.action_code = No_Action;
 
-	//Update global state
-	TL_SET( this_thread, dst );
+	// Update global state
+	kernelTLS.this_thread = dst;
 
 	// Context Switch to the thread
@@ -239,15 +240,17 @@
 }
 
+// KERNEL_ONLY
 void returnToKernel() {
-	coroutine_desc * proc_cor = get_coroutine(TL_GET( this_processor )->runner);
-	coroutine_desc * thrd_cor = TL_GET( this_thread )->curr_cor = TL_GET( this_coroutine );
+	coroutine_desc * proc_cor = get_coroutine(kernelTLS.this_processor->runner);
+	coroutine_desc * thrd_cor = kernelTLS.this_thread->curr_cor = kernelTLS.this_coroutine;
 	ThreadCtxSwitch(thrd_cor, proc_cor);
 }
 
+// KERNEL_ONLY
 // Once a thread has finished running, some of
 // its final actions must be executed from the kernel
 void finishRunning(processor * this) with( this->finish ) {
 	if( action_code == Release ) {
-		verify( ! TL_GET( preemption_state ).enabled );
+		verify( ! kernelTLS.preemption_state.enabled );
 		unlock( *lock );
 	}
@@ -256,10 +259,10 @@
 	}
 	else if( action_code == Release_Schedule ) {
-		verify( ! TL_GET( preemption_state ).enabled );
+		verify( ! kernelTLS.preemption_state.enabled );
 		unlock( *lock );
 		ScheduleThread( thrd );
 	}
 	else if( action_code == Release_Multi ) {
-		verify( ! TL_GET( preemption_state ).enabled );
+		verify( ! kernelTLS.preemption_state.enabled );
 		for(int i = 0; i < lock_count; i++) {
 			unlock( *locks[i] );
@@ -285,4 +288,5 @@
 }
 
+// KERNEL_ONLY
 // Context invoker for processors
 // This is the entry point for processors (kernel threads)
@@ -290,8 +294,8 @@
 void * CtxInvokeProcessor(void * arg) {
 	processor * proc = (processor *) arg;
-	TL_SET( this_processor, proc );
-	TL_SET( this_coroutine, NULL );
-	TL_SET( this_thread, NULL );
-	TL_GET( preemption_state ).[enabled, disable_count] = [false, 1];
+	kernelTLS.this_processor = proc;
+	kernelTLS.this_coroutine = NULL;
+	kernelTLS.this_thread    = NULL;
+	kernelTLS.preemption_state.[enabled, disable_count] = [false, 1];
 	// SKULLDUGGERY: We want to create a context for the processor coroutine
 	// which is needed for the 2-step context switch. However, there is no reason
@@ -305,6 +309,6 @@
 
 	//Set global state
-	TL_SET( this_coroutine, get_coroutine(proc->runner) );
-	TL_SET( this_thread, NULL );
+	kernelTLS.this_coroutine = get_coroutine(proc->runner);
+	kernelTLS.this_thread    = NULL;
 
 	//We now have a proper context from which to schedule threads
@@ -333,14 +337,15 @@
 }
 
+// KERNEL_ONLY
 void kernel_first_resume(processor * this) {
-	coroutine_desc * src = TL_GET( this_coroutine );
+	coroutine_desc * src = kernelTLS.this_coroutine;
 	coroutine_desc * dst = get_coroutine(this->runner);
 
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 
 	create_stack(&dst->stack, dst->stack.size);
 	CtxStart(&this->runner, CtxInvokeCoroutine);
 
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 
 	dst->last = src;
@@ -351,5 +356,5 @@
 
 	// set new coroutine that task is executing
-	TL_SET( this_coroutine, dst );
+	kernelTLS.this_coroutine = dst;
 
 	// SKULLDUGGERY normally interrupts are enable before leaving a coroutine ctxswitch.
@@ -368,15 +373,16 @@
 	src->state = Active;
 
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 }
 
 //-----------------------------------------------------------------------------
 // Scheduler routines
+
+// KERNEL ONLY
 void ScheduleThread( thread_desc * thrd ) {
-	// if( ! thrd ) return;
 	verify( thrd );
 	verify( thrd->self_cor.state != Halted );
 
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 
 	verifyf( thrd->next == NULL, "Expected null got %p", thrd->next );
@@ -388,13 +394,14 @@
 	}
 
-	verify( ! TL_GET( preemption_state ).enabled );
-}
-
+	verify( ! kernelTLS.preemption_state.enabled );
+}
+
+// KERNEL ONLY
 thread_desc * nextThread(cluster * this) with( *this ) {
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 	lock( ready_queue_lock __cfaabi_dbg_ctx2 );
 	thread_desc * head = pop_head( ready_queue );
 	unlock( ready_queue_lock );
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 	return head;
 }
@@ -402,7 +409,7 @@
 void BlockInternal() {
 	disable_interrupts();
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 	returnToKernel();
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 	enable_interrupts( __cfaabi_dbg_ctx );
 }
@@ -410,12 +417,12 @@
 void BlockInternal( __spinlock_t * lock ) {
 	disable_interrupts();
-	with( *TL_GET( this_processor ) ) {
+	with( *kernelTLS.this_processor ) {
 		finish.action_code = Release;
 		finish.lock        = lock;
 	}
 
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 	returnToKernel();
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 
 	enable_interrupts( __cfaabi_dbg_ctx );
@@ -424,12 +431,12 @@
 void BlockInternal( thread_desc * thrd ) {
 	disable_interrupts();
-	with( *TL_GET( this_processor ) ) {
+	with( * kernelTLS.this_processor ) {
 		finish.action_code = Schedule;
 		finish.thrd        = thrd;
 	}
 
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 	returnToKernel();
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 
 	enable_interrupts( __cfaabi_dbg_ctx );
@@ -439,5 +446,5 @@
 	assert(thrd);
 	disable_interrupts();
-	with( *TL_GET( this_processor ) ) {
+	with( * kernelTLS.this_processor ) {
 		finish.action_code = Release_Schedule;
 		finish.lock        = lock;
@@ -445,7 +452,7 @@
 	}
 
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 	returnToKernel();
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 
 	enable_interrupts( __cfaabi_dbg_ctx );
@@ -454,5 +461,5 @@
 void BlockInternal(__spinlock_t * locks [], unsigned short count) {
 	disable_interrupts();
-	with( *TL_GET( this_processor ) ) {
+	with( * kernelTLS.this_processor ) {
 		finish.action_code = Release_Multi;
 		finish.locks       = locks;
@@ -460,7 +467,7 @@
 	}
 
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 	returnToKernel();
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 
 	enable_interrupts( __cfaabi_dbg_ctx );
@@ -469,5 +476,5 @@
 void BlockInternal(__spinlock_t * locks [], unsigned short lock_count, thread_desc * thrds [], unsigned short thrd_count) {
 	disable_interrupts();
-	with( *TL_GET( this_processor ) ) {
+	with( *kernelTLS.this_processor ) {
 		finish.action_code = Release_Multi_Schedule;
 		finish.locks       = locks;
@@ -477,14 +484,15 @@
 	}
 
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 	returnToKernel();
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 
 	enable_interrupts( __cfaabi_dbg_ctx );
 }
 
+// KERNEL ONLY
 void LeaveThread(__spinlock_t * lock, thread_desc * thrd) {
-	verify( ! TL_GET( preemption_state ).enabled );
-	with( *TL_GET( this_processor ) ) {
+	verify( ! kernelTLS.preemption_state.enabled );
+	with( * kernelTLS.this_processor ) {
 		finish.action_code = thrd ? Release_Schedule : Release;
 		finish.lock        = lock;
@@ -501,5 +509,5 @@
 // Kernel boot procedures
 void kernel_startup(void) {
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 	__cfaabi_dbg_print_safe("Kernel : Starting\n");
 
@@ -547,7 +555,7 @@
 
 	//initialize the global state variables
-	TL_SET( this_processor, mainProcessor );
-	TL_SET( this_thread, mainThread );
-	TL_SET( this_coroutine, &mainThread->self_cor );
+	kernelTLS.this_processor = mainProcessor;
+	kernelTLS.this_thread    = mainThread;
+	kernelTLS.this_coroutine = &mainThread->self_cor;
 
 	// Enable preemption
@@ -561,5 +569,5 @@
 	// context. Hence, the main thread does not begin through CtxInvokeThread, like all other threads. The trick here is that
 	// mainThread is on the ready queue when this call is made.
-	kernel_first_resume( TL_GET( this_processor ) );
+	kernel_first_resume( kernelTLS.this_processor );
 
 
@@ -568,7 +576,7 @@
 	__cfaabi_dbg_print_safe("Kernel : Started\n--------------------------------------------------\n\n");
 
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 	enable_interrupts( __cfaabi_dbg_ctx );
-	verify( TL_GET( preemption_state ).enabled );
+	verify( TL_GET( preemption_state.enabled ) );
 }
 
@@ -576,7 +584,7 @@
 	__cfaabi_dbg_print_safe("\n--------------------------------------------------\nKernel : Shutting down\n");
 
-	verify( TL_GET( preemption_state ).enabled );
+	verify( TL_GET( preemption_state.enabled ) );
 	disable_interrupts();
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 
 	// SKULLDUGGERY: Notify the mainProcessor it needs to terminates.
@@ -604,4 +612,26 @@
 
 //=============================================================================================
+// Kernel Quiescing
+//=============================================================================================
+
+// void halt(processor * this) with( this ) {
+// 	pthread_mutex_lock( &idle.lock );
+
+
+
+// 	// SKULLDUGGERY: Even if spurious wake-up is a thing
+// 	// spuriously waking up a kernel thread is not a big deal
+// 	// if it is very rare.
+// 	pthread_cond_wait( &idle.cond, &idle.lock);
+// 	pthread_mutex_unlock( &idle.lock );
+// }
+
+// void wake(processor * this) with( this ) {
+// 	pthread_mutex_lock  (&idle.lock);
+// 	pthread_cond_signal (&idle.cond);
+// 	pthread_mutex_unlock(&idle.lock);
+// }
+
+//=============================================================================================
 // Unexpected Terminating logic
 //=============================================================================================
@@ -612,5 +642,5 @@
 static bool kernel_abort_called = false;
 
-void * kernel_abort    (void) __attribute__ ((__nothrow__)) {
+void * kernel_abort(void) __attribute__ ((__nothrow__)) {
 	// abort cannot be recursively entered by the same or different processors because all signal handlers return when
 	// the globalAbort flag is true.
@@ -633,5 +663,5 @@
 	}
 
-	return TL_GET( this_thread );
+	return kernelTLS.this_thread;
 }
 
@@ -642,6 +672,6 @@
 	__cfaabi_dbg_bits_write( abort_text, len );
 
-	if ( get_coroutine(thrd) != TL_GET( this_coroutine ) ) {
-		len = snprintf( abort_text, abort_text_size, " in coroutine %.256s (%p).\n", TL_GET( this_coroutine )->name, TL_GET( this_coroutine ) );
+	if ( get_coroutine(thrd) != kernelTLS.this_coroutine ) {
+		len = snprintf( abort_text, abort_text_size, " in coroutine %.256s (%p).\n", kernelTLS.this_coroutine->name, kernelTLS.this_coroutine );
 		__cfaabi_dbg_bits_write( abort_text, len );
 	}
@@ -652,5 +682,5 @@
 
 int kernel_abort_lastframe( void ) __attribute__ ((__nothrow__)) {
-	return get_coroutine(TL_GET( this_thread )) == get_coroutine(mainThread) ? 4 : 2;
+	return get_coroutine(kernelTLS.this_thread) == get_coroutine(mainThread) ? 4 : 2;
 }
 
@@ -682,5 +712,5 @@
 	if ( count < 0 ) {
 		// queue current task
-		append( waiting, (thread_desc *)TL_GET( this_thread ) );
+		append( waiting, kernelTLS.this_thread );
 
 		// atomically release spin lock and block
@@ -742,5 +772,5 @@
 	void __cfaabi_dbg_record(__spinlock_t & this, const char * prev_name) {
 		this.prev_name = prev_name;
-		this.prev_thrd = TL_GET( this_thread );
+		this.prev_thrd = kernelTLS.this_thread;
 	}
 )
Index: src/libcfa/concurrency/monitor.c
===================================================================
--- src/libcfa/concurrency/monitor.c	(revision 3d60c08ac51fee59e5e460399d8f60bfa15e20c6)
+++ src/libcfa/concurrency/monitor.c	(revision b4a835dd16cbb368bdb18250055a0e85c62e213c)
@@ -85,5 +85,6 @@
 		// Lock the monitor spinlock
 		lock( this->lock __cfaabi_dbg_ctx2 );
-		thread_desc * thrd = TL_GET( this_thread );
+		// Interrupts disabled inside critical section
+		thread_desc * thrd = kernelTLS.this_thread;
 
 		__cfaabi_dbg_print_safe( "Kernel : %10p Entering mon %p (%p)\n", thrd, this, this->owner);
@@ -134,5 +135,6 @@
 		// Lock the monitor spinlock
 		lock( this->lock __cfaabi_dbg_ctx2 );
-		thread_desc * thrd = TL_GET( this_thread );
+		// Interrupts disabled inside critical section
+		thread_desc * thrd = kernelTLS.this_thread;
 
 		__cfaabi_dbg_print_safe( "Kernel : %10p Entering dtor for mon %p (%p)\n", thrd, this, this->owner);
@@ -168,5 +170,5 @@
 
 			// Create the node specific to this wait operation
-			wait_ctx_primed( TL_GET( this_thread ), 0 )
+			wait_ctx_primed( thrd, 0 )
 
 			// Some one else has the monitor, wait for him to finish and then run
@@ -179,5 +181,5 @@
 			__cfaabi_dbg_print_safe( "Kernel :  blocking \n" );
 
-			wait_ctx( TL_GET( this_thread ), 0 )
+			wait_ctx( thrd, 0 )
 			this->dtor_node = &waiter;
 
@@ -199,7 +201,7 @@
 		lock( this->lock __cfaabi_dbg_ctx2 );
 
-		__cfaabi_dbg_print_safe( "Kernel : %10p Leaving mon %p (%p)\n", TL_GET( this_thread ), this, this->owner);
-
-		verifyf( TL_GET( this_thread ) == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", TL_GET( this_thread ), this->owner, this->recursion, this );
+		__cfaabi_dbg_print_safe( "Kernel : %10p Leaving mon %p (%p)\n", kernelTLS.this_thread, this, this->owner);
+
+		verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
 
 		// Leaving a recursion level, decrement the counter
@@ -289,4 +291,6 @@
 // Sorts monitors before entering
 void ?{}( monitor_guard_t & this, monitor_desc * m [], __lock_size_t count, fptr_t func ) {
+	thread_desc * thrd = TL_GET( this_thread );
+
 	// Store current array
 	this.m = m;
@@ -297,8 +301,8 @@
 
 	// Save previous thread context
-	this.prev = TL_GET( this_thread )->monitors;
+	this.prev = thrd->monitors;
 
 	// Update thread context (needed for conditions)
-	(TL_GET( this_thread )->monitors){m, count, func};
+	(thrd->monitors){m, count, func};
 
 	// __cfaabi_dbg_print_safe( "MGUARD : enter %d\n", count);
@@ -328,12 +332,15 @@
 // Sorts monitors before entering
 void ?{}( monitor_dtor_guard_t & this, monitor_desc * m [], fptr_t func ) {
+	// optimization
+	thread_desc * thrd = TL_GET( this_thread );
+
 	// Store current array
 	this.m = *m;
 
 	// Save previous thread context
-	this.prev = TL_GET( this_thread )->monitors;
+	this.prev = thrd->monitors;
 
 	// Update thread context (needed for conditions)
-	(TL_GET( this_thread )->monitors){m, 1, func};
+	(thrd->monitors){m, 1, func};
 
 	__enter_monitor_dtor( this.m, func );
@@ -473,5 +480,5 @@
 
 	// Create the node specific to this wait operation
-	wait_ctx_primed( TL_GET( this_thread ), 0 )
+	wait_ctx_primed( kernelTLS.this_thread, 0 )
 
 	//save contexts
@@ -566,5 +573,5 @@
 
 				// Create the node specific to this wait operation
-				wait_ctx_primed( TL_GET( this_thread ), 0 );
+				wait_ctx_primed( kernelTLS.this_thread, 0 );
 
 				// Save monitor states
@@ -612,5 +619,5 @@
 
 	// Create the node specific to this wait operation
-	wait_ctx_primed( TL_GET( this_thread ), 0 );
+	wait_ctx_primed( kernelTLS.this_thread, 0 );
 
 	monitor_save;
@@ -618,5 +625,5 @@
 
 	for( __lock_size_t i = 0; i < count; i++) {
-		verify( monitors[i]->owner == TL_GET( this_thread ) );
+		verify( monitors[i]->owner == kernelTLS.this_thread );
 	}
 
Index: src/libcfa/concurrency/preemption.c
===================================================================
--- src/libcfa/concurrency/preemption.c	(revision 3d60c08ac51fee59e5e460399d8f60bfa15e20c6)
+++ src/libcfa/concurrency/preemption.c	(revision b4a835dd16cbb368bdb18250055a0e85c62e213c)
@@ -149,5 +149,5 @@
 	// Disable interrupts by incrementing the counter
 	void disable_interrupts() {
-		with( TL_GET( preemption_state ) ) {
+		with( kernelTLS.preemption_state ) {
 			enabled = false;
 			__attribute__((unused)) unsigned short new_val = disable_count + 1;
@@ -160,8 +160,8 @@
 	// If counter reaches 0, execute any pending CtxSwitch
 	void enable_interrupts( __cfaabi_dbg_ctx_param ) {
-		processor   * proc = TL_GET( this_processor ); // Cache the processor now since interrupts can start happening after the atomic add
-		thread_desc * thrd = TL_GET( this_thread );	  // Cache the thread now since interrupts can start happening after the atomic add
-
-		with( TL_GET( preemption_state ) ){
+		processor   * proc = kernelTLS.this_processor; // Cache the processor now since interrupts can start happening after the atomic add
+		thread_desc * thrd = kernelTLS.this_thread;	  // Cache the thread now since interrupts can start happening after the atomic add
+
+		with( kernelTLS.preemption_state ){
 			unsigned short prev = disable_count;
 			disable_count -= 1;
@@ -185,9 +185,9 @@
 	// Don't execute any pending CtxSwitch even if counter reaches 0
 	void enable_interrupts_noPoll() {
-		unsigned short prev = TL_GET( preemption_state ).disable_count;
-		TL_GET( preemption_state ).disable_count -= 1;
+		unsigned short prev = kernelTLS.preemption_state.disable_count;
+		kernelTLS.preemption_state.disable_count -= 1;
 		verifyf( prev != 0u, "Incremented from %u\n", prev );                     // If this triggers someone is enabled already enabled interrupts
 		if( prev == 1 ) {
-			TL_GET( preemption_state ).enabled = true;
+			kernelTLS.preemption_state.enabled = true;
 		}
 	}
@@ -234,11 +234,14 @@
 }
 
-
+// KERNEL ONLY
 // Check if a CtxSwitch signal handler shoud defer
 // If true  : preemption is safe
 // If false : preemption is unsafe and marked as pending
 static inline bool preemption_ready() {
-	bool ready = TL_GET( preemption_state ).enabled && !TL_GET( preemption_state ).in_progress; // Check if preemption is safe
-	TL_GET( this_processor )->pending_preemption = !ready;			// Adjust the pending flag accordingly
+	// Check if preemption is safe
+	bool ready = kernelTLS.preemption_state.enabled && ! kernelTLS.preemption_state.in_progress;
+
+	// Adjust the pending flag accordingly
+	kernelTLS.this_processor->pending_preemption = !ready;
 	return ready;
 }
@@ -254,6 +257,6 @@
 
 	// Start with preemption disabled until ready
-	TL_GET( preemption_state ).enabled = false;
-	TL_GET( preemption_state ).disable_count = 1;
+	kernelTLS.preemption_state.enabled = false;
+	kernelTLS.preemption_state.disable_count = 1;
 
 	// Initialize the event kernel
@@ -320,9 +323,9 @@
 	// before the kernel thread has even started running. When that happens an iterrupt
 	// we a null 'this_processor' will be caught, just ignore it.
-	if(!TL_GET( this_processor )) return;
+	if(! kernelTLS.this_processor ) return;
 
 	choose(sfp->si_value.sival_int) {
 		case PREEMPT_NORMAL   : ;// Normal case, nothing to do here
-		case PREEMPT_TERMINATE: verify(TL_GET( this_processor )->do_terminate);
+		case PREEMPT_TERMINATE: verify( kernelTLS.this_processor->do_terminate);
 		default:
 			abort( "internal error, signal value is %d", sfp->si_value.sival_int );
@@ -332,13 +335,19 @@
 	if( !preemption_ready() ) { return; }
 
-	__cfaabi_dbg_print_buffer_decl( " KERNEL: preempting core %p (%p).\n", TL_GET( this_processor ), TL_GET( this_thread ) );
-
-	TL_GET( preemption_state ).in_progress = true;  // Sync flag : prevent recursive calls to the signal handler
-	signal_unblock( SIGUSR1 );                          // We are about to CtxSwitch out of the signal handler, let other handlers in
-	TL_GET( preemption_state ).in_progress = false; // Clear the in progress flag
+	__cfaabi_dbg_print_buffer_decl( " KERNEL: preempting core %p (%p).\n", kernelTLS.this_processor, kernelTLS.this_thread );
+
+	// Sync flag : prevent recursive calls to the signal handler
+	kernelTLS.preemption_state.in_progress = true;
+
+	// We are about to CtxSwitch out of the signal handler, let other handlers in
+	signal_unblock( SIGUSR1 );
+
+	// TODO: this should go in finish action
+	// Clear the in progress flag
+	kernelTLS.preemption_state.in_progress = false;
 
 	// Preemption can occur here
 
-	BlockInternal( (thread_desc*)TL_GET( this_thread ) ); // Do the actual CtxSwitch
+	BlockInternal( kernelTLS.this_thread ); // Do the actual CtxSwitch
 }
 
@@ -409,5 +418,5 @@
 
 void __cfaabi_check_preemption() {
-	bool ready = TL_GET( preemption_state ).enabled;
+	bool ready = kernelTLS.preemption_state.enabled;
 	if(!ready) { abort("Preemption should be ready"); }
 
Index: src/libcfa/concurrency/thread.c
===================================================================
--- src/libcfa/concurrency/thread.c	(revision 3d60c08ac51fee59e5e460399d8f60bfa15e20c6)
+++ src/libcfa/concurrency/thread.c	(revision b4a835dd16cbb368bdb18250055a0e85c62e213c)
@@ -81,5 +81,5 @@
 	disable_interrupts();
 	create_stack(&thrd_c->stack, thrd_c->stack.size);
-	TL_SET( this_coroutine, thrd_c );
+	kernelTLS.this_coroutine = thrd_c;
 	CtxStart(&this, CtxInvokeThread);
 	assert( thrd_c->last->stack.context );
@@ -91,6 +91,7 @@
 
 extern "C" {
+	// KERNEL ONLY
 	void __finish_creation(void) {
-		coroutine_desc* thrd_c = TL_GET( this_coroutine );
+		coroutine_desc* thrd_c = kernelTLS.this_coroutine;
 		ThreadCtxSwitch( thrd_c, thrd_c->last );
 	}
@@ -98,7 +99,9 @@
 
 void yield( void ) {
-	verify( TL_GET( preemption_state ).enabled );
+	// Safety note : This could cause some false positives due to preemption
+	verify( TL_GET( preemption_state.enabled ) );
 	BlockInternal( TL_GET( this_thread ) );
-	verify( TL_GET( preemption_state ).enabled );
+	// Safety note : This could cause some false positives due to preemption
+	verify( TL_GET( preemption_state.enabled ) );
 }
 
@@ -109,4 +112,5 @@
 }
 
+// KERNEL ONLY
 void ThreadCtxSwitch(coroutine_desc* src, coroutine_desc* dst) {
 	// set state of current coroutine to inactive
@@ -116,8 +120,8 @@
 	// set new coroutine that the processor is executing
 	// and context switch to it
-	TL_SET( this_coroutine, dst );
+	kernelTLS.this_coroutine = dst;
 	assert( src->stack.context );
 	CtxSwitch( src->stack.context, dst->stack.context );
-	TL_SET( this_coroutine, src );
+	kernelTLS.this_coroutine = src;
 
 	// set state of new coroutine to active
Index: src/main.cc
===================================================================
--- src/main.cc	(revision 3d60c08ac51fee59e5e460399d8f60bfa15e20c6)
+++ src/main.cc	(revision b4a835dd16cbb368bdb18250055a0e85c62e213c)
@@ -371,5 +371,18 @@
 		} // if
 		return 1;
-	} // try
+	} catch(...) {
+		std::exception_ptr eptr = std::current_exception();
+		try {
+			if (eptr) {
+				std::rethrow_exception(eptr);
+			}
+			else {
+				std::cerr << "Exception Uncaught and Unknown" << std::endl;
+			}
+		} catch(const std::exception& e) {
+			std::cerr << "Uncaught Exception \"" << e.what() << "\"\n";
+		}
+		return 1;
+	} // try
 
 	deleteAll( translationUnit );
