Index: src/libcfa/bits/containers.h
===================================================================
--- src/libcfa/bits/containers.h	(revision 10cfad9d3d9c4cef2b74f4ef3fab06d8cbe04ae5)
+++ src/libcfa/bits/containers.h	(revision 14a61b5e0193d16a9724adf11613922ed2eeec81)
@@ -186,4 +186,78 @@
 #endif
 
+
+//-----------------------------------------------------------------------------
+// Doubly Linked List
+//-----------------------------------------------------------------------------
+#ifdef __cforall
+	trait is_db_node(dtype T) {
+		T*& get_next( T& );
+		T*& get_prev( T& );
+	};
+#endif
+
+#ifdef __cforall
+	forall(dtype TYPE | is_db_node(TYPE))
+	#define T TYPE
+#else
+	#define T void
+#endif
+struct __dllist {
+	T * head;
+};
+#undef T
+
+#ifdef __cforall
+#define __dllist_t(T) __dllist(T)
+#else
+#define __dllist_t(T) struct __dllist
+#endif
+
+#ifdef __cforall
+
+	forall(dtype T | is_db_node(T))
+	static inline void ?{}( __dllist(T) & this ) with( this ) {
+		head{ NULL };
+	}
+
+	// forall(dtype T | is_db_node(T) | sized(T))
+	// static inline void push_front( __dllist(T) & this, T & node ) with( this ) {
+	// 	if ( head ) {
+	// 		get_next( node ) = head;
+	// 		get_prev( node ) = get_prev( *head );
+	// 		// inserted node must be consistent before it is seen
+	// 		// prevent code movement across barrier
+	// 		asm( "" : : : "memory" );
+	// 		get_prev( *head ) = node;
+	// 		T & prev = *get_prev( node );
+	// 		get_next( prev ) = node;
+	// 	}
+	// 	else {
+	// 		get_next( node ) = &node;
+	// 		get_prev( node ) = &node;
+	// 	}
+
+	// 	// prevent code movement across barrier
+	// 	asm( "" : : : "memory" );
+	// 	head = val;
+	// }
+
+	// forall(dtype T | is_db_node(T) | sized(T))
+	// static inline T * remove( __dllist(T) & this, T & node ) with( this ) {
+	// 	if ( &node == head ) {
+	// 		if ( get_next( *head ) == head ) {
+	// 			head = NULL;
+	// 		}
+	// 		else {
+	// 			head = get_next( *head );
+	// 		}
+	// 	}
+	// 	get_prev( *get_next( node ) ) = get_prev( node );
+	// 	get_next( *get_prev( node ) ) = get_next( node );
+	// 	get_next( node ) = NULL;
+	// 	get_prev( node ) = NULL;
+	// }
+#endif
+
 //-----------------------------------------------------------------------------
 // Tools
Index: src/libcfa/concurrency/coroutine
===================================================================
--- src/libcfa/concurrency/coroutine	(revision 10cfad9d3d9c4cef2b74f4ef3fab06d8cbe04ae5)
+++ src/libcfa/concurrency/coroutine	(revision 14a61b5e0193d16a9724adf11613922ed2eeec81)
@@ -72,5 +72,10 @@
 // Suspend implementation inlined for performance
 static inline void suspend() {
-	coroutine_desc * src = TL_GET( this_coroutine );			// optimization
+	// optimization : read TLS once and reuse it
+	// Safety note: this is preemption safe since if
+	// preemption occurs after this line, the pointer
+	// will also migrate which means this value will
+	// stay in sync with the TLS
+	coroutine_desc * src = TL_GET( this_coroutine );
 
 	assertf( src->last != 0,
@@ -89,5 +94,10 @@
 forall(dtype T | is_coroutine(T))
 static inline void resume(T & cor) {
-	coroutine_desc * src = TL_GET( this_coroutine );			// optimization
+	// optimization : read TLS once and reuse it
+	// Safety note: this is preemption safe since if
+	// preemption occurs after this line, the pointer
+	// will also migrate which means this value will
+	// stay in sync with the TLS
+	coroutine_desc * src = TL_GET( this_coroutine );
 	coroutine_desc * dst = get_coroutine(cor);
 
@@ -107,5 +117,5 @@
 		dst->last = src;
 		dst->starter = dst->starter ? dst->starter : src;
-	} // if
+	}
 
 	// always done for performance testing
@@ -114,5 +124,10 @@
 
 static inline void resume(coroutine_desc * dst) {
-	coroutine_desc * src = TL_GET( this_coroutine );			// optimization
+	// optimization : read TLS once and reuse it
+	// Safety note: this is preemption safe since if
+	// preemption occurs after this line, the pointer
+	// will also migrate which means this value will
+	// stay in sync with the TLS
+	coroutine_desc * src = TL_GET( this_coroutine );
 
 	// not resuming self ?
@@ -125,5 +140,5 @@
 		// set last resumer
 		dst->last = src;
-	} // if
+	}
 
 	// always done for performance testing
Index: src/libcfa/concurrency/coroutine.c
===================================================================
--- src/libcfa/concurrency/coroutine.c	(revision 10cfad9d3d9c4cef2b74f4ef3fab06d8cbe04ae5)
+++ src/libcfa/concurrency/coroutine.c	(revision 14a61b5e0193d16a9724adf11613922ed2eeec81)
@@ -84,4 +84,5 @@
 // Wrapper for co
 void CoroutineCtxSwitch(coroutine_desc* src, coroutine_desc* dst) {
+      // Safety note : This could cause some false positives due to preemption
       verify( TL_GET( preemption_state ).enabled || TL_GET( this_processor )->do_terminate );
       disable_interrupts();
@@ -91,5 +92,5 @@
 
       // set new coroutine that task is executing
-      TL_SET( this_coroutine, dst );
+      kernelTLS.this_coroutine = dst;
 
       // context switch to specified coroutine
@@ -102,4 +103,5 @@
 
       enable_interrupts( __cfaabi_dbg_ctx );
+      // Safety note : This could cause some false positives due to preemption
       verify( TL_GET( preemption_state ).enabled || TL_GET( this_processor )->do_terminate );
 } //ctxSwitchDirect
Index: src/libcfa/concurrency/invoke.h
===================================================================
--- src/libcfa/concurrency/invoke.h	(revision 10cfad9d3d9c4cef2b74f4ef3fab06d8cbe04ae5)
+++ src/libcfa/concurrency/invoke.h	(revision 14a61b5e0193d16a9724adf11613922ed2eeec81)
@@ -18,6 +18,6 @@
 #include "bits/locks.h"
 
-#define TL_GET( member ) kernelThreadData.member
-#define TL_SET( member, value ) kernelThreadData.member = value;
+#define TL_GET( member ) kernelTLS.member
+#define TL_SET( member, value ) kernelTLS.member = value;
 
 #ifdef __cforall
@@ -44,10 +44,10 @@
 				volatile bool in_progress;
 			} preemption_state;
-		} kernelThreadData;
+		} kernelTLS;
 	}
 
 	static inline struct coroutine_desc * volatile active_coroutine() { return TL_GET( this_coroutine ); }
-	static inline struct thread_desc * volatile active_thread() { return TL_GET( this_thread ); }
-	static inline struct processor * volatile active_processor() { return TL_GET( this_processor ); }
+	static inline struct thread_desc    * volatile active_thread   () { return TL_GET( this_thread    ); }
+	// static inline struct processor      * volatile active_processor() { return TL_GET( this_processor ); } // UNSAFE
 	#endif
 
Index: src/libcfa/concurrency/kernel
===================================================================
--- src/libcfa/concurrency/kernel	(revision 10cfad9d3d9c4cef2b74f4ef3fab06d8cbe04ae5)
+++ src/libcfa/concurrency/kernel	(revision 14a61b5e0193d16a9724adf11613922ed2eeec81)
@@ -53,4 +53,7 @@
 	// Preemption rate on this cluster
 	Duration preemption_rate;
+
+	// List of idle processors
+	// __dllist_t(struct processor) idles;
 };
 
@@ -124,4 +127,9 @@
 	bool pending_preemption;
 
+	struct {
+		pthread_mutex_t lock;
+		pthread_cond_t  cond;
+	} idle;
+
 #ifdef __CFA_DEBUG__
 	// Last function to enable preemption on this processor
Index: src/libcfa/concurrency/kernel.c
===================================================================
--- src/libcfa/concurrency/kernel.c	(revision 10cfad9d3d9c4cef2b74f4ef3fab06d8cbe04ae5)
+++ src/libcfa/concurrency/kernel.c	(revision 14a61b5e0193d16a9724adf11613922ed2eeec81)
@@ -56,5 +56,5 @@
 // volatile thread_local unsigned short disable_preempt_count = 1;
 
-thread_local struct KernelThreadData kernelThreadData = {
+thread_local struct KernelThreadData kernelTLS = {
 	NULL,
 	NULL,
@@ -155,7 +155,7 @@
 		terminate(&this);
 		verify(this.do_terminate);
-		verify(TL_GET( this_processor ) != &this);
+		verify( kernelTLS.this_processor != &this);
 		P( terminated );
-		verify(TL_GET( this_processor ) != &this);
+		verify( kernelTLS.this_processor != &this);
 		pthread_join( kernel_thread, NULL );
 	}
@@ -196,9 +196,9 @@
 			if(readyThread)
 			{
-				verify( ! TL_GET( preemption_state ).enabled );
+				verify( ! kernelTLS.preemption_state.enabled );
 
 				runThread(this, readyThread);
 
-				verify( ! TL_GET( preemption_state ).enabled );
+				verify( ! kernelTLS.preemption_state.enabled );
 
 				//Some actions need to be taken from the kernel
@@ -221,4 +221,5 @@
 }
 
+// KERNEL ONLY
 // runThread runs a thread by context switching
 // from the processor coroutine to the target thread
@@ -228,9 +229,9 @@
 	coroutine_desc * thrd_cor = dst->curr_cor;
 
-	//Reset the terminating actions here
+	// Reset the terminating actions here
 	this->finish.action_code = No_Action;
 
-	//Update global state
-	TL_SET( this_thread, dst );
+	// Update global state
+	kernelTLS.this_thread = dst;
 
 	// Context Switch to the thread
@@ -239,15 +240,17 @@
 }
 
+// KERNEL_ONLY
 void returnToKernel() {
-	coroutine_desc * proc_cor = get_coroutine(TL_GET( this_processor )->runner);
-	coroutine_desc * thrd_cor = TL_GET( this_thread )->curr_cor = TL_GET( this_coroutine );
+	coroutine_desc * proc_cor = get_coroutine(kernelTLS.this_processor->runner);
+	coroutine_desc * thrd_cor = kernelTLS.this_thread->curr_cor = kernelTLS.this_coroutine;
 	ThreadCtxSwitch(thrd_cor, proc_cor);
 }
 
+// KERNEL_ONLY
 // Once a thread has finished running, some of
 // its final actions must be executed from the kernel
 void finishRunning(processor * this) with( this->finish ) {
 	if( action_code == Release ) {
-		verify( ! TL_GET( preemption_state ).enabled );
+		verify( ! kernelTLS.preemption_state.enabled );
 		unlock( *lock );
 	}
@@ -256,10 +259,10 @@
 	}
 	else if( action_code == Release_Schedule ) {
-		verify( ! TL_GET( preemption_state ).enabled );
+		verify( ! kernelTLS.preemption_state.enabled );
 		unlock( *lock );
 		ScheduleThread( thrd );
 	}
 	else if( action_code == Release_Multi ) {
-		verify( ! TL_GET( preemption_state ).enabled );
+		verify( ! kernelTLS.preemption_state.enabled );
 		for(int i = 0; i < lock_count; i++) {
 			unlock( *locks[i] );
@@ -285,4 +288,5 @@
 }
 
+// KERNEL_ONLY
 // Context invoker for processors
 // This is the entry point for processors (kernel threads)
@@ -290,8 +294,8 @@
 void * CtxInvokeProcessor(void * arg) {
 	processor * proc = (processor *) arg;
-	TL_SET( this_processor, proc );
-	TL_SET( this_coroutine, NULL );
-	TL_SET( this_thread, NULL );
-	TL_GET( preemption_state ).[enabled, disable_count] = [false, 1];
+	kernelTLS.this_processor = proc;
+	kernelTLS.this_coroutine = NULL;
+	kernelTLS.this_thread    = NULL;
+	kernelTLS.preemption_state.[enabled, disable_count] = [false, 1];
 	// SKULLDUGGERY: We want to create a context for the processor coroutine
 	// which is needed for the 2-step context switch. However, there is no reason
@@ -305,6 +309,6 @@
 
 	//Set global state
-	TL_SET( this_coroutine, get_coroutine(proc->runner) );
-	TL_SET( this_thread, NULL );
+	kernelTLS.this_coroutine = get_coroutine(proc->runner);
+	kernelTLS.this_thread    = NULL;
 
 	//We now have a proper context from which to schedule threads
@@ -333,14 +337,15 @@
 }
 
+// KERNEL_ONLY
 void kernel_first_resume(processor * this) {
-	coroutine_desc * src = TL_GET( this_coroutine );
+	coroutine_desc * src = kernelTLS.this_coroutine;
 	coroutine_desc * dst = get_coroutine(this->runner);
 
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 
 	create_stack(&dst->stack, dst->stack.size);
 	CtxStart(&this->runner, CtxInvokeCoroutine);
 
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 
 	dst->last = src;
@@ -351,5 +356,5 @@
 
 	// set new coroutine that task is executing
-	TL_SET( this_coroutine, dst );
+	kernelTLS.this_coroutine = dst;
 
+	// SKULLDUGGERY normally interrupts are enabled before leaving a coroutine ctxswitch.
@@ -368,15 +373,16 @@
 	src->state = Active;
 
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 }
 
 //-----------------------------------------------------------------------------
 // Scheduler routines
+
+// KERNEL ONLY
 void ScheduleThread( thread_desc * thrd ) {
-	// if( ! thrd ) return;
 	verify( thrd );
 	verify( thrd->self_cor.state != Halted );
 
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 
 	verifyf( thrd->next == NULL, "Expected null got %p", thrd->next );
@@ -388,13 +394,14 @@
 	}
 
-	verify( ! TL_GET( preemption_state ).enabled );
-}
-
+	verify( ! kernelTLS.preemption_state.enabled );
+}
+
+// KERNEL ONLY
 thread_desc * nextThread(cluster * this) with( *this ) {
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 	lock( ready_queue_lock __cfaabi_dbg_ctx2 );
 	thread_desc * head = pop_head( ready_queue );
 	unlock( ready_queue_lock );
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 	return head;
 }
@@ -402,7 +409,7 @@
 void BlockInternal() {
 	disable_interrupts();
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 	returnToKernel();
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 	enable_interrupts( __cfaabi_dbg_ctx );
 }
@@ -410,12 +417,12 @@
 void BlockInternal( __spinlock_t * lock ) {
 	disable_interrupts();
-	with( *TL_GET( this_processor ) ) {
+	with( *kernelTLS.this_processor ) {
 		finish.action_code = Release;
 		finish.lock        = lock;
 	}
 
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! preemption_state.enabled );
 	returnToKernel();
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! preemption_state.enabled );
 
 	enable_interrupts( __cfaabi_dbg_ctx );
@@ -424,12 +431,12 @@
 void BlockInternal( thread_desc * thrd ) {
 	disable_interrupts();
-	with( *TL_GET( this_processor ) ) {
+	with( * kernelTLS.this_processor ) {
 		finish.action_code = Schedule;
 		finish.thrd        = thrd;
 	}
 
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 	returnToKernel();
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 
 	enable_interrupts( __cfaabi_dbg_ctx );
@@ -439,5 +446,5 @@
 	assert(thrd);
 	disable_interrupts();
-	with( *TL_GET( this_processor ) ) {
+	with( * kernelTLS.this_processor ) {
 		finish.action_code = Release_Schedule;
 		finish.lock        = lock;
@@ -445,7 +452,7 @@
 	}
 
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 	returnToKernel();
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 
 	enable_interrupts( __cfaabi_dbg_ctx );
@@ -454,5 +461,5 @@
 void BlockInternal(__spinlock_t * locks [], unsigned short count) {
 	disable_interrupts();
-	with( *TL_GET( this_processor ) ) {
+	with( * kernelTLS.this_processor ) {
 		finish.action_code = Release_Multi;
 		finish.locks       = locks;
@@ -460,7 +467,7 @@
 	}
 
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 	returnToKernel();
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 
 	enable_interrupts( __cfaabi_dbg_ctx );
@@ -469,5 +476,5 @@
 void BlockInternal(__spinlock_t * locks [], unsigned short lock_count, thread_desc * thrds [], unsigned short thrd_count) {
 	disable_interrupts();
-	with( *TL_GET( this_processor ) ) {
+	with( *kernelTLS.this_processor ) {
 		finish.action_code = Release_Multi_Schedule;
 		finish.locks       = locks;
@@ -477,14 +484,15 @@
 	}
 
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 	returnToKernel();
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 
 	enable_interrupts( __cfaabi_dbg_ctx );
 }
 
+// KERNEL ONLY
 void LeaveThread(__spinlock_t * lock, thread_desc * thrd) {
-	verify( ! TL_GET( preemption_state ).enabled );
-	with( *TL_GET( this_processor ) ) {
+	verify( ! kernelTLS.preemption_state.enabled );
+	with( * kernelTLS.this_processor ) {
 		finish.action_code = thrd ? Release_Schedule : Release;
 		finish.lock        = lock;
@@ -501,5 +509,5 @@
 // Kernel boot procedures
 void kernel_startup(void) {
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 	__cfaabi_dbg_print_safe("Kernel : Starting\n");
 
@@ -547,7 +555,7 @@
 
 	//initialize the global state variables
-	TL_SET( this_processor, mainProcessor );
-	TL_SET( this_thread, mainThread );
-	TL_SET( this_coroutine, &mainThread->self_cor );
+	kernelTLS.this_processor = mainProcessor;
+	kernelTLS.this_thread    = mainThread;
+	kernelTLS.this_coroutine = &mainThread->self_cor;
 
 	// Enable preemption
@@ -561,5 +569,5 @@
 	// context. Hence, the main thread does not begin through CtxInvokeThread, like all other threads. The trick here is that
 	// mainThread is on the ready queue when this call is made.
-	kernel_first_resume( TL_GET( this_processor ) );
+	kernel_first_resume( kernelTLS.this_processor );
 
 
@@ -568,5 +576,5 @@
 	__cfaabi_dbg_print_safe("Kernel : Started\n--------------------------------------------------\n\n");
 
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 	enable_interrupts( __cfaabi_dbg_ctx );
 	verify( TL_GET( preemption_state ).enabled );
@@ -578,5 +586,5 @@
 	verify( TL_GET( preemption_state ).enabled );
 	disable_interrupts();
-	verify( ! TL_GET( preemption_state ).enabled );
+	verify( ! kernelTLS.preemption_state.enabled );
 
 	// SKULLDUGGERY: Notify the mainProcessor it needs to terminates.
@@ -602,4 +610,26 @@
 	__cfaabi_dbg_print_safe("Kernel : Shutdown complete\n");
 }
+
+//=============================================================================================
+// Kernel Quiescing
+//=============================================================================================
+
+// void halt(processor * this) with( this ) {
+// 	pthread_mutex_lock( &idle.lock );
+
+
+
+// 	// SKULLDUGGERY: Even if spurious wake-up is a thing
+// 	// spuriously waking up a kernel thread is not a big deal
+// 	// if it is very rare.
+// 	pthread_cond_wait( &idle.cond, &idle.lock);
+// 	pthread_mutex_unlock( &idle.lock );
+// }
+
+// void wake(processor * this) with( this ) {
+// 	pthread_mutex_lock  (&idle.lock);
+// 	pthread_cond_signal (&idle.cond);
+// 	pthread_mutex_unlock(&idle.lock);
+// }
 
 //=============================================================================================
@@ -633,5 +663,5 @@
 	}
 
-	return TL_GET( this_thread );
+	return kernelTLS.this_thread;
 }
 
@@ -642,6 +672,6 @@
 	__cfaabi_dbg_bits_write( abort_text, len );
 
-	if ( get_coroutine(thrd) != TL_GET( this_coroutine ) ) {
-		len = snprintf( abort_text, abort_text_size, " in coroutine %.256s (%p).\n", TL_GET( this_coroutine )->name, TL_GET( this_coroutine ) );
+	if ( get_coroutine(thrd) != kernelTLS.this_coroutine ) {
+		len = snprintf( abort_text, abort_text_size, " in coroutine %.256s (%p).\n", kernelTLS.this_coroutine->name, kernelTLS.this_coroutine );
 		__cfaabi_dbg_bits_write( abort_text, len );
 	}
@@ -652,5 +682,5 @@
 
 int kernel_abort_lastframe( void ) __attribute__ ((__nothrow__)) {
-	return get_coroutine(TL_GET( this_thread )) == get_coroutine(mainThread) ? 4 : 2;
+	return get_coroutine(kernelTLS.this_thread) == get_coroutine(mainThread) ? 4 : 2;
 }
 
@@ -682,5 +712,5 @@
 	if ( count < 0 ) {
 		// queue current task
-		append( waiting, (thread_desc *)TL_GET( this_thread ) );
+		append( waiting, kernelTLS.this_thread );
 
 		// atomically release spin lock and block
@@ -742,5 +772,5 @@
 	void __cfaabi_dbg_record(__spinlock_t & this, const char * prev_name) {
 		this.prev_name = prev_name;
-		this.prev_thrd = TL_GET( this_thread );
+		this.prev_thrd = kernelTLS.this_thread;
 	}
 )
Index: src/libcfa/concurrency/monitor.c
===================================================================
--- src/libcfa/concurrency/monitor.c	(revision 10cfad9d3d9c4cef2b74f4ef3fab06d8cbe04ae5)
+++ src/libcfa/concurrency/monitor.c	(revision 14a61b5e0193d16a9724adf11613922ed2eeec81)
@@ -85,5 +85,6 @@
 		// Lock the monitor spinlock
 		lock( this->lock __cfaabi_dbg_ctx2 );
-		thread_desc * thrd = TL_GET( this_thread );
+		// Interrupts disabled inside critical section
+		thread_desc * thrd = kernelTLS.this_thread;
 
 		__cfaabi_dbg_print_safe( "Kernel : %10p Entering mon %p (%p)\n", thrd, this, this->owner);
@@ -134,5 +135,6 @@
 		// Lock the monitor spinlock
 		lock( this->lock __cfaabi_dbg_ctx2 );
-		thread_desc * thrd = TL_GET( this_thread );
+		// Interrupts disabled inside critical section
+		thread_desc * thrd = kernelTLS.this_thread;
 
 		__cfaabi_dbg_print_safe( "Kernel : %10p Entering dtor for mon %p (%p)\n", thrd, this, this->owner);
@@ -168,5 +170,5 @@
 
 			// Create the node specific to this wait operation
-			wait_ctx_primed( TL_GET( this_thread ), 0 )
+			wait_ctx_primed( thrd, 0 )
 
 			// Some one else has the monitor, wait for him to finish and then run
@@ -179,5 +181,5 @@
 			__cfaabi_dbg_print_safe( "Kernel :  blocking \n" );
 
-			wait_ctx( TL_GET( this_thread ), 0 )
+			wait_ctx( thrd, 0 )
 			this->dtor_node = &waiter;
 
@@ -199,7 +201,7 @@
 		lock( this->lock __cfaabi_dbg_ctx2 );
 
-		__cfaabi_dbg_print_safe( "Kernel : %10p Leaving mon %p (%p)\n", TL_GET( this_thread ), this, this->owner);
-
-		verifyf( TL_GET( this_thread ) == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", TL_GET( this_thread ), this->owner, this->recursion, this );
+		__cfaabi_dbg_print_safe( "Kernel : %10p Leaving mon %p (%p)\n", kernelTLS.this_thread, this, this->owner);
+
+		verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
 
 		// Leaving a recursion level, decrement the counter
@@ -289,4 +291,6 @@
 // Sorts monitors before entering
 void ?{}( monitor_guard_t & this, monitor_desc * m [], __lock_size_t count, fptr_t func ) {
+	thread_desc * thrd = TL_GET( this_thread );
+
 	// Store current array
 	this.m = m;
@@ -297,8 +301,8 @@
 
 	// Save previous thread context
-	this.prev = TL_GET( this_thread )->monitors;
+	this.prev = thrd->monitors;
 
 	// Update thread context (needed for conditions)
-	(TL_GET( this_thread )->monitors){m, count, func};
+	(thrd->monitors){m, count, func};
 
 	// __cfaabi_dbg_print_safe( "MGUARD : enter %d\n", count);
@@ -328,12 +332,15 @@
 // Sorts monitors before entering
 void ?{}( monitor_dtor_guard_t & this, monitor_desc * m [], fptr_t func ) {
+	// optimization
+	thread_desc * thrd = TL_GET( this_thread );
+
 	// Store current array
 	this.m = *m;
 
 	// Save previous thread context
-	this.prev = TL_GET( this_thread )->monitors;
+	this.prev = thrd->monitors;
 
 	// Update thread context (needed for conditions)
-	(TL_GET( this_thread )->monitors){m, 1, func};
+	(thrd->monitors){m, 1, func};
 
 	__enter_monitor_dtor( this.m, func );
@@ -566,5 +573,5 @@
 
 				// Create the node specific to this wait operation
-				wait_ctx_primed( TL_GET( this_thread ), 0 );
+				wait_ctx_primed( kernelTLS.this_thread, 0 );
 
 				// Save monitor states
@@ -612,5 +619,5 @@
 
 	// Create the node specific to this wait operation
-	wait_ctx_primed( TL_GET( this_thread ), 0 );
+	wait_ctx_primed( kernelTLS.this_thread, 0 );
 
 	monitor_save;
@@ -618,5 +625,5 @@
 
 	for( __lock_size_t i = 0; i < count; i++) {
-		verify( monitors[i]->owner == TL_GET( this_thread ) );
+		verify( monitors[i]->owner == kernelTLS.this_thread );
 	}
 
Index: src/libcfa/concurrency/preemption.c
===================================================================
--- src/libcfa/concurrency/preemption.c	(revision 10cfad9d3d9c4cef2b74f4ef3fab06d8cbe04ae5)
+++ src/libcfa/concurrency/preemption.c	(revision 14a61b5e0193d16a9724adf11613922ed2eeec81)
@@ -234,11 +234,14 @@
 }
 
-
+// KERNEL ONLY
 // Check if a CtxSwitch signal handler shoud defer
 // If true  : preemption is safe
 // If false : preemption is unsafe and marked as pending
 static inline bool preemption_ready() {
-	bool ready = TL_GET( preemption_state ).enabled && !TL_GET( preemption_state ).in_progress; // Check if preemption is safe
-	TL_GET( this_processor )->pending_preemption = !ready;			// Adjust the pending flag accordingly
+	// Check if preemption is safe
+	bool ready = kernelTLS.preemption_state.enabled && ! kernelTLS.preemption_state.in_progress;
+
+	// Adjust the pending flag accordingly
+	kernelTLS.this_processor->pending_preemption = !ready;
 	return ready;
 }
@@ -254,6 +257,6 @@
 
 	// Start with preemption disabled until ready
-	TL_GET( preemption_state ).enabled = false;
-	TL_GET( preemption_state ).disable_count = 1;
+	kernelTLS.preemption_state.enabled = false;
+	kernelTLS.preemption_state.disable_count = 1;
 
 	// Initialize the event kernel
@@ -320,9 +323,9 @@
 	// before the kernel thread has even started running. When that happens an iterrupt
 	// we a null 'this_processor' will be caught, just ignore it.
-	if(!TL_GET( this_processor )) return;
+	if(! kernelTLS.this_processor ) return;
 
 	choose(sfp->si_value.sival_int) {
 		case PREEMPT_NORMAL   : ;// Normal case, nothing to do here
-		case PREEMPT_TERMINATE: verify(TL_GET( this_processor )->do_terminate);
+		case PREEMPT_TERMINATE: verify( kernelTLS.this_processor->do_terminate);
 		default:
 			abort( "internal error, signal value is %d", sfp->si_value.sival_int );
@@ -332,13 +335,19 @@
 	if( !preemption_ready() ) { return; }
 
-	__cfaabi_dbg_print_buffer_decl( " KERNEL: preempting core %p (%p).\n", TL_GET( this_processor ), TL_GET( this_thread ) );
-
-	TL_GET( preemption_state ).in_progress = true;  // Sync flag : prevent recursive calls to the signal handler
-	signal_unblock( SIGUSR1 );                          // We are about to CtxSwitch out of the signal handler, let other handlers in
-	TL_GET( preemption_state ).in_progress = false; // Clear the in progress flag
+	__cfaabi_dbg_print_buffer_decl( " KERNEL: preempting core %p (%p).\n", kernelTLS.this_processor, kernelTLS.this_thread );
+
+	// Sync flag : prevent recursive calls to the signal handler
+	kernelTLS.preemption_state.in_progress = true;
+
+	// We are about to CtxSwitch out of the signal handler, let other handlers in
+	signal_unblock( SIGUSR1 );
+
+	// TODO: this should go in finish action
+	// Clear the in progress flag
+	kernelTLS.preemption_state.in_progress = false;
 
 	// Preemption can occur here
 
-	BlockInternal( (thread_desc*)TL_GET( this_thread ) ); // Do the actual CtxSwitch
+	BlockInternal( kernelTLS.this_thread ); // Do the actual CtxSwitch
 }
 
@@ -409,5 +418,5 @@
 
 void __cfaabi_check_preemption() {
-	bool ready = TL_GET( preemption_state ).enabled;
+	bool ready = kernelTLS.preemption_state.enabled;
 	if(!ready) { abort("Preemption should be ready"); }
 
Index: src/libcfa/concurrency/thread.c
===================================================================
--- src/libcfa/concurrency/thread.c	(revision 10cfad9d3d9c4cef2b74f4ef3fab06d8cbe04ae5)
+++ src/libcfa/concurrency/thread.c	(revision 14a61b5e0193d16a9724adf11613922ed2eeec81)
@@ -81,5 +81,5 @@
 	disable_interrupts();
 	create_stack(&thrd_c->stack, thrd_c->stack.size);
-	TL_SET( this_coroutine, thrd_c );
+	kernelTLS.this_coroutine = thrd_c;
 	CtxStart(&this, CtxInvokeThread);
 	assert( thrd_c->last->stack.context );
@@ -91,6 +91,7 @@
 
 extern "C" {
+	// KERNEL ONLY
 	void __finish_creation(void) {
-		coroutine_desc* thrd_c = TL_GET( this_coroutine );
+		coroutine_desc* thrd_c = kernelTLS.this_coroutine;
 		ThreadCtxSwitch( thrd_c, thrd_c->last );
 	}
@@ -98,7 +99,9 @@
 
 void yield( void ) {
-	verify( TL_GET( preemption_state ).enabled );
+	// Safety note : This could cause some false positives due to preemption
+      verify( TL_GET( preemption_state ).enabled );
 	BlockInternal( TL_GET( this_thread ) );
-	verify( TL_GET( preemption_state ).enabled );
+	// Safety note : This could cause some false positives due to preemption
+      verify( TL_GET( preemption_state ).enabled );
 }
 
@@ -109,4 +112,5 @@
 }
 
+// KERNEL ONLY
 void ThreadCtxSwitch(coroutine_desc* src, coroutine_desc* dst) {
 	// set state of current coroutine to inactive
@@ -116,8 +120,8 @@
 	// set new coroutine that the processor is executing
 	// and context switch to it
-	TL_SET( this_coroutine, dst );
+	kernelTLS.this_coroutine = dst;
 	assert( src->stack.context );
 	CtxSwitch( src->stack.context, dst->stack.context );
-	TL_SET( this_coroutine, src );
+	kernelTLS.this_coroutine = src;
 
 	// set state of new coroutine to active
