Index: libcfa/src/concurrency/CtxSwitch-x86_64.S
===================================================================
--- libcfa/src/concurrency/CtxSwitch-x86_64.S	(revision 76d3ca627c3ea893099f6a38248fd639209fdfe5)
+++ libcfa/src/concurrency/CtxSwitch-x86_64.S	(revision 342136ab8790110e02e6008ab5bf329dcd6d9aa1)
@@ -88,4 +88,59 @@
 	ret
 
+//.text
+//	.align 2
+//.globl	CtxStore
+//CtxStore:
+//	// Save floating & SSE control words on the stack.
+//
+//	subq   $8,%rsp
+//	stmxcsr 0(%rsp)         // 4 bytes
+//	fnstcw  4(%rsp)         // 2 bytes
+//
+//	// Save volatile registers on the stack.
+//
+//	pushq %r15
+//	pushq %r14
+//	pushq %r13
+//	pushq %r12
+//	pushq %rbx
+//
+//	// Save old context in the "from" area.
+//
+//	movq %rsp,SP_OFFSET(%rdi)
+//	movq %rbp,FP_OFFSET(%rdi)
+//
+//	// Return to thread
+//
+//	ret
+//
+//.text
+//	.align 2
+//.globl 	CtxRet
+//CtxRet:
+//	// Load new context from the "to" area.
+//
+//	movq SP_OFFSET(%rdi),%rsp
+//	movq FP_OFFSET(%rdi),%rbp
+//
+//	// Load volatile registers from the stack.
+//
+//	popq %rbx
+//	popq %r12
+//	popq %r13
+//	popq %r14
+//	popq %r15
+//
+//	// Load floating & SSE control words from the stack.
+//
+//	fldcw   4(%rsp)
+//	ldmxcsr 0(%rsp)
+//	addq   $8,%rsp
+//
+//	// Return to thread.
+//
+//	ret
+
+
 .text
 	.align 2
Index: libcfa/src/concurrency/coroutine.cfa
===================================================================
--- libcfa/src/concurrency/coroutine.cfa	(revision 76d3ca627c3ea893099f6a38248fd639209fdfe5)
+++ libcfa/src/concurrency/coroutine.cfa	(revision 342136ab8790110e02e6008ab5bf329dcd6d9aa1)
@@ -35,5 +35,5 @@
 
 extern "C" {
-      void _CtxCoroutine_Unwind(struct _Unwind_Exception * storage) __attribute__ ((__noreturn__));
+      void _CtxCoroutine_Unwind(struct _Unwind_Exception * storage, struct coroutine_desc *) __attribute__ ((__noreturn__));
       static void _CtxCoroutine_UnwindCleanup(_Unwind_Reason_Code, struct _Unwind_Exception *) __attribute__ ((__noreturn__));
       static void _CtxCoroutine_UnwindCleanup(_Unwind_Reason_Code, struct _Unwind_Exception *) {
@@ -84,5 +84,5 @@
 void ^?{}(coroutine_desc& this) {
       if(this.state != Halted && this.state != Start) {
-            coroutine_desc * src = TL_GET( this_coroutine );
+            coroutine_desc * src = TL_GET( this_thread )->curr_cor;
             coroutine_desc * dst = &this;
 
@@ -115,5 +115,6 @@
 // Wrapper for co
 void CoroutineCtxSwitch(coroutine_desc* src, coroutine_desc* dst) {
-      // Safety note : This could cause some false positives due to preemption
+      // Safety note : Preemption must be disabled since there is a race condition:
+      // kernelTLS.this_thread->curr_cor and $rsp/$rbp must agree at all times
       verify( TL_GET( preemption_state.enabled ) || TL_GET( this_processor )->do_terminate );
       disable_interrupts();
@@ -123,5 +124,5 @@
 
       // set new coroutine that task is executing
-      kernelTLS.this_coroutine = dst;
+      TL_GET( this_thread )->curr_cor = dst;
 
       // context switch to specified coroutine
@@ -134,9 +135,9 @@
 
       enable_interrupts( __cfaabi_dbg_ctx );
-      // Safety note : This could cause some false positives due to preemption
       verify( TL_GET( preemption_state.enabled ) || TL_GET( this_processor )->do_terminate );
 
+
       if( unlikely(src->cancellation != NULL) ) {
-            _CtxCoroutine_Unwind(src->cancellation);
+            _CtxCoroutine_Unwind(src->cancellation, src);
       }
 } //ctxSwitchDirect
@@ -197,6 +198,5 @@
       }
 
-      void __leave_coroutine() {
-            coroutine_desc * src = TL_GET( this_coroutine ); // optimization
+      void __leave_coroutine( coroutine_desc * src ) {
             coroutine_desc * starter = src->cancellation != 0 ? src->last : src->starter;
 
Index: libcfa/src/concurrency/coroutine.hfa
===================================================================
--- libcfa/src/concurrency/coroutine.hfa	(revision 76d3ca627c3ea893099f6a38248fd639209fdfe5)
+++ libcfa/src/concurrency/coroutine.hfa	(revision 342136ab8790110e02e6008ab5bf329dcd6d9aa1)
@@ -77,5 +77,5 @@
 	// will also migrate which means this value will
 	// stay in syn with the TLS
-	coroutine_desc * src = TL_GET( this_coroutine );
+	coroutine_desc * src = TL_GET( this_thread )->curr_cor;
 
 	assertf( src->last != 0,
@@ -99,5 +99,5 @@
 	// will also migrate which means this value will
 	// stay in syn with the TLS
-	coroutine_desc * src = TL_GET( this_coroutine );
+	coroutine_desc * src = TL_GET( this_thread )->curr_cor;
 	coroutine_desc * dst = get_coroutine(cor);
 
@@ -129,5 +129,5 @@
 	// will also migrate which means this value will
 	// stay in syn with the TLS
-	coroutine_desc * src = TL_GET( this_coroutine );
+	coroutine_desc * src = TL_GET( this_thread )->curr_cor;
 
 	// not resuming self ?
@@ -146,4 +146,70 @@
 }
 
+
+
+// static inline bool suspend_checkpoint(void) {
+// 	// optimization : read TLS once and reuse it
+// 	// Safety note: this is preemption safe since if
+// 	// preemption occurs after this line, the pointer
+// 	// will also migrate which means this value will
+// 	// stay in sync with the TLS
+// 	// set state of current coroutine to inactive
+//       this->state = Checkpoint;
+
+//       // context switch to specified coroutine
+//       assert( src->stack.context );
+
+//       CtxStore(src->stack.context);
+
+// 	bool ret = this->state == Checkpoint;
+
+//       // set state of new coroutine to active
+//       src->state = Active;
+
+//       enable_interrupts( __cfaabi_dbg_ctx );
+//       // Safety note : This could cause some false positives due to preemption
+//       verify( TL_GET( preemption_state.enabled ) || TL_GET( this_processor )->do_terminate );
+
+//       if( unlikely(src->cancellation != NULL) ) {
+//             _CtxCoroutine_Unwind(src->cancellation);
+//       }
+
+// 	return ret;
+// }
+
+// static inline void suspend_return(void) {
+// 	// optimization : read TLS once and reuse it
+// 	// Safety note: this is preemption safe since if
+// 	// preemption occurs after this line, the pointer
+// 	// will also migrate which means this value will
+// 	// stay in syn with the TLS
+// 	coroutine_desc * src = TL_GET( this_thread )->curr_cor;
+
+// 	assertf( src->last != 0,
+// 		"Attempt to suspend coroutine \"%.256s\" (%p) that has never been resumed.\n"
+// 		"Possible cause is a suspend executed in a member called by a coroutine user rather than by the coroutine main.",
+// 		src->name, src );
+// 	assertf( src->last->state != Halted,
+// 		"Attempt by coroutine \"%.256s\" (%p) to suspend back to terminated coroutine \"%.256s\" (%p).\n"
+// 		"Possible cause is terminated coroutine's main routine has already returned.",
+// 		src->name, src, src->last->name, src->last );
+
+// 	// Safety note : Preemption must be disabled here since kernelTLS.this_coroutine must always be up to date
+//       verify( TL_GET( preemption_state.enabled ) || TL_GET( this_processor )->do_terminate );
+//       disable_interrupts();
+
+//       // set state of current coroutine to inactive
+//       src->state = src->state == Halted ? Halted : Inactive;
+
+//       // set new coroutine that task is executing
+//       kernelTLS.this_coroutine = dst;
+
+//       // context switch to specified coroutine
+//       assert( src->stack.context );
+// 	CtxRet( src->stack.context );
+
+// 	abort();
+// }
+
 // Local Variables: //
 // mode: c //
Index: libcfa/src/concurrency/invoke.c
===================================================================
--- libcfa/src/concurrency/invoke.c	(revision 76d3ca627c3ea893099f6a38248fd639209fdfe5)
+++ libcfa/src/concurrency/invoke.c	(revision 342136ab8790110e02e6008ab5bf329dcd6d9aa1)
@@ -28,6 +28,6 @@
 
 extern void __suspend_internal(void);
-extern void __leave_coroutine(void);
-extern void __finish_creation(void);
+extern void __leave_coroutine( struct coroutine_desc * );
+extern void __finish_creation( struct coroutine_desc * );
 extern void __leave_thread_monitor( struct thread_desc * this );
 extern void disable_interrupts();
@@ -52,5 +52,5 @@
 
 	//Final suspend, should never return
-	__leave_coroutine();
+	__leave_coroutine( cor );
 	__cabi_abort( "Resumed dead coroutine" );
 }
@@ -62,10 +62,10 @@
 	__attribute((__unused__)) struct _Unwind_Exception * unwind_exception,
 	__attribute((__unused__)) struct _Unwind_Context * context,
-	__attribute((__unused__)) void * param
+	void * param
 ) {
 	if( actions & _UA_END_OF_STACK  ) {
 		// We finished unwinding the coroutine,
 		// leave it
-		__leave_coroutine();
+		__leave_coroutine( param );
 		__cabi_abort( "Resumed dead coroutine" );
 	}
@@ -75,7 +75,7 @@
 }
 
-void _CtxCoroutine_Unwind(struct _Unwind_Exception * storage) __attribute__ ((__noreturn__));
-void _CtxCoroutine_Unwind(struct _Unwind_Exception * storage) {
-	_Unwind_Reason_Code ret = _Unwind_ForcedUnwind( storage, _CtxCoroutine_UnwindStop, NULL );
+void _CtxCoroutine_Unwind(struct _Unwind_Exception * storage, struct coroutine_desc * cor) __attribute__ ((__noreturn__));
+void _CtxCoroutine_Unwind(struct _Unwind_Exception * storage, struct coroutine_desc * cor) {
+	_Unwind_Reason_Code ret = _Unwind_ForcedUnwind( storage, _CtxCoroutine_UnwindStop, cor );
 	printf("UNWIND ERROR %d after force unwind\n", ret);
 	abort();
@@ -88,10 +88,12 @@
 	void *this
 ) {
+	// Fetch the thread handle from the user defined thread structure
+	struct thread_desc* thrd = get_thread( this );
+
 	// First suspend, once the thread arrives here,
 	// the function pointer to main can be invalidated without risk
-	__finish_creation();
+	__finish_creation(&thrd->self_cor);
 
-	// Fetch the thread handle from the user defined thread structure
-	struct thread_desc* thrd = get_thread( this );
+	// Restore last to NULL; we clobbered it because of the thunk problem
 	thrd->self_cor.last = NULL;
 
Index: libcfa/src/concurrency/invoke.h
===================================================================
--- libcfa/src/concurrency/invoke.h	(revision 76d3ca627c3ea893099f6a38248fd639209fdfe5)
+++ libcfa/src/concurrency/invoke.h	(revision 342136ab8790110e02e6008ab5bf329dcd6d9aa1)
@@ -50,5 +50,4 @@
 
 		extern thread_local struct KernelThreadData {
-			struct coroutine_desc * volatile this_coroutine;
 			struct thread_desc    * volatile this_thread;
 			struct processor      * volatile this_processor;
@@ -61,8 +60,4 @@
 		} kernelTLS __attribute__ ((tls_model ( "initial-exec" )));
 	}
-
-	static inline struct coroutine_desc * volatile active_coroutine() { return TL_GET( this_coroutine ); }
-	static inline struct thread_desc    * volatile active_thread   () { return TL_GET( this_thread    ); }
-	static inline struct processor      * volatile active_processor() { return TL_GET( this_processor ); } // UNSAFE
 	#endif
 
@@ -170,8 +165,12 @@
 			struct thread_desc * prev;
 		} node;
-     };
-
-     #ifdef __cforall
-     extern "Cforall" {
+	};
+
+	#ifdef __cforall
+	extern "Cforall" {
+		static inline struct coroutine_desc * volatile active_coroutine() { return TL_GET( this_thread )->curr_cor; }
+		static inline struct thread_desc    * volatile active_thread   () { return TL_GET( this_thread    ); }
+		static inline struct processor      * volatile active_processor() { return TL_GET( this_processor ); } // UNSAFE
+
 		static inline thread_desc * & get_next( thread_desc & this ) {
 			return this.next;
@@ -232,4 +231,6 @@
 	extern void CtxInvokeStub( void );
 	void CtxSwitch( void * from, void * to ) asm ("CtxSwitch");
+	// void CtxStore ( void * this ) asm ("CtxStore");
+	// void CtxRet   ( void * dst  ) asm ("CtxRet");
 
 	#if   defined( __i386 )
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 76d3ca627c3ea893099f6a38248fd639209fdfe5)
+++ libcfa/src/concurrency/kernel.cfa	(revision 342136ab8790110e02e6008ab5bf329dcd6d9aa1)
@@ -60,5 +60,4 @@
 	NULL,
 	NULL,
-	NULL,
 	{ 1, false, false }
 };
@@ -263,5 +262,5 @@
 static void returnToKernel() {
 	coroutine_desc * proc_cor = get_coroutine(kernelTLS.this_processor->runner);
-	coroutine_desc * thrd_cor = kernelTLS.this_thread->curr_cor = kernelTLS.this_coroutine;
+	coroutine_desc * thrd_cor = kernelTLS.this_thread->curr_cor;
 	ThreadCtxSwitch(thrd_cor, proc_cor);
 }
@@ -307,5 +306,4 @@
 	processor * proc = (processor *) arg;
 	kernelTLS.this_processor = proc;
-	kernelTLS.this_coroutine = NULL;
 	kernelTLS.this_thread    = NULL;
 	kernelTLS.preemption_state.[enabled, disable_count] = [false, 1];
@@ -321,5 +319,4 @@
 
 	//Set global state
-	kernelTLS.this_coroutine = get_coroutine(proc->runner);
 	kernelTLS.this_thread    = NULL;
 
@@ -351,5 +348,5 @@
 // KERNEL_ONLY
 void kernel_first_resume(processor * this) {
-	coroutine_desc * src = kernelTLS.this_coroutine;
+	coroutine_desc * src = mainThread->curr_cor;
 	coroutine_desc * dst = get_coroutine(this->runner);
 
@@ -366,7 +363,4 @@
 	// set state of current coroutine to inactive
 	src->state = src->state == Halted ? Halted : Inactive;
-
-	// set new coroutine that task is executing
-	kernelTLS.this_coroutine = dst;
 
 	// SKULLDUGGERY normally interrupts are enable before leaving a coroutine ctxswitch.
@@ -599,5 +593,4 @@
 	kernelTLS.this_processor = mainProcessor;
 	kernelTLS.this_thread    = mainThread;
-	kernelTLS.this_coroutine = &mainThread->self_cor;
 
 	// Enable preemption
@@ -720,6 +713,6 @@
 		__cfaabi_dbg_bits_write( abort_text, len );
 
-		if ( get_coroutine(thrd) != kernelTLS.this_coroutine ) {
-			len = snprintf( abort_text, abort_text_size, " in coroutine %.256s (%p).\n", kernelTLS.this_coroutine->name, kernelTLS.this_coroutine );
+		if ( &thrd->self_cor != thrd->curr_cor ) {
+			len = snprintf( abort_text, abort_text_size, " in coroutine %.256s (%p).\n", thrd->curr_cor->name, thrd->curr_cor );
 			__cfaabi_dbg_bits_write( abort_text, len );
 		}
Index: libcfa/src/concurrency/thread.cfa
===================================================================
--- libcfa/src/concurrency/thread.cfa	(revision 76d3ca627c3ea893099f6a38248fd639209fdfe5)
+++ libcfa/src/concurrency/thread.cfa	(revision 342136ab8790110e02e6008ab5bf329dcd6d9aa1)
@@ -75,5 +75,5 @@
 	coroutine_desc* thrd_c = get_coroutine(this);
 	thread_desc   * thrd_h = get_thread   (this);
-	thrd_c->last = TL_GET( this_coroutine );
+	thrd_c->last = TL_GET( this_thread )->curr_cor;
 
 	// __cfaabi_dbg_print_safe("Thread start : %p (t %p, c %p)\n", this, thrd_c, thrd_h);
@@ -81,5 +81,4 @@
 	disable_interrupts();
 	create_stack(&thrd_c->stack, thrd_c->stack.size);
-	kernelTLS.this_coroutine = thrd_c;
 	CtxStart(&this, CtxInvokeThread);
 	assert( thrd_c->last->stack.context );
@@ -92,6 +91,5 @@
 extern "C" {
 	// KERNEL ONLY
-	void __finish_creation(void) {
-		coroutine_desc* thrd_c = kernelTLS.this_coroutine;
+	void __finish_creation(coroutine_desc * thrd_c) {
 		ThreadCtxSwitch( thrd_c, thrd_c->last );
 	}
@@ -120,8 +118,6 @@
 	// set new coroutine that the processor is executing
 	// and context switch to it
-	kernelTLS.this_coroutine = dst;
 	assert( src->stack.context );
 	CtxSwitch( src->stack.context, dst->stack.context );
-	kernelTLS.this_coroutine = src;
 
 	// set state of new coroutine to active
