Index: src/libcfa/concurrency/invoke.c
===================================================================
--- src/libcfa/concurrency/invoke.c	(revision c5a8c5bf2ad8fe91b68614a3c832241d460c3571)
+++ src/libcfa/concurrency/invoke.c	(revision 596f987b3b97b71c5ffd2f87a1b6f90e82761547)
@@ -13,5 +13,5 @@
 // Called from the kernel when starting a coroutine or task so must switch back to user mode.
 
-extern void __suspend__F___1(void);
+extern void __suspend_no_inline__F___1(void);
 
 void CtxInvokeCoroutine(
@@ -25,5 +25,5 @@
 
       if(cor->state == Primed) {
-            __suspend__F___1();
+            __suspend_no_inline__F___1();
       }
 
Index: src/libcfa/concurrency/invoke.h
===================================================================
--- src/libcfa/concurrency/invoke.h	(revision c5a8c5bf2ad8fe91b68614a3c832241d460c3571)
+++ src/libcfa/concurrency/invoke.h	(revision 596f987b3b97b71c5ffd2f87a1b6f90e82761547)
@@ -10,5 +10,7 @@
 #define _INVOKE_H_
 
-     struct coStack_t {
+      #define unlikely(x)    __builtin_expect(!!(x), 0)	// branch hint: x is expected to be false
+
+      struct coStack_t {
             unsigned int size;		// size of stack
             void *storage;			// pointer to stack
@@ -38,5 +40,5 @@
 #define _INVOKE_PRIVATE_H_
       
-     struct machine_context_t {
+      struct machine_context_t {
             void *SP;
             void *FP;
Index: src/libcfa/concurrency/threads
===================================================================
--- src/libcfa/concurrency/threads	(revision c5a8c5bf2ad8fe91b68614a3c832241d460c3571)
+++ src/libcfa/concurrency/threads	(revision 596f987b3b97b71c5ffd2f87a1b6f90e82761547)
@@ -18,10 +18,11 @@
 #define __THREADS_H__
 
+#include "assert"       // assertf is used by the inline suspend/resume below
 #include "invoke.h"
 
-void ?{}(coStack_t* this);
-
-void ?{}(coroutine* this);
-
+//-----------------------------------------------------------------------------
+// Coroutine trait
+// Anything that implements this trait can be resumed.
+// Anything that is resumed is a coroutine.
 trait is_coroutine(dtype T) {
       void co_main(T* this);
@@ -29,11 +30,84 @@
 };
 
-void suspend(void);
+//-----------------------------------------------------------------------------
+// Ctors and dtors
+void ?{}(coStack_t* this);
+void ?{}(coroutine* this);
+void ^?{}(coStack_t* this);
+void ^?{}(coroutine* this);
+
+//-----------------------------------------------------------------------------
+// Public coroutine API
+static inline void suspend();
 
 forall(dtype T | is_coroutine(T))
-void resume(T* cor);
+static inline void resume(T* cor);
 
 forall(dtype T | is_coroutine(T))
 void prime(T* cor);
+
+//-----------------------------------------------------------------------------
+// PRIVATE exposed because of inline
+
+// Start coroutine routines
+extern "C" {
+      forall(dtype T | is_coroutine(T))
+      void CtxInvokeCoroutine(T* this);
+
+      forall(dtype T | is_coroutine(T))
+      void CtxStart(T* this, void (*invoke)(T*));
+}
+
+// Get current coroutine (inline accessor for the current_coroutine global)
+extern coroutine* current_coroutine; //PRIVATE, never use directly; call this_coroutine() instead
+static inline coroutine* this_coroutine(void) {
+	return current_coroutine;
+}
+
+// Private wrappers for context switch and stack creation
+extern void corCxtSw(coroutine* src, coroutine* dst);
+extern void create_stack( coStack_t* this, unsigned int storageSize );
+
+// Suspend: context switch from the current coroutine back to its last resumer (src->last); inlined for performance
+static inline void suspend() {
+      coroutine* src = this_coroutine();		// optimization: look up the current coroutine once
+
+	assertf( src->last != 0, 
+		"Attempt to suspend coroutine %.256s (%p) that has never been resumed.\n"
+		"Possible cause is a suspend executed in a member called by a coroutine user rather than by the coroutine main.",
+		src->name, src );
+	assertf( src->last->notHalted, 
+		"Attempt by coroutine %.256s (%p) to suspend back to terminated coroutine %.256s (%p).\n"
+		"Possible cause is terminated coroutine's main routine has already returned.",
+		src->name, src, src->last->name, src->last );
+
+	corCxtSw( src, src->last );
+}
+
+// Resume: context switch from the current coroutine to the coroutine of cor; inlined for performance
+forall(dtype T | is_coroutine(T))
+static inline void resume(T* cor) {
+	coroutine* src = this_coroutine();		// optimization: look up the current coroutine once
+	coroutine* dst = get_coroutine(cor);
+
+      if( unlikely(!dst->stack.base) ) {	// stack.base still null: create the stack, then call CtxStart
+		create_stack(&dst->stack, dst->stack.size);
+		CtxStart(cor, CtxInvokeCoroutine);
+	}
+
+      // not resuming self ? then check the target has not halted and record the resumer
+	if ( src != dst ) {
+		assertf( dst->notHalted , 
+			"Attempt by coroutine %.256s (%p) to resume terminated coroutine %.256s (%p).\n"
+			"Possible cause is terminated coroutine's main routine has already returned.",
+			src->name, src, dst->name, dst );
+
+            // set last resumer (suspend switches back to src->last)
+		dst->last = src;
+	} // if
+
+      // always done, even when src == dst, for performance testing
+	corCxtSw( src, dst );
+}
 
 #endif //__THREADS_H__
Index: src/libcfa/concurrency/threads.c
===================================================================
--- src/libcfa/concurrency/threads.c	(revision c5a8c5bf2ad8fe91b68614a3c832241d460c3571)
+++ src/libcfa/concurrency/threads.c	(revision 596f987b3b97b71c5ffd2f87a1b6f90e82761547)
@@ -24,5 +24,4 @@
 
 #include "threads"
-#include "assert"
 #include "libhdr.h"
 
@@ -30,29 +29,28 @@
 #include "invoke.h"
 
+//-----------------------------------------------------------------------------
+// Global state variables
+
 // minimum feasible stack size in bytes
 #define MinStackSize 1000
-static size_t pageSize = 0;				// architecture pagesize
+static size_t pageSize = 0;				// architecture pagesize HACK, should go in proper runtime singleton
 
+//Extra private constructors used for the main coroutine
+//FIXME the main should not actually allocate a stack
+//Since the main is never resumed, the extra stack does not cause
+//any problem, but it is wasted memory
 void ?{}(coStack_t* this, size_t size);
 void ?{}(coroutine* this, size_t size);
 
-static coroutine main_coroutine = { 1000 };
-static coroutine* current_coroutine = &main_coroutine;
+//Main coroutine
+//FIXME do not construct a stack for the main
+coroutine main_coroutine = { 1000 };
 
-coroutine* this_coroutine(void) {
-	return current_coroutine;
-}
+//Current coroutine
+//Will need to be in TLS when multi-threading is added
+coroutine* current_coroutine = &main_coroutine;
 
-void corCxtSw(coroutine* src, coroutine* dst);
-void create_stack( coStack_t* this, unsigned int storageSize );	// used by all constructors
-
-extern "C" {
-      forall(dtype T | is_coroutine(T))
-      void CtxInvokeCoroutine(T* this);
-
-      forall(dtype T | is_coroutine(T))
-      void CtxStart(T* this, void (*invoke)(T*));
-}
-
+//-----------------------------------------------------------------------------
+// Coroutine ctors and dtors
 void ?{}(coStack_t* this) {
 	this->size		= 10240;	// size of stack
@@ -72,6 +70,5 @@
 }
 
-void ?{}(coroutine* this)
-{
+void ?{}(coroutine* this) {
 	this->name = "Anonymous Coroutine";
 	this->errno_ = 0;
@@ -82,45 +79,24 @@
 }
 
-void ?{}(coroutine* this, size_t size)
-{
+void ?{}(coroutine* this, size_t size) {
 	this{};
 	(&this->stack){size};
 }
 
-void suspend() {
-      coroutine* src = this_coroutine();		// optimization
-
-	assertf( src->last != 0, 
-		"Attempt to suspend coroutine %.256s (%p) that has never been resumed.\n"
-		"Possible cause is a suspend executed in a member called by a coroutine user rather than by the coroutine main.",
-		src->name, src );
-	assertf( src->last->notHalted, 
-		"Attempt by coroutine %.256s (%p) to suspend back to terminated coroutine %.256s (%p).\n"
-		"Possible cause is terminated coroutine's main routine has already returned.",
-		src->name, src, src->last->name, src->last );
-
-	corCxtSw( src, src->last );
+void ^?{}(coStack_t* this) {	// coStack_t destructor
+	if ( ! this->userStack ) {	// storage is freed only when userStack is not set
+		LIB_DEBUG_DO(	// NOTE(review): presumably compiled in debug builds only - confirm
+			if ( mprotect( this->storage, pageSize, PROT_READ | PROT_WRITE ) == -1 ) {	// restore read/write on the first pageSize bytes before freeing
+				abortf( "(coStack_t *)%p.^?{}() : internal error, mprotect failure, error(%d) %s.", this, errno, strerror( errno ) );
+			}
+		);
+		free( this->storage );
+	}
 }
 
-forall(dtype T | is_coroutine(T))
-void resume(T* cor) {
-	coroutine* src = this_coroutine();		// optimization
-	coroutine* dst = get_coroutine(cor);
+void ^?{}(coroutine* this) {}	// coroutine destructor: empty body
 
-	if( dst->stack.base == NULL ) {
-		create_stack(&dst->stack, dst->stack.size);
-		CtxStart(cor, CtxInvokeCoroutine);
-	}
-
-	if ( src != dst ) {				// not resuming self ?
-		assertf( dst->notHalted , 
-			"Attempt by coroutine %.256s (%p) to resume terminated coroutine %.256s (%p).\n"
-			"Possible cause is terminated coroutine's main routine has already returned.",
-			src->name, src, dst->name, dst );
-		dst->last = src;					// set last resumer
-	} // if
-	corCxtSw( src, dst );				// always done for performance testing
-}
-
+// Part of the Public API
+// Not inline since only ever called once per coroutine
 forall(dtype T | is_coroutine(T))
 void prime(T* cor) {
@@ -130,4 +106,10 @@
 	this->state = Primed;
 	resume(cor);
+}
+
+// invoke.c is plain C and cannot inline Cforall code, so suspend is exposed to it through
+// this non-inline wrapper (invoke.c declares it by its mangled name __suspend_no_inline__F___1)
+void suspend_no_inline(void) {
+	suspend();
 }
 
@@ -151,5 +133,4 @@
 } //ctxSwitchDirect
 
-// used by all constructors
 void create_stack( coStack_t* this, unsigned int storageSize ) {
 	//TEMP HACK do this on proper kernel startup
