Index: src/libcfa/concurrency/kernel
===================================================================
--- src/libcfa/concurrency/kernel	(revision 84e25234b398f83aaaa7e9dd13bcf88005f65b63)
+++ src/libcfa/concurrency/kernel	(revision 8d4f7fe847eef8e0d54d3b2fd88d35f0d28ef095)
@@ -30,5 +30,5 @@
 struct cluster {
 	simple_thread_list ready_queue;
-	pthread_spinlock_t lock;
+	// pthread_spinlock_t lock;	// disabled: kernel.c now guards ready_queue with its file-scope spin lock instead
 };
 
@@ -38,6 +38,11 @@
 //-----------------------------------------------------------------------------
 // Processor
+enum ProcessorAction { // what scheduleInternal does with a thread once control is back in the processor
+	Reschedule,	// hand the thread to thread_schedule() again
+	NoAction	// do nothing further with the thread
+};
+
 struct processor {
-	struct processorCtx_t * ctx;
+	struct processorCtx_t * runner;
 	cluster * cltr;
 	coroutine * current_coroutine;
@@ -46,4 +51,5 @@
 	simple_lock lock;
 	volatile bool terminated;
+	ProcessorAction thread_action;
 };
 
Index: src/libcfa/concurrency/kernel.c
===================================================================
--- src/libcfa/concurrency/kernel.c	(revision 84e25234b398f83aaaa7e9dd13bcf88005f65b63)
+++ src/libcfa/concurrency/kernel.c	(revision 8d4f7fe847eef8e0d54d3b2fd88d35f0d28ef095)
@@ -25,4 +25,5 @@
 #include <stddef.h>
 extern "C" {
+#include <fenv.h>
 #include <sys/resource.h>
 }
@@ -35,4 +36,16 @@
 #define __CFA_INVOKE_PRIVATE__
 #include "invoke.h"
+
+static volatile int lock; // 0 = free, 1 = held; one lock shared by the ready-queue operations and simple_lock's blocked list in this file
+
+void spin_lock( volatile int *lock ) {
+	// Busy-wait: peek with a plain read, and only attempt the atomic exchange once the lock looks free (0).
+	while ( *lock != 0 || __sync_lock_test_and_set_4( lock, 1 ) != 0 ) {
+	}
+}
+
+void spin_unlock( volatile int *lock ) { // counterpart of spin_lock for the same lock word
+	__sync_lock_release_4( lock ); // presumably the 4-byte form of GCC's __sync_lock_release (stores 0, i.e. "free" as tested by spin_lock) — confirm
+}
 
 //-----------------------------------------------------------------------------
@@ -127,5 +140,5 @@
 	(&this->c){};
 	this->proc = proc;
-	proc->ctx = this;
+	proc->runner = this;
 }
 
@@ -133,5 +146,5 @@
 	(&this->c){ info };
 	this->proc = proc;
-	proc->ctx = this;
+	proc->runner = this;
 }
 
@@ -152,5 +165,5 @@
 }
 
-void ?{}(processor * this, cluster * cltr, processorCtx_t * ctx) {
+void ?{}(processor * this, cluster * cltr, processorCtx_t * runner) {
 	this->cltr = cltr;
 	this->current_coroutine = NULL;
@@ -159,7 +172,7 @@
 	this->terminated = false;
 
-	this->ctx = ctx;
-	LIB_DEBUG_PRINTF("Kernel : constructing processor context %p\n", ctx);
-	ctx{ this };
+	this->runner = runner;
+	LIB_DEBUG_PRINTF("Kernel : constructing processor context %p\n", runner);
+	runner{ this };
 }
 
@@ -174,21 +187,27 @@
 void ?{}(cluster * this) {
 	( &this->ready_queue ){};
-	pthread_spin_init( &this->lock, PTHREAD_PROCESS_PRIVATE );
+	lock = 0; // NOTE(review): this resets the file-scope lock shared by every cluster, not a per-cluster one — looks safe only if no processor can hold it yet; confirm
 }
 
 void ^?{}(cluster * this) {
-	pthread_spin_destroy( &this->lock );
+	// Nothing to tear down: the pthread spin lock that used to be destroyed here is no longer created by the constructor.
 }
 
 //-----------------------------------------------------------------------------
 // Processor running routines
-void main(processorCtx_t * ctx);
+void main(processorCtx_t *);
 thread * nextThread(cluster * this);
 void scheduleInternal(processor * this, thread * dst);
 void spin(processor * this, unsigned int * spin_count);
-
-void main(processorCtx_t * ctx) {
-	processor * this = ctx->proc;
+void thread_schedule( thread * thrd );
+
+// Main routine run by each processor's context (runner) coroutine
+void main(processorCtx_t * runner) {
+	processor * this = runner->proc;
 	LIB_DEBUG_PRINTF("Kernel : core %p starting\n", this);
+
+	fenv_t envp;
+	fegetenv( &envp );
+	LIB_DEBUG_PRINTF("Kernel : mxcsr %x\n", envp.__mxcsr);
 
 	thread * readyThread = NULL;
@@ -216,4 +235,6 @@
 // from the processor coroutine to the target thread 
 void scheduleInternal(processor * this, thread * dst) {
+	this->thread_action = NoAction;
+
 	// coroutine * proc_ctx = get_coroutine(this->ctx);
 	// coroutine * thrd_ctx = get_coroutine(dst);
@@ -226,5 +247,5 @@
 	// // when ThreadCtxSwitch returns we are back in the processor coroutine
 
-	coroutine * proc_ctx = get_coroutine(this->ctx);
+	coroutine * proc_ctx = get_coroutine(this->runner);
 	coroutine * thrd_ctx = get_coroutine(dst);
       thrd_ctx->last = proc_ctx;
@@ -232,12 +253,15 @@
       // context switch to specified coroutine
       // Which is now the current_coroutine
-      LIB_DEBUG_PRINTF("Kernel : switching to ctx %p (from %p, current %p)\n", thrd_ctx, proc_ctx, this->current_coroutine);
+      // LIB_DEBUG_PRINTF("Kernel : switching to ctx %p (from %p, current %p)\n", thrd_ctx, proc_ctx, this->current_coroutine);
       this->current_thread = dst;
       this->current_coroutine = thrd_ctx;
       CtxSwitch( proc_ctx->stack.context, thrd_ctx->stack.context );
       this->current_coroutine = proc_ctx;
-      LIB_DEBUG_PRINTF("Kernel : returned from ctx %p (to %p, current %p)\n", thrd_ctx, proc_ctx, this->current_coroutine);
+      // LIB_DEBUG_PRINTF("Kernel : returned from ctx %p (to %p, current %p)\n", thrd_ctx, proc_ctx, this->current_coroutine);
  
       // when CtxSwitch returns we are back in the processor coroutine
+	if(this->thread_action == Reschedule) {
+		thread_schedule( dst );
+	}
 }
 
@@ -262,10 +286,12 @@
 	processorCtx_t proc_cor_storage = { proc, &info };
 
+	LIB_DEBUG_PRINTF("Coroutine : created stack %p\n", proc_cor_storage.c.stack.base);
+
 	//Set global state
-	proc->current_coroutine = &proc->ctx->c;
+	proc->current_coroutine = &proc->runner->c;
 	proc->current_thread = NULL;
 
 	//We now have a proper context from which to schedule threads
-	LIB_DEBUG_PRINTF("Kernel : core %p created (%p)\n", proc, proc->ctx);
+	LIB_DEBUG_PRINTF("Kernel : core %p created (%p, %p)\n", proc, proc->runner, &ctx);
 
 	// SKULLDUGGERY: Since the coroutine doesn't have its own stack, we can't 
@@ -279,5 +305,5 @@
 
 	// Main routine of the core returned, the core is now fully terminated
-	LIB_DEBUG_PRINTF("Kernel : core %p main ended (%p)\n", proc, proc->ctx);	
+	LIB_DEBUG_PRINTF("Kernel : core %p main ended (%p)\n", proc, proc->runner);	
 
 	return NULL;
@@ -287,10 +313,10 @@
 	LIB_DEBUG_PRINTF("Kernel : Starting core %p\n", this);
 	
-	pthread_attr_t attributes;
-	pthread_attr_init( &attributes );
-
-	pthread_create( &this->kernel_thread, &attributes, CtxInvokeProcessor, (void*)this );
-
-	pthread_attr_destroy( &attributes );
+	// pthread_attr_t attributes;
+	// pthread_attr_init( &attributes );
+
+	pthread_create( &this->kernel_thread, NULL, CtxInvokeProcessor, (void*)this );
+
+	// pthread_attr_destroy( &attributes );
 
 	LIB_DEBUG_PRINTF("Kernel : core %p started\n", this);	
@@ -302,11 +328,14 @@
 	assertf( thrd->next == NULL, "Expected null got %p", thrd->next );
 	
-	pthread_spinlock_guard guard = { &systemProcessor->cltr->lock };
+	spin_lock( &lock );
 	append( &systemProcessor->cltr->ready_queue, thrd );
+	spin_unlock( &lock );
 }
 
 thread * nextThread(cluster * this) {
-	pthread_spinlock_guard guard = { &this->lock };
-	return pop_head( &this->ready_queue );
+	spin_lock( &lock ); // file-scope lock, not a per-cluster member
+	thread * head = pop_head( &this->ready_queue ); // dequeue while the lock is held
+	spin_unlock( &lock ); // explicit release: no scope guard is used here, so unlock before returning
+	return head;
 }
 
@@ -314,20 +343,11 @@
 // Kernel boot procedures
 void kernel_startup(void) {
-
+	LIB_DEBUG_PRINTF("Kernel : Starting\n");	
+
+	// Start by initializing the main thread
 	// SKULLDUGGERY: the mainThread steals the process main thread 
 	// which will then be scheduled by the systemProcessor normally
-	LIB_DEBUG_PRINTF("Kernel : Starting\n");	
-
+	mainThread = (thread *)&mainThread_storage;
 	current_stack_info_t info;
-
-	// LIB_DEBUG_PRINTF("Kernel : core    base : %p \n", info.base );
-	// LIB_DEBUG_PRINTF("Kernel : core storage : %p \n", info.storage );
-	// LIB_DEBUG_PRINTF("Kernel : core    size : %x \n", info.size );
-	// LIB_DEBUG_PRINTF("Kernel : core   limit : %p \n", info.limit );
-	// LIB_DEBUG_PRINTF("Kernel : core context : %p \n", info.context );
-	// LIB_DEBUG_PRINTF("Kernel : core     top : %p \n", info.top );
-
-	// Start by initializing the main thread
-	mainThread = (thread *)&mainThread_storage;
 	mainThread{ &info };
 
@@ -353,12 +373,9 @@
 	// context. Hence, the main thread does not begin through CtxInvokeThread, like all other threads. The trick here is that
 	// mainThread is on the ready queue when this call is made. 
-	resume(systemProcessor->ctx);
+	resume(systemProcessor->runner);
 
 
 
 	// THE SYSTEM IS NOW COMPLETELY RUNNING
-
-
-
 	LIB_DEBUG_PRINTF("Kernel : Started\n--------------------------------------------------\n\n");
 }
@@ -377,5 +394,5 @@
 	// Destroy the system processor and its context in reverse order of construction
 	// These were manually constructed so we need manually destroy them
-	^(systemProcessor->ctx){};
+	^(systemProcessor->runner){};
 	^(systemProcessor){};
 
@@ -399,6 +416,7 @@
 void lock( simple_lock * this ) {
 	{
-		pthread_spinlock_guard guard = { &systemCluster->lock };  	//HUGE TEMP HACK which only works if we have a single cluster and is stupid
+		spin_lock( &lock );
 		append( &this->blocked, this_thread() );
+		spin_unlock( &lock );
 	}
 	suspend();
Index: src/libcfa/concurrency/threads.c
===================================================================
--- src/libcfa/concurrency/threads.c	(revision 84e25234b398f83aaaa7e9dd13bcf88005f65b63)
+++ src/libcfa/concurrency/threads.c	(revision 8d4f7fe847eef8e0d54d3b2fd88d35f0d28ef095)
@@ -24,4 +24,5 @@
 
 extern "C" {
+	#include <fenv.h>
 	#include <stddef.h>
 }
@@ -91,4 +92,7 @@
 	CtxSwitch( thrd_c->last->stack.context, thrd_c->stack.context );
 
+	fenv_t envp;
+	fegetenv( &envp );
+	LIB_DEBUG_PRINTF("Thread : mxcsr %x\n", envp.__mxcsr);
 	LIB_DEBUG_PRINTF("Thread started : %p (t %p, c %p)\n", this, thrd_c, thrd_h);
 
@@ -105,5 +109,5 @@
 
 void yield( void ) {
-	thread_schedule( this_thread() );
+	get_this_processor()->thread_action = Reschedule; // only record the request; scheduleInternal calls thread_schedule() for this thread after CtxSwitch returns to the processor
 	suspend();
 }
