Index: doc/proposals/concurrency/thePlan.md
===================================================================
--- doc/proposals/concurrency/thePlan.md	(revision dcb42b8657f59146a4e13e740761ef8af2db06e3)
+++ doc/proposals/concurrency/thePlan.md	(revision bd98b588c3ea1cfcbdb262cfe07a29ade49b0de4)
@@ -1,16 +1,17 @@
 _Phase 1_ : Prototype
-Threads and Processors.
-Main needs to call process
+done - Threads.
+done - Main thread is a cfa thread.
+done - SimpleBlockingLock.
+done - Synchronisation points in thread destructors.
+Processors & SpinLock.
 
 _Phase 2_ : Minimum Viable Product
-Main thread is a cfa thread
-Basic monitors for synchronisation and minimal lock support.
-No internal/external scheduling.
-Synchronisation points in thread destructors.
+Basic monitors for synchronisation (No internal/external scheduling).
+Non-thrash scheduler.
+Clusters.
 
 _Phase 3_ : Kernel features
 Threads features ex: detach
 Internal scheduling
-Clusters
 
 _Phase 4_ : Monitor features
Index: src/examples/thread.c
===================================================================
--- src/examples/thread.c	(revision dcb42b8657f59146a4e13e740761ef8af2db06e3)
+++ src/examples/thread.c	(revision bd98b588c3ea1cfcbdb262cfe07a29ade49b0de4)
@@ -10,5 +10,5 @@
 };
 
-DECL_THREAD(MyThread)
+// DECL_THREAD(MyThread)
 
 void ?{}( MyThread * this, unsigned id, unsigned count ) {
@@ -17,29 +17,29 @@
 }
 
-void main(MyThread* this) {
-	sout | "Thread" | this->id | " : Suspending" | this->count | "times" | endl;
-	suspend();
+// void main(MyThread* this) {
+// 	sout | "Thread" | this->id | " : Suspending" | this->count | "times" | endl;
+// 	yield();
 
-	for(int i = 0; i < this->count; i++) {
-		sout | "Thread" | this->id | " : Suspend No." | i + 1 | endl;
-		suspend();
-	}
-}
+// 	for(int i = 0; i < this->count; i++) {
+// 		sout | "Thread" | this->id | " : Suspend No." | i + 1 | endl;
+// 		yield();
+// 	}
+// }
 
 int main(int argc, char* argv[]) {
 
-	unsigned itterations = 10u;
-	if(argc == 2) { 
-		int val = ato(argv[1]);
-		assert(val >= 0);
-		itterations = val;
-	}
+	// unsigned itterations = 10u;
+	// if(argc == 2) { 
+	// 	int val = ato(argv[1]);
+	// 	assert(val >= 0);
+	// 	itterations = val;
+	// }
 
 	sout | "User main begin" | endl;
 
-	{
-		thread(MyThread) thread1 = { 1u, itterations };
-		thread(MyThread) thread2 = { 2u, itterations };
-	}
+	// {
+	// 	thread(MyThread) thread1 = { 1u, itterations };
+	// 	thread(MyThread) thread2 = { 2u, itterations };
+	// }
 
 	sout | "User main end" | endl;
Index: src/libcfa/concurrency/coroutines
===================================================================
--- src/libcfa/concurrency/coroutines	(revision dcb42b8657f59146a4e13e740761ef8af2db06e3)
+++ src/libcfa/concurrency/coroutines	(revision bd98b588c3ea1cfcbdb262cfe07a29ade49b0de4)
@@ -62,8 +62,5 @@
 
 // Get current coroutine
-extern coroutine * current_coroutine; //PRIVATE, never use directly
-static inline coroutine * this_coroutine(void) {
-	return current_coroutine;
-}
+coroutine * this_coroutine(void);
 
 // Private wrappers for context switch and stack creation
Index: src/libcfa/concurrency/coroutines.c
===================================================================
--- src/libcfa/concurrency/coroutines.c	(revision dcb42b8657f59146a4e13e740761ef8af2db06e3)
+++ src/libcfa/concurrency/coroutines.c	(revision bd98b588c3ea1cfcbdb262cfe07a29ade49b0de4)
@@ -14,4 +14,6 @@
 //
 
+#include "coroutines"
+
 extern "C" {
 #include <stddef.h>
@@ -23,9 +25,11 @@
 }
 
-#include "coroutines"
+#include "kernel"
 #include "libhdr.h"
 
 #define __CFA_INVOKE_PRIVATE__
 #include "invoke.h"
+
+/*thread_local*/ extern processor * this_processor;
 
 //-----------------------------------------------------------------------------
@@ -35,8 +39,4 @@
 #define MinStackSize 1000
 static size_t pageSize = 0;				// architecture pagesize HACK, should go in proper runtime singleton
-
-//Current coroutine
-//Will need to be in TLS when multi-threading is added
-coroutine* current_coroutine;
 
 //-----------------------------------------------------------------------------
@@ -110,5 +110,5 @@
 
 	// set new coroutine that task is executing
-	current_coroutine = dst;			
+	this_processor->current_coroutine = dst;			
 
 	// context switch to specified coroutine
Index: src/libcfa/concurrency/invoke.c
===================================================================
--- src/libcfa/concurrency/invoke.c	(revision dcb42b8657f59146a4e13e740761ef8af2db06e3)
+++ src/libcfa/concurrency/invoke.c	(revision bd98b588c3ea1cfcbdb262cfe07a29ade49b0de4)
@@ -14,5 +14,5 @@
 
 extern void __suspend_no_inline__F___1(void);
-extern void __scheduler_remove__F_P9sthread_h__1(struct thread_h*);
+extern void __signal_termination__F_P9sthread_h__1(struct thread_h*);
 
 void CtxInvokeCoroutine(
@@ -57,7 +57,5 @@
       main( this );
 
-      cor->state = Halt;
-      cor->notHalted = false;
-      __scheduler_remove__F_P9sthread_h__1(thrd);
+      __signal_termination__F_P9sthread_h__1(thrd);
 
       //Final suspend, should never return
Index: src/libcfa/concurrency/invoke.h
===================================================================
--- src/libcfa/concurrency/invoke.h	(revision dcb42b8657f59146a4e13e740761ef8af2db06e3)
+++ src/libcfa/concurrency/invoke.h	(revision bd98b588c3ea1cfcbdb262cfe07a29ade49b0de4)
@@ -11,4 +11,18 @@
 
       #define unlikely(x)    __builtin_expect(!!(x), 0)
+      #define SCHEDULER_CAPACITY 10
+
+      struct simple_thread_list {            // FIFO of threads linked through their own `next` field
+            struct thread_h * head;          // first thread in the list; NULL when the list is empty
+            struct thread_h ** tail;         // slot the next append() writes: &head when empty, else &last->next
+      };
+
+      #ifdef __CFORALL__
+      extern "Cforall" {                                                     // list operations, visible only when __CFORALL__ is defined
+            void ?{}( struct simple_thread_list * );                         // constructor: makes the list empty
+            void append( struct simple_thread_list *, struct thread_h * );   // add a thread at the tail
+            struct thread_h * pop_head( struct simple_thread_list * );       // remove and return the first thread, NULL if none
+      }
+      #endif
 
       struct coStack_t {
@@ -35,6 +49,12 @@
       };
 
+      struct simple_lock {                         // blocking primitive with no owner or "held" flag, only a wait list
+      	struct simple_thread_list blocked;     // threads parked by lock() until unlock() reschedules them
+      };
+
       struct thread_h {
             struct coroutine c;
+            struct simple_lock lock;
+            struct thread_h * next;
       };
 
Index: src/libcfa/concurrency/kernel
===================================================================
--- src/libcfa/concurrency/kernel	(revision dcb42b8657f59146a4e13e740761ef8af2db06e3)
+++ src/libcfa/concurrency/kernel	(revision bd98b588c3ea1cfcbdb262cfe07a29ade49b0de4)
@@ -20,18 +20,37 @@
 #include <stdbool.h>
 
+#include "invoke.h"
+
+//-----------------------------------------------------------------------------
+// Cluster
+struct cluster {	// scheduling domain: owns the ready queue its processors pull threads from
+	simple_thread_list ready_queue;	// filled by thread_schedule(), drained by nextThread()
+};
+
+void ?{}(cluster * this);
+void ^?{}(cluster * this);
+
+//-----------------------------------------------------------------------------
+// Processor
 struct processor {
 	struct processorCtx_t * ctx;
-	unsigned int thread_index;
-	unsigned int thread_count;
-	struct thread_h * threads[10];
+	cluster * cltr;
+	coroutine * current_coroutine;
+	thread_h * current_thread;
 	bool terminated;
 };
 
-void ?{}(processor * this);
+void ?{}(processor * this, cluster * cltr);
 void ^?{}(processor * this);
 
-void scheduler_add( struct thread_h * thrd );
-void scheduler_remove( struct thread_h * thrd );
-void kernel_run( void );
+
+//-----------------------------------------------------------------------------
+// Locks
+
+void ?{}(simple_lock * this);
+void ^?{}(simple_lock * this);
+
+void lock( simple_lock * );
+void unlock( simple_lock * );
 
 #endif //KERNEL_H
Index: src/libcfa/concurrency/kernel.c
===================================================================
--- src/libcfa/concurrency/kernel.c	(revision dcb42b8657f59146a4e13e740761ef8af2db06e3)
+++ src/libcfa/concurrency/kernel.c	(revision bd98b588c3ea1cfcbdb262cfe07a29ade49b0de4)
@@ -32,4 +32,5 @@
 #include "invoke.h"
 
+cluster * systemCluster;
 processor * systemProcessor;
 thread_h * mainThread;
@@ -38,19 +39,29 @@
 void kernel_shutdown(void) __attribute__((destructor(101)));
 
-void ?{}(processor * this) {
+void ?{}(processor * this, cluster * cltr) {
 	this->ctx = NULL;
-	this->thread_index = 0;
-	this->thread_count = 10;
+	this->cltr = cltr;
 	this->terminated = false;
-
-	for(int i = 0; i < 10; i++) {
-		this->threads[i] = NULL;
-	}
-
-	LIB_DEBUG_PRINTF("Processor : ctor for core %p (core spots %d)\n", this, this->thread_count);
-}
-
-void ^?{}(processor * this) {
-
+}
+
+void ^?{}(processor * this) {}
+
+void ?{}(cluster * this) {	// cluster constructor
+	( &this->ready_queue ){};	// run the list constructor in place: the ready queue starts empty
+}
+
+void ^?{}(cluster * this) {}	// cluster destructor: empty body, the ready queue is left untouched
+
+//-----------------------------------------------------------------------------
+// Global state
+
+/*thread_local*/ processor * this_processor;	// processor running the current code; one plain global while thread_local stays commented out
+
+coroutine * this_coroutine(void) {	// coroutine currently recorded as running on this_processor
+	return this_processor->current_coroutine;	// dereferences this_processor without a NULL check
+}
+
+thread_h * this_thread(void) {	// thread most recently dispatched on this_processor
+	return this_processor->current_thread;	// field is written by runThread() and by kernel startup
 }
 
@@ -77,5 +88,5 @@
 // Processor running routines
 void main(processorCtx_t * ctx);
-thread_h * nextThread(processor * this);
+thread_h * nextThread(cluster * this);
 void runThread(processor * this, thread_h * dst);
 void spin(processor * this, unsigned int * spin_count);
@@ -88,5 +99,5 @@
 	for( unsigned int spin_count = 0; ! this->terminated; spin_count++ ) {
 		
-		readyThread = nextThread(this);
+		readyThread = nextThread( this->cltr );
 
 		if(readyThread) {
@@ -101,15 +112,4 @@
 }
 
-thread_h * nextThread(processor * this) {
-	for(int i = 0; i < this->thread_count; i++) {
-		this->thread_index = (this->thread_index + 1) % this->thread_count;	
-		
-		thread_h * thrd = this->threads[this->thread_index];
-		if(thrd) return thrd;
-	}
-
-	return NULL;
-}
-
 void runThread(processor * this, thread_h * dst) {
 	coroutine * proc_ctx = get_coroutine(this->ctx);
@@ -120,7 +120,8 @@
 	// Which is now the current_coroutine
 	// LIB_DEBUG_PRINTF("Kernel : switching to ctx %p (from %p, current %p)\n", thrd_ctx, proc_ctx, current_coroutine);
-	current_coroutine = thrd_ctx;
+	this->current_thread = dst;
+	this->current_coroutine = thrd_ctx;
 	CtxSwitch( proc_ctx->stack.context, thrd_ctx->stack.context );
-	current_coroutine = proc_ctx;
+	this->current_coroutine = proc_ctx;
 	// LIB_DEBUG_PRINTF("Kernel : returned from ctx %p (to %p, current %p)\n", thrd_ctx, proc_ctx, current_coroutine);
 
@@ -133,24 +134,12 @@
 
 //-----------------------------------------------------------------------------
-// Kernel runner (Temporary)
-
-void scheduler_add( thread_h * thrd ) {
-	for(int i = 0; i < systemProcessor->thread_count; i++) {
-		if(systemProcessor->threads[i] == NULL) {
-			systemProcessor->threads[i] = thrd;
-			return;
-		}
-	}
-	assertf(false, "Scheduler full");
-}
-
-void scheduler_remove( thread_h * thrd ) {
-	for(int i = 0; i < systemProcessor->thread_count; i++) {
-		if(systemProcessor->threads[i] == thrd) {
-			systemProcessor->threads[i] = NULL;
-			return;
-		}
-	}
-	assertf(false, "Trying to unschedule unkown thread");
+// Scheduler routines
+void thread_schedule( thread_h * thrd ) {	// make thrd runnable by queueing it on the ready queue
+	assertf( thrd->next == NULL, "Expected null got %p", thrd->next );	// guards against queueing a thread that is still linked into a list
+	append( &systemProcessor->cltr->ready_queue, thrd );	// always the system processor's cluster, whichever processor calls
+}
+
+thread_h * nextThread(cluster * this) {	// next runnable thread of the cluster, NULL when its ready queue is empty
+	return pop_head( &this->ready_queue );	// FIFO order: the thread scheduled earliest comes out first
 }
 
@@ -160,4 +149,5 @@
 
 KERNEL_STORAGE(processorCtx_t, systemProcessorCtx);
+KERNEL_STORAGE(cluster, systemCluster);
 KERNEL_STORAGE(processor, systemProcessor);
 KERNEL_STORAGE(thread_h, mainThread);
@@ -221,10 +211,4 @@
 
 	mainThread_info_t ctx;
-	// LIB_DEBUG_PRINTF("Kernel :    base : %p\n", ctx.base );
-	// LIB_DEBUG_PRINTF("Kernel :     top : %p\n", ctx.top );
-	// LIB_DEBUG_PRINTF("Kernel :   limit : %p\n", ctx.limit );
-	// LIB_DEBUG_PRINTF("Kernel :    size : %x\n", ctx.size );
-	// LIB_DEBUG_PRINTF("Kernel : storage : %p\n", ctx.storage );
-	// LIB_DEBUG_PRINTF("Kernel : context : %p\n", ctx.context );
 
 	// Start by initializing the main thread
@@ -232,7 +216,11 @@
 	mainThread{ &ctx };
 
-	// // Initialize the system processor
+	// Initialize the system cluster
+	systemCluster = (cluster *)&systemCluster_storage;
+	systemCluster{};
+
+	// Initialize the system processor
 	systemProcessor = (processor *)&systemProcessor_storage;
-	systemProcessor{};
+	systemProcessor{ systemCluster };
 
 	// Initialize the system processor ctx
@@ -243,8 +231,10 @@
 	// Add the main thread to the ready queue 
 	// once resume is called on systemProcessor->ctx the mainThread needs to be scheduled like any normal thread
-	scheduler_add(mainThread);
+	thread_schedule(mainThread);
 
 	//initialize the global state variables
-	current_coroutine = &mainThread->c;
+	this_processor = systemProcessor;
+	this_processor->current_thread = mainThread;
+	this_processor->current_coroutine = &mainThread->c;
 
 	// SKULLDUGGERY: Force a context switch to the system processor to set the main thread's context to the current UNIX
@@ -285,4 +275,51 @@
 }
 
+//-----------------------------------------------------------------------------
+// Locks
+void ?{}( simple_lock * this ) {	// simple_lock constructor
+	( &this->blocked ){};	// start with no blocked threads
+}
+
+void ^?{}( simple_lock * this ) {	// simple_lock destructor: empty body, the blocked list is left untouched
+
+}
+
+void lock( simple_lock * this ) {	// park the calling thread on this lock; there is no "already held" check, every call suspends
+	append( &this->blocked, this_thread() );	// record the caller so unlock() can reschedule it
+	suspend();	// give up the processor; NOTE(review): presumably resumes only after unlock() requeues the thread — confirm
+}
+
+void unlock( simple_lock * this ) {	// wake every thread parked on this lock by handing each one back to the scheduler
+	thread_h * it;
+	while( (it = pop_head( &this->blocked )) != NULL ) {	// parenthesised + explicit NULL test: the assignment in the condition is deliberate
+		thread_schedule( it );	// pop_head() already cleared it->next, which thread_schedule() asserts
+	}
+}
+
+//-----------------------------------------------------------------------------
+// Queues
+void ?{}( simple_thread_list * this ) {	// list constructor: produces an empty list
+	this->head = NULL;
+	this->tail = &this->head;	// with no nodes, the first append() must write into head itself
+}
+
+void append( simple_thread_list * this, thread_h * t ) {	// add t at the end of the list
+	assert( t->next == NULL );	// t becomes the last node, so it must not carry a stale link
+	*this->tail = t;	// fills head when the list is empty, otherwise the previous last node's next
+	this->tail = &t->next;	// the following append() will write into t->next
+}
+
+thread_h * pop_head( simple_thread_list * this ) {
+	// Detach and return the oldest thread on the list; NULL when the list is empty.
+	thread_h * first = this->head;
+	if( first == NULL ) return NULL;
+
+	this->head = first->next;
+	// The last node just left: aim the tail back at the head slot.
+	if( this->head == NULL ) this->tail = &this->head;
+	// Clear the link so the thread can be appended to another list.
+	first->next = NULL;
+	return first;
+}
 // Local Variables: //
 // mode: c //
Index: src/libcfa/concurrency/threads
===================================================================
--- src/libcfa/concurrency/threads	(revision dcb42b8657f59146a4e13e740761ef8af2db06e3)
+++ src/libcfa/concurrency/threads	(revision bd98b588c3ea1cfcbdb262cfe07a29ade49b0de4)
@@ -45,4 +45,6 @@
 }
 
+thread_h * this_thread(void);
+
 //-----------------------------------------------------------------------------
 // Ctors and dtors
@@ -67,6 +69,5 @@
 void ^?{}( thread(T)* this );
 
-//-----------------------------------------------------------------------------
-// PRIVATE exposed because of inline
+void yield();
 
 #endif //THREADS_H
Index: src/libcfa/concurrency/threads.c
===================================================================
--- src/libcfa/concurrency/threads.c	(revision dcb42b8657f59146a4e13e740761ef8af2db06e3)
+++ src/libcfa/concurrency/threads.c	(revision bd98b588c3ea1cfcbdb262cfe07a29ade49b0de4)
@@ -23,13 +23,17 @@
 #include "invoke.h"
 
-#include <stdlib>
+extern "C" {
+	#include <stddef.h>
+}
+
+/*thread_local*/ extern processor * this_processor;
 
 //-----------------------------------------------------------------------------
 // Forward declarations
 forall(otype T | is_thread(T) )
-void start( thread(T)* this );
+void start( T* this );
 
 forall(otype T | is_thread(T) )
-void stop( thread(T)* this );
+void stop( T* this );
 
 //-----------------------------------------------------------------------------
@@ -38,4 +42,6 @@
 void ?{}(thread_h* this) {
 	(&this->c){};
+	(&this->lock){};
+	this->next = NULL;
 }
 
@@ -47,5 +53,5 @@
 void ?{}( thread(T)* this ) {
 	(&this->handle){};
-	start(this);
+	start(&this->handle);
 }
 
@@ -53,10 +59,10 @@
 void ?{}( thread(T)* this, P params ) {
 	(&this->handle){ params };
-	start(this);
+	start(&this->handle);
 }
 
 forall(otype T | is_thread(T) )
 void ^?{}( thread(T)* this ) {
-	stop(this);
+	stop(&this->handle);
 	^(&this->handle){};
 }
@@ -69,28 +75,39 @@
 }
 
+extern void thread_schedule( thread_h * );
+
 forall(otype T | is_thread(T))
-void start( thread(T)* this ) {
-	T* handle  = &this->handle;
-	coroutine* thrd_c = get_coroutine(handle);
-	thread_h*  thrd_h = get_thread   (handle);
+void start( T* this ) {
+	coroutine* thrd_c = get_coroutine(this);
+	thread_h*  thrd_h = get_thread   (this);
 	thrd_c->last = this_coroutine();
-	current_coroutine = thrd_c;
+	this_processor->current_coroutine = thrd_c;
 
 	// LIB_DEBUG_PRINTF("Thread start : %p (t %p, c %p)\n", handle, thrd_c, thrd_h);
 
 	create_stack(&thrd_c->stack, thrd_c->stack.size);
-	CtxStart(handle, CtxInvokeThread);
+	CtxStart(this, CtxInvokeThread);
 	CtxSwitch( thrd_c->last->stack.context, thrd_c->stack.context );
 
-	scheduler_add(thrd_h);
+	thread_schedule(thrd_h);
 }
 
 forall(otype T | is_thread(T) )
-void stop( thread(T)* this ) {
-	T* handle  = &this->handle;
-	thread_h*  thrd_h = get_thread   (handle);
-	while( thrd_h->c.notHalted ) {
-		suspend();
+void stop( T* this ) {	// wait for the thread to finish; called from the thread(T) destructor
+	thread_h*  thrd = get_thread(this);
+	if( thrd->c.notHalted ) {	// still running: block on the thread's own lock
+		lock( &thrd->lock );	// released by signal_termination() once the thread halts
 	}
+}
+
+void signal_termination( thread_h * this ) {	// mark the thread finished and release whoever waits in stop()
+	this->c.state = Halt;
+      this->c.notHalted = false;	// makes a later stop() return without blocking
+	unlock( &this->lock );	// reschedule every thread blocked on this thread's lock
+}
+
+void yield( void ) {	// let other ready threads run while the caller stays runnable
+	thread_schedule( this_thread() );	// requeue the caller first so it is picked up again later
+	suspend();	// then suspend the caller
 }
 
