Index: src/libcfa/concurrency/invoke.h
===================================================================
--- src/libcfa/concurrency/invoke.h	(revision 1d29d46e76ca8eeebab3a2ad22aea33afbb42f87)
+++ src/libcfa/concurrency/invoke.h	(revision 727cf70fd3ddbe0381c745441cbdb5942cc8920a)
@@ -33,14 +33,22 @@
       };
 
-      struct simple_thread_list {
+      struct __thread_queue_t {
             struct thread_desc * head;
             struct thread_desc ** tail;
       };
 
+      struct __thread_stack_t {
+            struct thread_desc * top;
+      };
+
       #ifdef __CFORALL__
       extern "Cforall" {
-            void ?{}( struct simple_thread_list * );
-            void append( struct simple_thread_list *, struct thread_desc * );
-            struct thread_desc * pop_head( struct simple_thread_list * );
+            void ?{}( struct __thread_queue_t * );
+            void append( struct __thread_queue_t *, struct thread_desc * );
+            struct thread_desc * pop_head( struct __thread_queue_t * );
+
+            void ?{}( struct __thread_stack_t * );
+            void push( struct __thread_stack_t *, struct thread_desc * );
+            struct thread_desc * pop( struct __thread_stack_t * );
 
             void ?{}(spinlock * this);
@@ -50,11 +58,11 @@
 
       struct coStack_t {
-            unsigned int size;                  // size of stack
-            void *storage;                      // pointer to stack
-            void *limit;                        // stack grows towards stack limit
-            void *base;                         // base of stack
-            void *context;                      // address of cfa_context_t
-            void *top;                          // address of top of storage
-            bool userStack;                     // whether or not the user allocated the stack
+            unsigned int size;                        // size of stack
+            void *storage;                            // pointer to stack
+            void *limit;                              // stack grows towards stack limit
+            void *base;                               // base of stack
+            void *context;                            // address of cfa_context_t
+            void *top;                                // address of top of storage
+            bool userStack;                           // whether or not the user allocated the stack
       };
 
@@ -62,23 +70,27 @@
 
       struct coroutine_desc {
-            struct coStack_t stack;             // stack information of the coroutine
-            const char *name;                   // textual name for coroutine/task, initialized by uC++ generated code
-            int errno_;                         // copy of global UNIX variable errno
-            enum coroutine_state state;         // current execution status for coroutine
-            struct coroutine_desc *starter;     // first coroutine to resume this one
-            struct coroutine_desc *last;	      // last coroutine to resume this one
+            struct coStack_t stack;                   // stack information of the coroutine
+            const char *name;                         // textual name for coroutine/task, initialized by uC++ generated code
+            int errno_;                               // copy of global UNIX variable errno
+            enum coroutine_state state;               // current execution status for coroutine
+            struct coroutine_desc *starter;           // first coroutine to resume this one
+            struct coroutine_desc *last;	            // last coroutine to resume this one
       };
 
       struct monitor_desc {
-            struct spinlock lock;
-            struct thread_desc * owner;
-            struct simple_thread_list entry_queue;
-            unsigned int recursion;
+            struct spinlock lock;                     // spinlock to protect internal data
+            struct thread_desc * owner;               // current owner of the monitor
+            struct __thread_queue_t entry_queue;      // queue of threads that are blocked waiting for the monitor
+            struct __thread_stack_t signal_stack;     // stack of threads to run next once we exit the monitor
+            struct monitor_desc * stack_owner;        // if bulk acquiring was used we need to synchronize signals with an other monitor
+            unsigned int recursion;                   // monitor routines can be called recursively, we need to keep track of that
       };
 
       struct thread_desc {
-            struct coroutine_desc cor;          // coroutine body used to store context
-            struct monitor_desc mon;            // monitor body used for mutual exclusion
-            struct thread_desc * next;          // instrusive link field for threads
+            struct coroutine_desc cor;                // coroutine body used to store context
+            struct monitor_desc mon;                  // monitor body used for mutual exclusion
+            struct thread_desc * next;                // instrusive link field for threads
+            struct monitor_desc ** current_monitors;  // currently held monitors
+            unsigned short current_monitor_count;     // number of currently held monitors
       };
 
Index: src/libcfa/concurrency/kernel
===================================================================
--- src/libcfa/concurrency/kernel	(revision 1d29d46e76ca8eeebab3a2ad22aea33afbb42f87)
+++ src/libcfa/concurrency/kernel	(revision 727cf70fd3ddbe0381c745441cbdb5942cc8920a)
@@ -32,7 +32,7 @@
 
 struct signal_once {
-	volatile bool condition;
+	volatile bool cond;
 	struct spinlock lock;
-	struct simple_thread_list blocked;
+	struct __thread_queue_t blocked;
 };
 
@@ -46,5 +46,5 @@
 // Cluster
 struct cluster {
-	simple_thread_list ready_queue;
+	__thread_queue_t ready_queue;
 	spinlock lock;
 };
Index: src/libcfa/concurrency/kernel.c
===================================================================
--- src/libcfa/concurrency/kernel.c	(revision 1d29d46e76ca8eeebab3a2ad22aea33afbb42f87)
+++ src/libcfa/concurrency/kernel.c	(revision 727cf70fd3ddbe0381c745441cbdb5942cc8920a)
@@ -299,4 +299,6 @@
 // Scheduler routines
 void ScheduleThread( thread_desc * thrd ) {
+	if( !thrd ) return;
+
 	assertf( thrd->next == NULL, "Expected null got %p", thrd->next );
 	
@@ -473,5 +475,5 @@
 
 void ?{}( signal_once * this ) {
-	this->condition = false;
+	this->cond = false;
 }
 void ^?{}( signal_once * this ) {
@@ -481,5 +483,5 @@
 void wait( signal_once * this ) {
 	lock( &this->lock );
-	if( !this->condition ) {
+	if( !this->cond ) {
 		append( &this->blocked, this_thread() );
 		ScheduleInternal( &this->lock );
@@ -492,5 +494,5 @@
 	lock( &this->lock );
 	{
-		this->condition = true;
+		this->cond = true;
 
 		thread_desc * it;
@@ -504,10 +506,10 @@
 //-----------------------------------------------------------------------------
 // Queues
-void ?{}( simple_thread_list * this ) {
+void ?{}( __thread_queue_t * this ) {
 	this->head = NULL;
 	this->tail = &this->head;
 }
 
-void append( simple_thread_list * this, thread_desc * t ) {
+void append( __thread_queue_t * this, thread_desc * t ) {
 	assert(this->tail != NULL);
 	*this->tail = t;
@@ -515,5 +517,5 @@
 }
 
-thread_desc * pop_head( simple_thread_list * this ) {
+thread_desc * pop_head( __thread_queue_t * this ) {
 	thread_desc * head = this->head;
 	if( head ) {
@@ -526,4 +528,23 @@
 	return head;
 }
+
+void ?{}( __thread_stack_t * this ) {
+	this->top = NULL;
+}
+
+void push( __thread_stack_t * this, thread_desc * t ) {
+	assert(t->next == NULL);
+	t->next = this->top;
+	this->top = t;
+}
+
+thread_desc * pop( __thread_stack_t * this ) {
+	thread_desc * top = this->top;
+	if( top ) {
+		this->top = top->next;
+		top->next = NULL;
+	}	
+	return top;
+}
 // Local Variables: //
 // mode: c //
Index: src/libcfa/concurrency/monitor
===================================================================
--- src/libcfa/concurrency/monitor	(revision 1d29d46e76ca8eeebab3a2ad22aea33afbb42f87)
+++ src/libcfa/concurrency/monitor	(revision 727cf70fd3ddbe0381c745441cbdb5942cc8920a)
@@ -18,4 +18,6 @@
 #define MONITOR_H
 
+#include <stddef.h>
+
 #include "assert"
 #include "invoke.h"
@@ -23,15 +25,14 @@
 
 static inline void ?{}(monitor_desc * this) {
-	this->owner = 0;
+	this->owner = NULL;
+	this->stack_owner = NULL;
 	this->recursion = 0;
 }
-
-//Array entering routine
-void enter(monitor_desc **, int count);
-void leave(monitor_desc **, int count);
 
 struct monitor_guard_t {
 	monitor_desc ** m;
 	int count;
+	monitor_desc ** prev_mntrs;
+	unsigned short  prev_count;
 };
 
@@ -40,15 +41,21 @@
 }
 
-static inline void ?{}( monitor_guard_t * this, monitor_desc ** m, int count ) {
-	this->m = m;
-	this->count = count;
-	qsort(this->m, count);
-	enter( this->m, this->count );
+void ?{}( monitor_guard_t * this, monitor_desc ** m, int count );
+void ^?{}( monitor_guard_t * this );
+
+//-----------------------------------------------------------------------------
+// Internal scheduling
+struct condition {
+	__thread_queue_t blocked;
+	monitor_desc ** monitors;
+	unsigned short monitor_count;
+};
+
+static inline void ?{}( condition * this ) {
+	this->monitors = NULL;
+	this->monitor_count = 0;
 }
 
-static inline void ^?{}( monitor_guard_t * this ) {
-	leave( this->m, this->count );
-}
-
-
+void wait( condition * this );
+void signal( condition * this );
 #endif //MONITOR_H
Index: src/libcfa/concurrency/monitor.c
===================================================================
--- src/libcfa/concurrency/monitor.c	(revision 1d29d46e76ca8eeebab3a2ad22aea33afbb42f87)
+++ src/libcfa/concurrency/monitor.c	(revision 727cf70fd3ddbe0381c745441cbdb5942cc8920a)
@@ -18,9 +18,21 @@
 
 #include "kernel_private.h"
+#include "libhdr.h"
+
+void set_owner( monitor_desc * this, thread_desc * owner ) {
+	//Pass the monitor appropriately
+	this->owner = owner;
+
+	//We are passing the monitor to someone else, which means recursion level is not 0
+	this->recursion = owner ? 1 : 0;
+}
 
 extern "C" {
-	void __enter_monitor_desc(monitor_desc * this) {
+	void __enter_monitor_desc(monitor_desc * this, monitor_desc * leader) {
 		lock( &this->lock );
 		thread_desc * thrd = this_thread();
+
+		//Update the stack owner
+		this->stack_owner = leader;
 
 		if( !this->owner ) {
@@ -44,43 +56,206 @@
 
 		unlock( &this->lock );
-	}
-
-	void __leave_monitor_desc(monitor_desc * this) {
+		return;
+	}
+
+	// leave pseudo code :
+	// 	decrement level
+	// 	leve == 0 ?
+	// 		no : done
+	// 		yes :
+	// 			signal stack empty ?
+	//				has leader :
+	//					bulk acquiring means we don't own the signal stack
+	//					ignore it but don't release the monitor
+	// 				yes :
+	// 					next in entry queue is new owner
+	// 				no :
+	// 					top of the signal stack is the owner
+	//					context switch to him right away
+	//
+	void __leave_monitor_desc(monitor_desc * this, monitor_desc * leader) {
 		lock( &this->lock );
 
 		thread_desc * thrd = this_thread();
-		assert( thrd == this->owner );
+		assert( thrd == this->owner || this->stack_owner );
 
 		//Leaving a recursion level, decrement the counter
 		this->recursion -= 1;
 
-		//If we left the last level of recursion it means we are changing who owns the monitor
+		//If we haven't left the last level of recursion
+		//it means we don't need to do anything
+		if( this->recursion != 0) {
+			this->stack_owner = leader;
+			unlock( &this->lock );
+			return;
+		}
+			
+		//If we don't own the signal stack then just leave it to the owner
+		if( this->stack_owner ) {
+			this->stack_owner = leader;
+			unlock( &this->lock );
+			return;
+		}
+
+		//We are the stack owner and have left the last recursion level.
+		//We are in charge of passing the monitor
 		thread_desc * new_owner = 0;
-		if( this->recursion == 0) {
-			//Get the next thread in the list
-			new_owner = this->owner = pop_head( &this->entry_queue );
-
-			//We are passing the monitor to someone else, which means recursion level is not 0
-			this->recursion = new_owner ? 1 : 0;
-		}	
-
+
+		//Check the signaller stack
+		new_owner = pop( &this->signal_stack );
+		if( new_owner ) {
+			//The signaller stack is not empty,
+			//transfer control immediately
+			set_owner( this, new_owner );
+			this->stack_owner = leader;
+			ScheduleInternal( &this->lock, new_owner );
+			return;
+		}
+		
+		// No signaller thread
+		// Get the next thread in the entry_queue
+		new_owner = pop_head( &this->entry_queue );
+		set_owner( this, new_owner );
+
+		//Update the stack owner
+		this->stack_owner = leader;
+
+		//We can now let other threads in safely
 		unlock( &this->lock );
 
-		//If we have a new owner, we need to wake-up the thread
-		if( new_owner ) {
-			ScheduleThread( new_owner );
-		}
-	}
-}
-
-void enter(monitor_desc ** monitors, int count) {
-	for(int i = 0; i < count; i++) {
-		__enter_monitor_desc( monitors[i] );
-	}
-}
-
-void leave(monitor_desc ** monitors, int count) {
+		//We need to wake-up the thread
+		ScheduleThread( new_owner );
+	}
+}
+
+static inline void enter(monitor_desc ** monitors, int count) {
+	__enter_monitor_desc( monitors[0], NULL );
+	for(int i = 1; i < count; i++) {
+		__enter_monitor_desc( monitors[i], monitors[0] );
+	}
+}
+
+static inline void leave(monitor_desc ** monitors, int count) {
+	__leave_monitor_desc( monitors[0], NULL );
 	for(int i = count - 1; i >= 0; i--) {
-		__leave_monitor_desc( monitors[i] );
-	}
-}
+		if( i != 0 ) __leave_monitor_desc( monitors[i], monitors[0] );
+	}
+}
+
+void ?{}( monitor_guard_t * this, monitor_desc ** m, int count ) {
+	this->m = m;
+	this->count = count;
+	qsort(this->m, count);
+	enter( this->m, this->count );
+
+	this->prev_mntrs = this_thread()->current_monitors;
+	this->prev_count = this_thread()->current_monitor_count;
+
+	this_thread()->current_monitors      = m;
+	this_thread()->current_monitor_count = count;
+}
+
+void ^?{}( monitor_guard_t * this ) {
+	leave( this->m, this->count );
+
+	this_thread()->current_monitors      = this->prev_mntrs;
+	this_thread()->current_monitor_count = this->prev_count;
+}
+
+//-----------------------------------------------------------------------------
+// Internal scheduling
+void wait( condition * this ) {
+	// LIB_DEBUG_FPRINTF("Waiting\n");
+	thread_desc * this_thrd = this_thread();
+
+	if( !this->monitors ) {
+		this->monitors = this_thrd->current_monitors;
+		this->monitor_count = this_thrd->current_monitor_count;
+	}
+
+	unsigned short count = this->monitor_count;
+
+	//Check that everything is as expected
+	assert( this->monitors != NULL );
+	assert( this->monitor_count != 0 );
+
+	unsigned int recursions[ count ];		//Save the current recursion levels to restore them later
+	spinlock *   locks     [ count ];		//We need to pass-in an array of locks to ScheduleInternal
+
+	// LIB_DEBUG_FPRINTF("Getting ready to wait\n");
+	append( &this->blocked, this_thrd );
+	//Loop on all the monitors and release the owner
+	for( unsigned int i = 0; i < count; i++ ) {
+		monitor_desc * cur = this->monitors[i];
+
+		assert( cur );
+
+		// LIB_DEBUG_FPRINTF("cur %p lock %p\n", cur, &cur->lock);
+
+		//Store the locks for later
+		locks[i] = &cur->lock;
+
+		//Protect the monitors
+		lock( locks[i] );
+		{		
+			//Save the recursion levels
+			recursions[i] = cur->recursion;
+
+			//Release the owner
+			cur->recursion = 0;
+			cur->owner = NULL;
+		}
+		//Release the monitor
+		unlock( locks[i] );
+	}
+
+	// LIB_DEBUG_FPRINTF("Waiting now\n");
+
+	//Everything is ready to go to sleep
+	ScheduleInternal( locks, count );
+
+
+	//WE WOKE UP
+
+
+	//We are back, restore the owners and recursions
+	for( unsigned int i = 0; i < count; i++ ) {
+		monitor_desc * cur = this->monitors[i];
+
+		//Protect the monitors
+		lock( locks[i] );
+		{
+			//Release the owner
+			cur->owner = this_thrd;
+			cur->recursion = recursions[i];
+		}
+		//Release the monitor
+		unlock( locks[i] );
+	}
+}
+
+static void __signal_internal( condition * this ) {
+	if( !this->blocked.head ) return;
+
+	//Check that everything is as expected
+	assert( this->monitors );
+	assert( this->monitor_count != 0 );
+	
+	LIB_DEBUG_DO(
+		if ( this->monitors != this_thread()->current_monitors ) {
+			abortf( "Signal on condition %p made outside of the correct monitor(s)", this );
+		} // if
+	);
+
+	monitor_desc * owner = this->monitors[0];
+	lock( &owner->lock );
+	{
+		thread_desc * unblock = pop_head( &this->blocked );
+		push( &owner->signal_stack, unblock );
+	}
+	unlock( &owner->lock );
+}
+
+void signal( condition * this ) {
+	__signal_internal( this );
+}
Index: src/libcfa/concurrency/thread.c
===================================================================
--- src/libcfa/concurrency/thread.c	(revision 1d29d46e76ca8eeebab3a2ad22aea33afbb42f87)
+++ src/libcfa/concurrency/thread.c	(revision 727cf70fd3ddbe0381c745441cbdb5942cc8920a)
@@ -39,4 +39,7 @@
 	this->mon.recursion = 1;
 	this->next = NULL;
+
+	this->current_monitors      = NULL;
+	this->current_monitor_count = 0;
 }
 
Index: src/tests/sched_internal.c
===================================================================
--- src/tests/sched_internal.c	(revision 727cf70fd3ddbe0381c745441cbdb5942cc8920a)
+++ src/tests/sched_internal.c	(revision 727cf70fd3ddbe0381c745441cbdb5942cc8920a)
@@ -0,0 +1,56 @@
+#include <kernel>
+#include <monitor>
+#include <thread>
+
+monitor global_t {
+	int value;
+};
+
+global_t global;
+
+condition cond;
+
+thread Signalee {};
+thread Signaler {};
+
+void step1( global_t * mutex this ) {
+	this->value = 1;
+	wait( &cond );
+}
+
+void step2( global_t * mutex this ) {
+	if( this->value != 1) abort();
+
+	this->value = 2;
+	signal( &cond );
+}
+
+void step3( global_t * mutex this ) {
+	if( this->value != 2) abort();
+
+	this->value = 3;
+	signal( &cond );
+}
+
+void main( Signalee* this ) {
+	step1( &global );
+	step3( &global );
+}
+
+void main( Signaler* this ) {
+	for(int i = 0; i < 10_000; i++) {
+		asm volatile ("" : : : "memory");
+	}
+
+	step2( &global );
+}
+
+int main(int argc, char* argv[]) {
+	assert( global.__mon.entry_queue.tail != NULL );
+	processor p;
+	{
+		Signalee a;
+		Signaler b;
+	}
+	if( global.value != 3) abort();
+}
