Index: src/libcfa/concurrency/invoke.h
===================================================================
--- src/libcfa/concurrency/invoke.h	(revision 78d3dd5829f3bbcb4527885629edab2212e6d4b8)
+++ src/libcfa/concurrency/invoke.h	(revision 23063ea0bdf02c35bfe020a0745a20afd7b21946)
@@ -33,10 +33,10 @@
       };
 
-      struct simple_thread_list {
+      struct __thread_queue_t {
             struct thread_desc * head;
             struct thread_desc ** tail;
       };
 
-      struct simple_thread_stack {
+      struct __thread_stack_t {
             struct thread_desc * top;
       };
@@ -44,11 +44,11 @@
       #ifdef __CFORALL__
       extern "Cforall" {
-            void ?{}( struct simple_thread_list * );
-            void append( struct simple_thread_list *, struct thread_desc * );
-            struct thread_desc * pop_head( struct simple_thread_list * );
+            void ?{}( struct __thread_queue_t * );
+            void append( struct __thread_queue_t *, struct thread_desc * );
+            struct thread_desc * pop_head( struct __thread_queue_t * );
 
-            void ?{}( struct simple_thread_stack * );
-            void push( struct simple_thread_stack *, struct thread_desc * );            
-            struct thread_desc * pop( struct simple_thread_stack * );
+            void ?{}( struct __thread_stack_t * );
+            void push( struct __thread_stack_t *, struct thread_desc * );            
+            struct thread_desc * pop( struct __thread_stack_t * );
 
             void ?{}(spinlock * this);
@@ -58,11 +58,11 @@
 
       struct coStack_t {
-            unsigned int size;                  // size of stack
-            void *storage;                      // pointer to stack
-            void *limit;                        // stack grows towards stack limit
-            void *base;                         // base of stack
-            void *context;                      // address of cfa_context_t
-            void *top;                          // address of top of storage
-            bool userStack;                     // whether or not the user allocated the stack
+            unsigned int size;                        // size of stack
+            void *storage;                            // pointer to stack
+            void *limit;                              // stack grows towards stack limit
+            void *base;                               // base of stack
+            void *context;                            // address of cfa_context_t
+            void *top;                                // address of top of storage
+            bool userStack;                           // whether or not the user allocated the stack
       };
 
@@ -70,25 +70,27 @@
 
       struct coroutine_desc {
-            struct coStack_t stack;             // stack information of the coroutine
-            const char *name;                   // textual name for coroutine/task, initialized by uC++ generated code
-            int errno_;                         // copy of global UNIX variable errno
-            enum coroutine_state state;         // current execution status for coroutine
-            struct coroutine_desc *starter;     // first coroutine to resume this one
-            struct coroutine_desc *last;	      // last coroutine to resume this one
+            struct coStack_t stack;                   // stack information of the coroutine
+            const char *name;                         // textual name for coroutine/task, initialized by uC++ generated code
+            int errno_;                               // copy of global UNIX variable errno
+            enum coroutine_state state;               // current execution status for coroutine
+            struct coroutine_desc *starter;           // first coroutine to resume this one
+            struct coroutine_desc *last;	            // last coroutine to resume this one
       };
 
       struct monitor_desc {
-            struct spinlock lock;
-            struct thread_desc * owner;
-            struct simple_thread_list entry_queue;
-            struct simple_thread_stack signal_stack;
-            struct monitor_desc * stack_owner;
-            unsigned int recursion;
+            struct spinlock lock;                     // spinlock to protect internal data
+            struct thread_desc * owner;               // current owner of the monitor
+            struct __thread_queue_t entry_queue;      // queue of threads that are blocked waiting for the monitor
+            struct __thread_stack_t signal_stack;     // stack of threads to run next once we exit the monitor
+            struct monitor_desc * stack_owner;        // if bulk acquiring was used we need to synchronize signals with another monitor
+            unsigned int recursion;                   // monitor routines can be called recursively, we need to keep track of that
       };
 
       struct thread_desc {
-            struct coroutine_desc cor;          // coroutine body used to store context
-            struct monitor_desc mon;            // monitor body used for mutual exclusion
-            struct thread_desc * next;          // instrusive link field for threads
+            struct coroutine_desc cor;                // coroutine body used to store context
+            struct monitor_desc mon;                  // monitor body used for mutual exclusion
+            struct thread_desc * next;                // intrusive link field for threads
+            struct monitor_desc ** current_monitors;  // currently held monitors
+            unsigned short current_monitor_count;     // number of currently held monitors
       };
 
Index: src/libcfa/concurrency/kernel
===================================================================
--- src/libcfa/concurrency/kernel	(revision 78d3dd5829f3bbcb4527885629edab2212e6d4b8)
+++ src/libcfa/concurrency/kernel	(revision 23063ea0bdf02c35bfe020a0745a20afd7b21946)
@@ -32,7 +32,7 @@
 
 struct signal_once {
-	volatile bool condition;
+	volatile bool cond;
 	struct spinlock lock;
-	struct simple_thread_list blocked;
+	struct __thread_queue_t blocked;
 };
 
@@ -46,5 +46,5 @@
 // Cluster
 struct cluster {
-	simple_thread_list ready_queue;
+	__thread_queue_t ready_queue;
 	spinlock lock;
 };
Index: src/libcfa/concurrency/kernel.c
===================================================================
--- src/libcfa/concurrency/kernel.c	(revision 78d3dd5829f3bbcb4527885629edab2212e6d4b8)
+++ src/libcfa/concurrency/kernel.c	(revision 23063ea0bdf02c35bfe020a0745a20afd7b21946)
@@ -475,5 +475,5 @@
 
 void ?{}( signal_once * this ) {
-	this->condition = false;
+	this->cond = false;
 }
 void ^?{}( signal_once * this ) {
@@ -483,5 +483,5 @@
 void wait( signal_once * this ) {
 	lock( &this->lock );
-	if( !this->condition ) {
+	if( !this->cond ) {
 		append( &this->blocked, this_thread() );
 		ScheduleInternal( &this->lock );
@@ -494,5 +494,5 @@
 	lock( &this->lock );
 	{
-		this->condition = true;
+		this->cond = true;
 
 		thread_desc * it;
@@ -506,10 +506,10 @@
 //-----------------------------------------------------------------------------
 // Queues
-void ?{}( simple_thread_list * this ) {
+void ?{}( __thread_queue_t * this ) {
 	this->head = NULL;
 	this->tail = &this->head;
 }
 
-void append( simple_thread_list * this, thread_desc * t ) {
+void append( __thread_queue_t * this, thread_desc * t ) {
 	assert(this->tail != NULL);
 	*this->tail = t;
@@ -517,5 +517,5 @@
 }
 
-thread_desc * pop_head( simple_thread_list * this ) {
+thread_desc * pop_head( __thread_queue_t * this ) {
 	thread_desc * head = this->head;
 	if( head ) {
@@ -529,9 +529,9 @@
 }
 
-void ?{}( simple_thread_stack * this ) {
+void ?{}( __thread_stack_t * this ) {
 	this->top = NULL;
 }
 
-void push( simple_thread_stack * this, thread_desc * t ) {
+void push( __thread_stack_t * this, thread_desc * t ) {
 	assert(t->next != NULL);
 	t->next = this->top;
@@ -539,5 +539,5 @@
 }
 
-thread_desc * pop( simple_thread_stack * this ) {
+thread_desc * pop( __thread_stack_t * this ) {
 	thread_desc * top = this->top;
 	if( top ) {
Index: src/libcfa/concurrency/monitor
===================================================================
--- src/libcfa/concurrency/monitor	(revision 78d3dd5829f3bbcb4527885629edab2212e6d4b8)
+++ src/libcfa/concurrency/monitor	(revision 23063ea0bdf02c35bfe020a0745a20afd7b21946)
@@ -30,11 +30,9 @@
 }
 
-//Array entering routine
-void enter(monitor_desc **, int count);
-void leave(monitor_desc **, int count);
-
 struct monitor_guard_t {
 	monitor_desc ** m;
 	int count;
+      monitor_desc ** prev_mntrs;
+      unsigned short  prev_count;
 };
 
@@ -43,15 +41,21 @@
 }
 
-static inline void ?{}( monitor_guard_t * this, monitor_desc ** m, int count ) {
-	this->m = m;
-	this->count = count;
-	qsort(this->m, count);
-	enter( this->m, this->count );
+void ?{}( monitor_guard_t * this, monitor_desc ** m, int count );
+void ^?{}( monitor_guard_t * this );
+
+//-----------------------------------------------------------------------------
+// Internal scheduling
+struct condition {
+	__thread_queue_t blocked;
+	monitor_desc ** monitors;
+	unsigned short monitor_count;
+};
+
+static inline void ?{}( condition * this ) {
+	this->monitors = NULL;
+	this->monitor_count = 0;
 }
 
-static inline void ^?{}( monitor_guard_t * this ) {
-	leave( this->m, this->count );
-}
-
-
+void wait( condition * this );
+void signal( condition * this );
 #endif //MONITOR_H
Index: src/libcfa/concurrency/monitor.c
===================================================================
--- src/libcfa/concurrency/monitor.c	(revision 78d3dd5829f3bbcb4527885629edab2212e6d4b8)
+++ src/libcfa/concurrency/monitor.c	(revision 23063ea0bdf02c35bfe020a0745a20afd7b21946)
@@ -18,4 +18,5 @@
 
 #include "kernel_private.h"
+#include "libhdr.h"
 
 void set_owner( monitor_desc * this, thread_desc * owner ) {
@@ -28,7 +29,10 @@
 
 extern "C" {
-	void __enter_monitor_desc(monitor_desc * this) {
+	void __enter_monitor_desc(monitor_desc * this, monitor_desc * leader) {
 		lock( &this->lock );
 		thread_desc * thrd = this_thread();
+
+		//Update the stack owner
+		this->stack_owner = leader;
 
 		if( !this->owner ) {
@@ -52,4 +56,5 @@
 
 		unlock( &this->lock );
+		return;
 	}
 
@@ -69,9 +74,9 @@
 	//					context switch to him right away
 	//
-	void __leave_monitor_desc(monitor_desc * this) {
+	void __leave_monitor_desc(monitor_desc * this, monitor_desc * leader) {
 		lock( &this->lock );
 
 		thread_desc * thrd = this_thread();
-		assert( thrd == this->owner );
+		assert( thrd == this->owner || this->stack_owner );
 
 		//Leaving a recursion level, decrement the counter
@@ -81,4 +86,5 @@
 		//it means we don't need to do anything
 		if( this->recursion != 0) {
+			this->stack_owner = leader;
 			unlock( &this->lock );
 			return;
@@ -87,5 +93,5 @@
 		//If we don't own the signal stack then just leave it to the owner
 		if( this->stack_owner ) {
-			assert( this->owner == this->stack_owner );
+			this->stack_owner = leader;
 			unlock( &this->lock );
 			return;
@@ -102,4 +108,5 @@
 			//transfer control immediately
 			set_owner( this, new_owner );
+			this->stack_owner = leader;
 			ScheduleInternal( &this->lock, new_owner );
 			return;
@@ -111,4 +118,7 @@
 		set_owner( this, new_owner );
 
+		//Update the stack owner
+		this->stack_owner = leader;
+
 		//We can now let other threads in safely
 		unlock( &this->lock );
@@ -119,13 +129,133 @@
 }
 
-void enter(monitor_desc ** monitors, int count) {
-	for(int i = 0; i < count; i++) {
-		__enter_monitor_desc( monitors[i] );
-	}
-}
-
-void leave(monitor_desc ** monitors, int count) {
+static inline void enter(monitor_desc ** monitors, int count) {	//Enter every monitor of the array, in array order
+	__enter_monitor_desc( monitors[0], NULL );	//monitors[0] is the leader, it gets no stack owner (indexed unconditionally, assumes count >= 1)
+	for(int i = 1; i < count; i++) {
+		__enter_monitor_desc( monitors[i], monitors[0] );	//the other monitors record the leader as their stack owner
+	}
+}
+
+static inline void leave(monitor_desc ** monitors, int count) {	//Leave the monitors of the array, last one first
+	__leave_monitor_desc( monitors[0], NULL );	//NOTE(review): the loop below runs down to i == 0, so monitors[0] is passed to __leave_monitor_desc a second time - confirm this is intended
 	for(int i = count - 1; i >= 0; i--) {
-		__leave_monitor_desc( monitors[i] );
-	}
-}
+		__leave_monitor_desc( monitors[i], monitors[0] );	//monitors[0] is passed as the leader
+	}
+}
+
+void ?{}( monitor_guard_t * this, monitor_desc ** m, int count ) {
+	this->m = m;
+	this->count = count;
+	qsort(this->m, count);
+	enter( this->m, this->count );
+
+	this->prev_mntrs = this_thread()->current_monitors;
+	this->prev_count = this_thread()->current_monitor_count;
+
+	this_thread()->current_monitors      = m;
+	this_thread()->current_monitor_count = count;
+}
+
+void ^?{}( monitor_guard_t * this ) {
+	leave( this->m, this->count );
+
+	this_thread()->current_monitors      = this->prev_mntrs;
+	this_thread()->current_monitor_count = this->prev_count;
+}
+
+//-----------------------------------------------------------------------------
+// Internal scheduling
+void wait( condition * this ) {	//Give up ownership of the condition's monitors, go to sleep, then take ownership back on wake-up
+	// LIB_DEBUG_FPRINTF("Waiting\n");
+	thread_desc * this_thrd = this_thread();
+
+	if( !this->monitors ) {	//Condition not bound yet: bind it to the monitors currently held by this thread
+		this->monitors = this_thrd->current_monitors;
+		this->monitor_count = this_thrd->current_monitor_count;
+	}
+
+	unsigned short count = this->monitor_count;
+
+	//Check that everything is as expected
+	assert( this->monitors != NULL );
+	assert( this->monitor_count != 0 );
+
+	unsigned int recursions[ count ];		//Save the current recursion levels to restore them later
+	spinlock *   locks     [ count ];		//We need to pass-in an array of locks to ScheduleInternal
+
+	// LIB_DEBUG_FPRINTF("Getting ready to wait\n");
+
+	//Loop on all the monitors and release the owner
+	for( unsigned int i = 0; i < count; i++ ) {
+		monitor_desc * cur = this->monitors[i];
+
+		assert( cur );
+
+		// LIB_DEBUG_FPRINTF("cur %p lock %p\n", cur, &cur->lock);
+
+		//Store the locks for later
+		locks[i] = &cur->lock;
+
+		//Protect the monitors
+		lock( locks[i] );
+		{		
+			//Save the recursion levels
+			recursions[i] = cur->recursion;
+
+			//Release the owner
+			cur->recursion = 0;
+			cur->owner = NULL;
+		}
+		//Release the monitor
+		unlock( locks[i] );
+	}
+
+	// LIB_DEBUG_FPRINTF("Waiting now\n");
+
+	//Everything is ready to go to sleep; NOTE(review): nothing in this routine appends this_thrd to this->blocked - confirm how signal() is expected to find it
+	ScheduleInternal( locks, count );
+
+
+	//WE WOKE UP
+
+
+	//We are back, restore the owners and recursions
+	for( unsigned int i = 0; i < count; i++ ) {
+		monitor_desc * cur = this->monitors[i];
+
+		//Protect the monitors
+		lock( locks[i] );
+		{
+			//Restore the owner
+			cur->owner = this_thrd;
+			cur->recursion = recursions[i];
+		}
+		//Release the monitor
+		unlock( locks[i] );
+	}
+}
+
+static void __signal_internal( condition * this ) {	//Move one blocked thread, if any, onto the signal stack of the condition's first monitor
+	if( !this->blocked.head ) return;	//Nothing to do when no thread is queued on the condition
+
+	//Check that everything is as expected
+	assert( this->monitors );
+	assert( this->monitor_count != 0 );
+	
+	LIB_DEBUG_DO(
+		if ( this->monitors != this_thread()->current_monitors ) {
+			abortf( "Signal on condition %p made outside of the correct monitor(s)", this );
+		} // if
+	);
+
+	monitor_desc * owner = this->monitors[0];	//The first monitor of the condition receives the signalled thread
+	lock( &owner->lock );
+	{
+		thread_desc * unblock = pop_head( &this->blocked );	//Head of the condition's blocked queue
+		push( &owner->signal_stack, unblock );
+	}
+	unlock( &owner->lock );
+}
+
+void signal( condition * this ) {
+	__signal_internal( this );
+}
Index: src/libcfa/concurrency/thread.c
===================================================================
--- src/libcfa/concurrency/thread.c	(revision 78d3dd5829f3bbcb4527885629edab2212e6d4b8)
+++ src/libcfa/concurrency/thread.c	(revision 23063ea0bdf02c35bfe020a0745a20afd7b21946)
@@ -39,4 +39,7 @@
 	this->mon.recursion = 1;
 	this->next = NULL;
+
+	this->current_monitors      = NULL;
+	this->current_monitor_count = 0;
 }
 
