Index: doc/proposals/concurrency/thePlan.md
===================================================================
--- doc/proposals/concurrency/thePlan.md	(revision 5c3632f9e52d9bac69b673206f30ebb020b31f3b)
+++ doc/proposals/concurrency/thePlan.md	(revision b3d70eba692f70343957b265f1ac7c17022df555)
@@ -10,13 +10,13 @@
 done - Multi monitors calls,
 done - Monitors as a language feature (not calling enter/leave by hand)
+
+_Phase 3_ : Monitor features
 Internal scheduling
+External scheduling
 
-_Phase 3_ : Kernel features
+_Phase 4_ : Kernel features
 Preemption
 Detach thread
 Cluster migration
-
-_Phase 4_ : Monitor features
-External scheduling
 
 _Phase 5_ : Performance
Index: src/CodeGen/CodeGenerator.cc
===================================================================
--- src/CodeGen/CodeGenerator.cc	(revision 5c3632f9e52d9bac69b673206f30ebb020b31f3b)
+++ src/CodeGen/CodeGenerator.cc	(revision b3d70eba692f70343957b265f1ac7c17022df555)
@@ -89,5 +89,5 @@
 	}
 
-	CodeGenerator::CodeGenerator( std::ostream & os, bool pretty ) : indent( *this), cur_indent( 0 ), insideFunction( false ), output( os ), printLabels( *this ), pretty( pretty ) {}
+	CodeGenerator::CodeGenerator( std::ostream & os, bool pretty, bool genC ) : indent( *this), cur_indent( 0 ), insideFunction( false ), output( os ), printLabels( *this ), pretty( pretty ), genC( genC ) {}
 
 	CodeGenerator::CodeGenerator( std::ostream & os, std::string init, int indentation, bool infunp )
@@ -136,5 +136,5 @@
 		functionDecl->get_funcSpec().print( output );
 
-		output << genType( functionDecl->get_functionType(), mangleName( functionDecl ), pretty );
+		output << genType( functionDecl->get_functionType(), mangleName( functionDecl ), pretty, genC );
 
 		asmName( functionDecl );
@@ -147,5 +147,6 @@
 
 	void CodeGenerator::visit( ObjectDecl * objectDecl ) {
-		if (objectDecl->get_name().empty()) {
+		if (objectDecl->get_name().empty() && genC ) {
+			// only generate an anonymous name when generating C code, otherwise it clutters the output too much
 			static UniqueName name = { "__anonymous_object" };
 			objectDecl->set_name( name.newName() );
@@ -156,5 +157,5 @@
 
 		handleStorageClass( objectDecl );
-		output << genType( objectDecl->get_type(), mangleName( objectDecl ), pretty );
+		output << genType( objectDecl->get_type(), mangleName( objectDecl ), pretty, genC );
 
 		asmName( objectDecl );
@@ -171,12 +172,18 @@
 	}
 
-	void CodeGenerator::handleAggregate( AggregateDecl * aggDecl ) {
+	void CodeGenerator::handleAggregate( AggregateDecl * aggDecl, const std::string & kind ) {
 		genAttributes( aggDecl->get_attributes() );
 
+		if( ! aggDecl->get_parameters().empty() && ! genC ) {
+			// assertf( ! genC, "Aggregate type parameters should not reach code generation." );
+			output << "forall(";
+			genCommaList( aggDecl->get_parameters().begin(), aggDecl->get_parameters().end() );
+			output << ")" << endl;
+		}
+
+		output << kind;
 		if ( aggDecl->get_name() != "" )
 			output << aggDecl->get_name();
 
-		// std::list< Declaration * > & memb = aggDecl->get_members();
-		// if ( ! memb.empty() ) {
 		if ( aggDecl->has_body() ) {
 			std::list< Declaration * > & memb = aggDecl->get_members();
@@ -198,12 +205,10 @@
 	void CodeGenerator::visit( StructDecl * structDecl ) {
 		extension( structDecl );
-		output << "struct ";
-		handleAggregate( structDecl );
+		handleAggregate( structDecl, "struct " );
 	}
 
 	void CodeGenerator::visit( UnionDecl * unionDecl ) {
 		extension( unionDecl );
-		output << "union ";
-		handleAggregate( unionDecl );
+		handleAggregate( unionDecl, "union " );
 	}
 
@@ -242,17 +247,26 @@
 
 	void CodeGenerator::visit( TypedefDecl * typeDecl ) {
-		assert( false && "Typedefs are removed and substituted in earlier passes." );
-		//output << "typedef ";
-		//output << genType( typeDecl->get_base(), typeDecl->get_name(), pretty );
+		assertf( ! genC, "Typedefs are removed and substituted in earlier passes." );
+		output << "typedef ";
+		output << genType( typeDecl->get_base(), typeDecl->get_name(), pretty, genC ) << endl;
 	}
 
 	void CodeGenerator::visit( TypeDecl * typeDecl ) {
-		// really, we should mutate this into something that isn't a TypeDecl but that requires large-scale changes,
-		// still to be done
-		extension( typeDecl );
-		output << "extern unsigned long " << typeDecl->get_name();
-		if ( typeDecl->get_base() ) {
-			output << " = sizeof( " << genType( typeDecl->get_base(), "", pretty ) << " )";
-		} // if
+		if ( genC ) {
+			// really, we should mutate this into something that isn't a TypeDecl but that requires large-scale changes,
+			// still to be done
+			extension( typeDecl );
+			output << "extern unsigned long " << typeDecl->get_name();
+			if ( typeDecl->get_base() ) {
+				output << " = sizeof( " << genType( typeDecl->get_base(), "", pretty, genC ) << " )";
+			} // if
+		} else {
+			output << typeDecl->typeString() << " " << typeDecl->get_name();
+			if ( ! typeDecl->get_assertions().empty() ) {
+				output << " | { ";
+				genCommaList( typeDecl->get_assertions().begin(), typeDecl->get_assertions().end() );
+				output << " }";
+			}
+		}
 	}
 
@@ -293,5 +307,7 @@
 
 	void CodeGenerator::visit( ConstructorInit * init ){
-		assertf( false, "ConstructorInit nodes should not make it to CodeGen." );
+		assertf( ! genC, "ConstructorInit nodes should not reach code generation." );
+		// xxx - generate something reasonable for constructor/destructor pairs
+		output << "<ctorinit>";
 	}
 
@@ -547,5 +563,5 @@
 			// at least one result type of cast, but not an lvalue
 			output << "(";
-			output << genType( castExpr->get_result(), "", pretty );
+			output << genType( castExpr->get_result(), "", pretty, genC );
 			output << ")";
 		} else {
@@ -558,5 +574,9 @@
 
 	void CodeGenerator::visit( UntypedMemberExpr * memberExpr ) {
-		assert( false );
+		assertf( ! genC, "UntypedMemberExpr should not reach code generation." );
+		extension( memberExpr );
+		memberExpr->get_aggregate()->accept( *this );
+		output << ".";
+		memberExpr->get_member()->accept( *this );
 	}
 
@@ -587,5 +607,5 @@
 		output << "sizeof(";
 		if ( sizeofExpr->get_isType() ) {
-			output << genType( sizeofExpr->get_type(), "", pretty );
+			output << genType( sizeofExpr->get_type(), "", pretty, genC );
 		} else {
 			sizeofExpr->get_expr()->accept( *this );
@@ -599,5 +619,5 @@
 		output << "__alignof__(";
 		if ( alignofExpr->get_isType() ) {
-			output << genType( alignofExpr->get_type(), "", pretty );
+			output << genType( alignofExpr->get_type(), "", pretty, genC );
 		} else {
 			alignofExpr->get_expr()->accept( *this );
@@ -607,5 +627,9 @@
 
 	void CodeGenerator::visit( UntypedOffsetofExpr * offsetofExpr ) {
-		assert( false && "UntypedOffsetofExpr should not reach code generation." );
+		assertf( ! genC, "UntypedOffsetofExpr should not reach code generation." );
+		output << "offsetof(";
+		output << genType( offsetofExpr->get_type(), "", pretty, genC );
+		output << ", " << offsetofExpr->get_member();
+		output << ")";
 	}
 
@@ -613,5 +637,5 @@
 		// use GCC builtin
 		output << "__builtin_offsetof(";
-		output << genType( offsetofExpr->get_type(), "", pretty );
+		output << genType( offsetofExpr->get_type(), "", pretty, genC );
 		output << ", " << mangleName( offsetofExpr->get_member() );
 		output << ")";
@@ -619,5 +643,6 @@
 
 	void CodeGenerator::visit( OffsetPackExpr * offsetPackExpr ) {
-		assert( false && "OffsetPackExpr should not reach code generation." );
+		assertf( ! genC, "OffsetPackExpr should not reach code generation." );
+		output << "__CFA_offsetpack(" << genType( offsetPackExpr->get_type(), "", pretty, genC ) << ")";
 	}
 
@@ -655,9 +680,22 @@
 	}
 
-	void CodeGenerator::visit( UntypedTupleExpr * tupleExpr ) { assertf( false, "UntypedTupleExpr should not make it to Code Gen" ); }
-
-	void CodeGenerator::visit( TupleExpr * tupleExpr ) { assertf( false, "TupleExpr should not make it to Code Gen" ); }
-
-	void CodeGenerator::visit( TypeExpr * typeExpr ) {}
+	void CodeGenerator::visit( UntypedTupleExpr * tupleExpr ) {
+		assertf( ! genC, "UntypedTupleExpr should not reach code generation." );
+		output << "[";
+		genCommaList( tupleExpr->get_exprs().begin(), tupleExpr->get_exprs().end() );
+		output << "]";
+	}
+
+	void CodeGenerator::visit( TupleExpr * tupleExpr ) {
+		assertf( ! genC, "TupleExpr should not reach code generation." );
+		output << "[";
+		genCommaList( tupleExpr->get_exprs().begin(), tupleExpr->get_exprs().end() );
+		output << "]";
+	}
+
+	void CodeGenerator::visit( TypeExpr * typeExpr ) {
+		assertf( ! genC, "TypeExpr should not reach code generation." );
+		output<< genType( typeExpr->get_type(), "", pretty, genC );
+	}
 
 	void CodeGenerator::visit( AsmExpr * asmExpr ) {
@@ -675,5 +713,5 @@
 	void CodeGenerator::visit( CompoundLiteralExpr *compLitExpr ) {
 		assert( compLitExpr->get_result() && dynamic_cast< ListInit * > ( compLitExpr->get_initializer() ) );
-		output << "(" << genType( compLitExpr->get_result(), "", pretty ) << ")";
+		output << "(" << genType( compLitExpr->get_result(), "", pretty, genC ) << ")";
 		compLitExpr->get_initializer()->accept( *this );
 	}
Index: src/CodeGen/CodeGenerator.h
===================================================================
--- src/CodeGen/CodeGenerator.h	(revision 5c3632f9e52d9bac69b673206f30ebb020b31f3b)
+++ src/CodeGen/CodeGenerator.h	(revision b3d70eba692f70343957b265f1ac7c17022df555)
@@ -30,5 +30,5 @@
 		static int tabsize;
 
-		CodeGenerator( std::ostream &os, bool pretty = false );
+		CodeGenerator( std::ostream &os, bool pretty = false, bool genC = false );
 		CodeGenerator( std::ostream &os, std::string, int indent = 0, bool infun = false );
 		CodeGenerator( std::ostream &os, char *, int indent = 0, bool infun = false );
@@ -121,8 +121,9 @@
 		LabelPrinter printLabels;
 		bool pretty = false;  // pretty print
+		bool genC = false;    // true if output has to be C code
 
 		void printDesignators( std::list< Expression * > & );
 		void handleStorageClass( DeclarationWithType *decl );
-		void handleAggregate( AggregateDecl *aggDecl );
+		void handleAggregate( AggregateDecl *aggDecl, const std::string & kind );
 		void handleTypedef( NamedTypeDecl *namedType );
 		std::string mangleName( DeclarationWithType * decl );
Index: src/CodeGen/GenType.cc
===================================================================
--- src/CodeGen/GenType.cc	(revision 5c3632f9e52d9bac69b673206f30ebb020b31f3b)
+++ src/CodeGen/GenType.cc	(revision b3d70eba692f70343957b265f1ac7c17022df555)
@@ -28,5 +28,5 @@
 	class GenType : public Visitor {
 	  public:
-		GenType( const std::string &typeString, bool pretty = false );
+		GenType( const std::string &typeString, bool pretty = false, bool genC = false );
 		std::string get_typeString() const { return typeString; }
 		void set_typeString( const std::string &newValue ) { typeString = newValue; }
@@ -48,16 +48,18 @@
 	  private:
 		void handleQualifiers( Type *type );
+		std::string handleGeneric( ReferenceToType * refType );
 		void genArray( const Type::Qualifiers &qualifiers, Type *base, Expression *dimension, bool isVarLen, bool isStatic );
 
 		std::string typeString;
 		bool pretty = false; // pretty print
+		bool genC = false;   // generating C code?
 	};
 
-	std::string genType( Type *type, const std::string &baseString, bool pretty ) {
-		GenType gt( baseString, pretty );
+	std::string genType( Type *type, const std::string &baseString, bool pretty, bool genC ) {
+		GenType gt( baseString, pretty, genC );
 		std::ostringstream os;
 
 		if ( ! type->get_attributes().empty() ) {
-			CodeGenerator cg( os, pretty );
+			CodeGenerator cg( os, pretty, genC );
 			cg.genAttributes( type->get_attributes() );
 		} // if
@@ -68,8 +70,8 @@
 
   std::string genPrettyType( Type * type, const std::string & baseString ) {
-  	return genType( type, baseString, true );
+  	return genType( type, baseString, true, false );
   }
 
-	GenType::GenType( const std::string &typeString, bool pretty ) : typeString( typeString ), pretty( pretty ) {}
+	GenType::GenType( const std::string &typeString, bool pretty, bool genC ) : typeString( typeString ), pretty( pretty ), genC( genC ) {}
 
 	void GenType::visit( VoidType *voidType ) {
@@ -112,5 +114,5 @@
 		} // if
 		if ( dimension != 0 ) {
-			CodeGenerator cg( os, pretty );
+			CodeGenerator cg( os, pretty, genC );
 			dimension->accept( cg );
 		} else if ( isVarLen ) {
@@ -166,5 +168,5 @@
 			} // if
 		} else {
-			CodeGenerator cg( os, pretty );
+			CodeGenerator cg( os, pretty, genC );
 			os << "(" ;
 
@@ -184,18 +186,44 @@
 			funcType->get_returnVals().front()->get_type()->accept( *this );
 		} // if
+
+		// add forall
+		if( ! funcType->get_forall().empty() && ! genC ) {
+			// assertf( ! genC, "Aggregate type parameters should not reach code generation." );
+			std::ostringstream os;
+			CodeGenerator cg( os, pretty, genC );
+			os << "forall(";
+			cg.genCommaList( funcType->get_forall().begin(), funcType->get_forall().end() );
+			os << ")" << std::endl;
+			typeString = os.str() + typeString;
+		}
+	}
+
+	std::string GenType::handleGeneric( ReferenceToType * refType ) {
+		if ( ! refType->get_parameters().empty() ) {
+			std::ostringstream os;
+			CodeGenerator cg( os, pretty, genC );
+			os << "(";
+			cg.genCommaList( refType->get_parameters().begin(), refType->get_parameters().end() );
+			os << ") ";
+			return os.str();
+		}
+		return "";
 	}
 
 	void GenType::visit( StructInstType *structInst )  {
-		typeString = "struct " + structInst->get_name() + " " + typeString;
+		typeString = structInst->get_name() + handleGeneric( structInst ) + " " + typeString;
+		if ( genC ) typeString = "struct " + typeString;
 		handleQualifiers( structInst );
 	}
 
 	void GenType::visit( UnionInstType *unionInst ) {
-		typeString = "union " + unionInst->get_name() + " " + typeString;
+		typeString = unionInst->get_name() + handleGeneric( unionInst ) + " " + typeString;
+		if ( genC ) typeString = "union " + typeString;
 		handleQualifiers( unionInst );
 	}
 
 	void GenType::visit( EnumInstType *enumInst ) {
-		typeString = "enum " + enumInst->get_name() + " " + typeString;
+		typeString = enumInst->get_name() + " " + typeString;
+		if ( genC ) typeString = "enum " + typeString;
 		handleQualifiers( enumInst );
 	}
@@ -207,5 +235,5 @@
 
 	void GenType::visit( TupleType * tupleType ) {
-		assertf( pretty, "Tuple types should not make it to Code Gen." );
+		assertf( ! genC, "Tuple types should not reach code generation." );
 		Visitor::visit( tupleType );
 		unsigned int i = 0;
@@ -214,5 +242,5 @@
 		for ( Type * t : *tupleType ) {
 			i++;
-			os << genType( t, "", pretty ) << (i == tupleType->size() ? "" : ", ");
+			os << genType( t, "", pretty, genC ) << (i == tupleType->size() ? "" : ", ");
 		}
 		os << "]";
Index: src/CodeGen/GenType.h
===================================================================
--- src/CodeGen/GenType.h	(revision 5c3632f9e52d9bac69b673206f30ebb020b31f3b)
+++ src/CodeGen/GenType.h	(revision b3d70eba692f70343957b265f1ac7c17022df555)
@@ -21,5 +21,5 @@
 
 namespace CodeGen {
-	std::string genType( Type *type, const std::string &baseString, bool pretty = false );
+	std::string genType( Type *type, const std::string &baseString, bool pretty = false, bool genC = false );
   std::string genPrettyType( Type * type, const std::string & baseString );
 } // namespace CodeGen
Index: src/CodeGen/Generate.cc
===================================================================
--- src/CodeGen/Generate.cc	(revision 5c3632f9e52d9bac69b673206f30ebb020b31f3b)
+++ src/CodeGen/Generate.cc	(revision b3d70eba692f70343957b265f1ac7c17022df555)
@@ -27,6 +27,6 @@
 
 namespace CodeGen {
-	void generate( std::list< Declaration* > translationUnit, std::ostream &os, bool doIntrinsics, bool pretty ) {
-		CodeGen::CodeGenerator cgv( os, pretty );
+	void generate( std::list< Declaration* > translationUnit, std::ostream &os, bool doIntrinsics, bool pretty, bool generateC ) {
+		CodeGen::CodeGenerator cgv( os, pretty, generateC );
 		for ( auto & dcl : translationUnit ) {
 			if ( LinkageSpec::isGeneratable( dcl->get_linkage() ) && (doIntrinsics || ! LinkageSpec::isBuiltin( dcl->get_linkage() ) ) ) {
Index: src/CodeGen/Generate.h
===================================================================
--- src/CodeGen/Generate.h	(revision 5c3632f9e52d9bac69b673206f30ebb020b31f3b)
+++ src/CodeGen/Generate.h	(revision b3d70eba692f70343957b265f1ac7c17022df555)
@@ -23,6 +23,6 @@
 
 namespace CodeGen {
-	/// Generates code
-	void generate( std::list< Declaration* > translationUnit, std::ostream &os, bool doIntrinsics, bool pretty );
+	/// Generates code. doIntrinsics determines if intrinsic functions are printed, pretty formats output nicely (e.g., uses unmangled names, etc.), generateC is true when the output must consist only of C code (allows some assertions, etc.)
+	void generate( std::list< Declaration* > translationUnit, std::ostream &os, bool doIntrinsics, bool pretty, bool generateC = false );
 } // namespace CodeGen
 
Index: src/SynTree/Declaration.h
===================================================================
--- src/SynTree/Declaration.h	(revision 5c3632f9e52d9bac69b673206f30ebb020b31f3b)
+++ src/SynTree/Declaration.h	(revision b3d70eba692f70343957b265f1ac7c17022df555)
@@ -167,9 +167,10 @@
 	std::list< DeclarationWithType* >& get_assertions() { return assertions; }
 
+	virtual std::string typeString() const = 0;
+
 	virtual NamedTypeDecl *clone() const = 0;
 	virtual void print( std::ostream &os, int indent = 0 ) const;
 	virtual void printShort( std::ostream &os, int indent = 0 ) const;
   protected:
-	virtual std::string typeString() const = 0;
   private:
 	Type *base;
@@ -202,9 +203,10 @@
 	TypeDecl * set_sized( bool newValue ) { sized = newValue; return this; }
 
+	virtual std::string typeString() const;
+
 	virtual TypeDecl *clone() const { return new TypeDecl( *this ); }
 	virtual void accept( Visitor &v ) { v.visit( this ); }
 	virtual TypeDecl *acceptMutator( Mutator &m ) { return m.mutate( this ); }
   private:
-	virtual std::string typeString() const;
 	Kind kind;
 	bool sized;
@@ -217,9 +219,10 @@
 	TypedefDecl( const TypedefDecl &other ) : Parent( other ) {}
 
+	virtual std::string typeString() const;
+
 	virtual TypedefDecl *clone() const { return new TypedefDecl( *this ); }
 	virtual void accept( Visitor &v ) { v.visit( this ); }
 	virtual Declaration *acceptMutator( Mutator &m ) { return m.mutate( this ); }
   private:
-	virtual std::string typeString() const;
 };
 
Index: src/libcfa/concurrency/invoke.h
===================================================================
--- src/libcfa/concurrency/invoke.h	(revision 5c3632f9e52d9bac69b673206f30ebb020b31f3b)
+++ src/libcfa/concurrency/invoke.h	(revision b3d70eba692f70343957b265f1ac7c17022df555)
@@ -33,14 +33,22 @@
       };
 
-      struct simple_thread_list {
+      struct __thread_queue_t {
             struct thread_desc * head;
             struct thread_desc ** tail;
       };
 
+      struct __thread_stack_t {
+            struct thread_desc * top;
+      };
+
       #ifdef __CFORALL__
       extern "Cforall" {
-            void ?{}( struct simple_thread_list * );
-            void append( struct simple_thread_list *, struct thread_desc * );
-            struct thread_desc * pop_head( struct simple_thread_list * );
+            void ?{}( struct __thread_queue_t * );
+            void append( struct __thread_queue_t *, struct thread_desc * );
+            struct thread_desc * pop_head( struct __thread_queue_t * );
+
+            void ?{}( struct __thread_stack_t * );
+            void push( struct __thread_stack_t *, struct thread_desc * );            
+            struct thread_desc * pop( struct __thread_stack_t * );
 
             void ?{}(spinlock * this);
@@ -50,11 +58,11 @@
 
       struct coStack_t {
-            unsigned int size;                  // size of stack
-            void *storage;                      // pointer to stack
-            void *limit;                        // stack grows towards stack limit
-            void *base;                         // base of stack
-            void *context;                      // address of cfa_context_t
-            void *top;                          // address of top of storage
-            bool userStack;                     // whether or not the user allocated the stack
+            unsigned int size;                        // size of stack
+            void *storage;                            // pointer to stack
+            void *limit;                              // stack grows towards stack limit
+            void *base;                               // base of stack
+            void *context;                            // address of cfa_context_t
+            void *top;                                // address of top of storage
+            bool userStack;                           // whether or not the user allocated the stack
       };
 
@@ -62,23 +70,27 @@
 
       struct coroutine_desc {
-            struct coStack_t stack;             // stack information of the coroutine
-            const char *name;                   // textual name for coroutine/task, initialized by uC++ generated code
-            int errno_;                         // copy of global UNIX variable errno
-            enum coroutine_state state;         // current execution status for coroutine
-            struct coroutine_desc *starter;     // first coroutine to resume this one
-            struct coroutine_desc *last;	      // last coroutine to resume this one
+            struct coStack_t stack;                   // stack information of the coroutine
+            const char *name;                         // textual name for coroutine/task, initialized by uC++ generated code
+            int errno_;                               // copy of global UNIX variable errno
+            enum coroutine_state state;               // current execution status for coroutine
+            struct coroutine_desc *starter;           // first coroutine to resume this one
+            struct coroutine_desc *last;	            // last coroutine to resume this one
       };
 
       struct monitor_desc {
-            struct spinlock lock;
-            struct thread_desc * owner;
-            struct simple_thread_list entry_queue;
-            unsigned int recursion;
+            struct spinlock lock;                     // spinlock to protect internal data
+            struct thread_desc * owner;               // current owner of the monitor
+            struct __thread_queue_t entry_queue;      // queue of threads that are blocked waiting for the monitor
+            struct __thread_stack_t signal_stack;     // stack of threads to run next once we exit the monitor
+            struct monitor_desc * stack_owner;        // if bulk acquiring was used we need to synchronize signals with another monitor
+            unsigned int recursion;                   // monitor routines can be called recursively, we need to keep track of that
       };
 
       struct thread_desc {
-            struct coroutine_desc cor;          // coroutine body used to store context
-            struct monitor_desc mon;            // monitor body used for mutual exclusion
-            struct thread_desc * next;          // instrusive link field for threads
+            struct coroutine_desc cor;                // coroutine body used to store context
+            struct monitor_desc mon;                  // monitor body used for mutual exclusion
+            struct thread_desc * next;                // intrusive link field for threads
+            struct monitor_desc ** current_monitors;  // currently held monitors
+            unsigned short current_monitor_count;     // number of currently held monitors
       };
 
Index: src/libcfa/concurrency/kernel
===================================================================
--- src/libcfa/concurrency/kernel	(revision 5c3632f9e52d9bac69b673206f30ebb020b31f3b)
+++ src/libcfa/concurrency/kernel	(revision b3d70eba692f70343957b265f1ac7c17022df555)
@@ -32,7 +32,7 @@
 
 struct signal_once {
-	volatile bool condition;
+	volatile bool cond;
 	struct spinlock lock;
-	struct simple_thread_list blocked;
+	struct __thread_queue_t blocked;
 };
 
@@ -46,5 +46,5 @@
 // Cluster
 struct cluster {
-	simple_thread_list ready_queue;
+	__thread_queue_t ready_queue;
 	spinlock lock;
 };
Index: src/libcfa/concurrency/kernel.c
===================================================================
--- src/libcfa/concurrency/kernel.c	(revision 5c3632f9e52d9bac69b673206f30ebb020b31f3b)
+++ src/libcfa/concurrency/kernel.c	(revision b3d70eba692f70343957b265f1ac7c17022df555)
@@ -299,4 +299,6 @@
 // Scheduler routines
 void ScheduleThread( thread_desc * thrd ) {
+	if( !thrd ) return;
+
 	assertf( thrd->next == NULL, "Expected null got %p", thrd->next );
 	
@@ -473,5 +475,5 @@
 
 void ?{}( signal_once * this ) {
-	this->condition = false;
+	this->cond = false;
 }
 void ^?{}( signal_once * this ) {
@@ -481,5 +483,5 @@
 void wait( signal_once * this ) {
 	lock( &this->lock );
-	if( !this->condition ) {
+	if( !this->cond ) {
 		append( &this->blocked, this_thread() );
 		ScheduleInternal( &this->lock );
@@ -492,5 +494,5 @@
 	lock( &this->lock );
 	{
-		this->condition = true;
+		this->cond = true;
 
 		thread_desc * it;
@@ -504,10 +506,10 @@
 //-----------------------------------------------------------------------------
 // Queues
-void ?{}( simple_thread_list * this ) {
+void ?{}( __thread_queue_t * this ) {
 	this->head = NULL;
 	this->tail = &this->head;
 }
 
-void append( simple_thread_list * this, thread_desc * t ) {
+void append( __thread_queue_t * this, thread_desc * t ) {
 	assert(this->tail != NULL);
 	*this->tail = t;
@@ -515,5 +517,5 @@
 }
 
-thread_desc * pop_head( simple_thread_list * this ) {
+thread_desc * pop_head( __thread_queue_t * this ) {
 	thread_desc * head = this->head;
 	if( head ) {
@@ -526,4 +528,23 @@
 	return head;
 }
+
+void ?{}( __thread_stack_t * this ) {
+	this->top = NULL;
+}
+
+void push( __thread_stack_t * this, thread_desc * t ) {
+	assert(t->next == NULL);	// t must be unlinked: pop() resets next to NULL, so a non-NULL next means t is still on a list
+	t->next = this->top;		// intrusive link: the old top becomes t's successor
+	this->top = t;
+}
+
+thread_desc * pop( __thread_stack_t * this ) {
+	thread_desc * top = this->top;
+	if( top ) {
+		this->top = top->next;
+		top->next = NULL;
+	}	
+	return top;
+}
 // Local Variables: //
 // mode: c //
Index: src/libcfa/concurrency/monitor
===================================================================
--- src/libcfa/concurrency/monitor	(revision 5c3632f9e52d9bac69b673206f30ebb020b31f3b)
+++ src/libcfa/concurrency/monitor	(revision b3d70eba692f70343957b265f1ac7c17022df555)
@@ -18,4 +18,6 @@
 #define MONITOR_H
 
+#include <stddef.h>
+
 #include "assert"
 #include "invoke.h"
@@ -23,15 +25,14 @@
 
 static inline void ?{}(monitor_desc * this) {
-	this->owner = 0;
+	this->owner = NULL;
+      this->stack_owner = NULL;
 	this->recursion = 0;
 }
-
-//Array entering routine
-void enter(monitor_desc **, int count);
-void leave(monitor_desc **, int count);
 
 struct monitor_guard_t {
 	monitor_desc ** m;
 	int count;
+      monitor_desc ** prev_mntrs;
+      unsigned short  prev_count;
 };
 
@@ -40,15 +41,21 @@
 }
 
-static inline void ?{}( monitor_guard_t * this, monitor_desc ** m, int count ) {
-	this->m = m;
-	this->count = count;
-	qsort(this->m, count);
-	enter( this->m, this->count );
+void ?{}( monitor_guard_t * this, monitor_desc ** m, int count );
+void ^?{}( monitor_guard_t * this );
+
+//-----------------------------------------------------------------------------
+// Internal scheduling
+struct condition {
+	__thread_queue_t blocked;
+	monitor_desc ** monitors;
+	unsigned short monitor_count;
+};
+
+static inline void ?{}( condition * this ) {
+	this->monitors = NULL;
+	this->monitor_count = 0;
 }
 
-static inline void ^?{}( monitor_guard_t * this ) {
-	leave( this->m, this->count );
-}
-
-
+void wait( condition * this );
+void signal( condition * this );
 #endif //MONITOR_H
Index: src/libcfa/concurrency/monitor.c
===================================================================
--- src/libcfa/concurrency/monitor.c	(revision 5c3632f9e52d9bac69b673206f30ebb020b31f3b)
+++ src/libcfa/concurrency/monitor.c	(revision b3d70eba692f70343957b265f1ac7c17022df555)
@@ -18,14 +18,27 @@
 
 #include "kernel_private.h"
+#include "libhdr.h"
+
+void set_owner( monitor_desc * this, thread_desc * owner ) {
+	//Pass the monitor appropriately
+	this->owner = owner;
+
+	//We are passing the monitor to someone else, which means recursion level is not 0
+	this->recursion = owner ? 1 : 0;
+}
 
 extern "C" {
-	void __enter_monitor_desc(monitor_desc * this) {
+	void __enter_monitor_desc(monitor_desc * this, monitor_desc * leader) {
 		lock( &this->lock );
 		thread_desc * thrd = this_thread();
 
+		// //Update the stack owner
+		// this->stack_owner = leader;
+
+		LIB_DEBUG_PRINT_SAFE("Entering %p (o: %p, r: %i)\n", this, this->owner, this->recursion);
+
 		if( !this->owner ) {
 			//No one has the monitor, just take it
-			this->owner = thrd;
-			this->recursion = 1;
+			set_owner( this, thrd );
 		}
 		else if( this->owner == thrd) {
@@ -44,43 +57,210 @@
 
 		unlock( &this->lock );
-	}
-
-	void __leave_monitor_desc(monitor_desc * this) {
+		return;
+	}
+
+	// leave pseudo code :
+	// 	decrement level
+	// 	level == 0 ?
+	// 		no : done
+	// 		yes :
+	// 			signal stack empty ?
+	//				has leader :
+	//					bulk acquiring means we don't own the signal stack
+	//					ignore it but don't release the monitor
+	// 				yes :
+	// 					next in entry queue is new owner
+	// 				no :
+	// 					top of the signal stack is the owner
+	//					context switch to him right away
+	//
+	void __leave_monitor_desc(monitor_desc * this, monitor_desc * leader) {
 		lock( &this->lock );
 
+		LIB_DEBUG_PRINT_SAFE("Leaving %p (o: %p, r: %i)\n", this, this->owner, this->recursion);
+
 		thread_desc * thrd = this_thread();
-		assert( thrd == this->owner );
+		assertf( thrd == this->owner, "Expected owner to be %p, got %p (r: %i)", this->owner, thrd, this->recursion );
 
 		//Leaving a recursion level, decrement the counter
 		this->recursion -= 1;
 
-		//If we left the last level of recursion it means we are changing who owns the monitor
+		//If we haven't left the last level of recursion
+		//it means we don't need to do anything
+		if( this->recursion != 0) {
+			// this->stack_owner = leader;
+			unlock( &this->lock );
+			return;
+		}
+			
+		// //If we don't own the signal stack then just leave it to the owner
+		// if( this->stack_owner ) {
+		// 	this->stack_owner = leader;
+		// 	unlock( &this->lock );
+		// 	return;
+		// }
+
+		//We are the stack owner and have left the last recursion level.
+		//We are in charge of passing the monitor
 		thread_desc * new_owner = 0;
-		if( this->recursion == 0) {
-			//Get the next thread in the list
-			new_owner = this->owner = pop_head( &this->entry_queue );
-
-			//We are passing the monitor to someone else, which means recursion level is not 0
-			this->recursion = new_owner ? 1 : 0;
-		}	
-
+
+		//Check the signaller stack
+		new_owner = pop( &this->signal_stack );
+		if( new_owner ) {
+			//The signaller stack is not empty,
+			//transfer control immediately
+			set_owner( this, new_owner );
+			// this->stack_owner = leader;
+			ScheduleInternal( &this->lock, new_owner );
+			return;
+		}
+		
+		// No signaller thread
+		// Get the next thread in the entry_queue
+		new_owner = pop_head( &this->entry_queue );
+		set_owner( this, new_owner );
+
+		// //Update the stack owner
+		// this->stack_owner = leader;
+
+		//We can now let other threads in safely
 		unlock( &this->lock );
 
-		//If we have a new owner, we need to wake-up the thread
-		if( new_owner ) {
-			ScheduleThread( new_owner );
-		}
-	}
-}
-
-void enter(monitor_desc ** monitors, int count) {
-	for(int i = 0; i < count; i++) {
-		__enter_monitor_desc( monitors[i] );
-	}
-}
-
-void leave(monitor_desc ** monitors, int count) {
-	for(int i = count - 1; i >= 0; i--) {
-		__leave_monitor_desc( monitors[i] );
-	}
-}
+		//We need to wake-up the thread
+		ScheduleThread( new_owner );
+	}
+}
+
+static inline void enter(monitor_desc ** monitors, int count) {
+	__enter_monitor_desc( monitors[0], NULL );
+	for(int i = 1; i < count; i++) {
+		__enter_monitor_desc( monitors[i], monitors[0] );
+	}
+}
+
+static inline void leave(monitor_desc ** monitors, int count) {
+	__leave_monitor_desc( monitors[0], NULL );
+	for(int i = count - 1; i >= 1; i--) {
+		__leave_monitor_desc( monitors[i], monitors[0] );
+	}
+}
+
+void ?{}( monitor_guard_t * this, monitor_desc ** m, int count ) {
+	this->m = m;
+	this->count = count;
+	qsort(this->m, count);
+	enter( this->m, this->count );
+
+	this->prev_mntrs = this_thread()->current_monitors;
+	this->prev_count = this_thread()->current_monitor_count;
+
+	this_thread()->current_monitors      = m;
+	this_thread()->current_monitor_count = count;
+}
+
+void ^?{}( monitor_guard_t * this ) {
+	leave( this->m, this->count );
+
+	this_thread()->current_monitors      = this->prev_mntrs;
+	this_thread()->current_monitor_count = this->prev_count;
+}
+
+//-----------------------------------------------------------------------------
+// Internal scheduling
+void wait( condition * this ) {
+	assertf(false, "NO SUPPORTED");
+	// LIB_DEBUG_FPRINTF("Waiting\n");
+	thread_desc * this_thrd = this_thread();
+
+	if( !this->monitors ) {
+		this->monitors = this_thrd->current_monitors;
+		this->monitor_count = this_thrd->current_monitor_count;
+	}
+
+	unsigned short count = this->monitor_count;
+
+	//Check that everything is as expected
+	assert( this->monitors != NULL );
+	assert( this->monitor_count != 0 );
+
+	unsigned int recursions[ count ];		//Save the current recursion levels to restore them later
+	spinlock *   locks     [ count ];		//We need to pass-in an array of locks to ScheduleInternal
+
+	// LIB_DEBUG_FPRINTF("Getting ready to wait\n");
+
+	//Loop on all the monitors and release the owner
+	for( unsigned int i = 0; i < count; i++ ) {
+		monitor_desc * cur = this->monitors[i];
+
+		assert( cur );
+
+		// LIB_DEBUG_FPRINTF("cur %p lock %p\n", cur, &cur->lock);
+
+		//Store the locks for later
+		locks[i] = &cur->lock;
+
+		//Protect the monitors
+		lock( locks[i] );
+		{		
+			//Save the recursion levels
+			recursions[i] = cur->recursion;
+
+			//Release the owner
+			cur->recursion = 0;
+			cur->owner = NULL;
+		}
+		//Release the monitor
+		unlock( locks[i] );
+	}
+
+	// LIB_DEBUG_FPRINTF("Waiting now\n");
+
+	//Everything is ready to go to sleep
+	ScheduleInternal( locks, count );
+
+
+	//WE WOKE UP
+
+
+	//We are back, restore the owners and recursions
+	for( unsigned int i = 0; i < count; i++ ) {
+		monitor_desc * cur = this->monitors[i];
+
+		//Protect the monitors
+		lock( locks[i] );
+		{
+			//Restore the owner and the saved recursion level
+			cur->owner = this_thrd;
+			cur->recursion = recursions[i];
+		}
+		//Release the monitor
+		unlock( locks[i] );
+	}
+}
+
+static void __signal_internal( condition * this ) {
+	assertf(false, "NO SUPPORTED");
+	if( !this->blocked.head ) return;
+
+	//Check that everything is as expected
+	assert( this->monitors );
+	assert( this->monitor_count != 0 );
+	
+	LIB_DEBUG_DO(
+		if ( this->monitors != this_thread()->current_monitors ) {
+			abortf( "Signal on condition %p made outside of the correct monitor(s)", this );
+		} // if
+	);
+
+	monitor_desc * owner = this->monitors[0];
+	lock( &owner->lock );
+	{
+		thread_desc * unblock = pop_head( &this->blocked );
+		push( &owner->signal_stack, unblock );
+	}
+	unlock( &owner->lock );
+}
+
+void signal( condition * this ) {
+	__signal_internal( this );
+}
Index: src/libcfa/concurrency/thread.c
===================================================================
--- src/libcfa/concurrency/thread.c	(revision 5c3632f9e52d9bac69b673206f30ebb020b31f3b)
+++ src/libcfa/concurrency/thread.c	(revision b3d70eba692f70343957b265f1ac7c17022df555)
@@ -39,4 +39,7 @@
 	this->mon.recursion = 1;
 	this->next = NULL;
+
+	this->current_monitors      = NULL;
+	this->current_monitor_count = 0;
 }
 
Index: src/main.cc
===================================================================
--- src/main.cc	(revision 5c3632f9e52d9bac69b673206f30ebb020b31f3b)
+++ src/main.cc	(revision b3d70eba692f70343957b265f1ac7c17022df555)
@@ -304,15 +304,9 @@
 		GenPoly::box( translationUnit );
 
-		// print tree right before code generation
-		if ( codegenp ) {
-			dump( translationUnit );
-			return 0;
-		} // if
-
 		if ( optind < argc ) {							// any commands after the flags and input file ? => output file name
 			output = new ofstream( argv[ optind ] );
 		} // if
 
-		CodeGen::generate( translationUnit, *output, ! noprotop, prettycodegenp );
+		CodeGen::generate( translationUnit, *output, ! noprotop, prettycodegenp, true );
 
 		CodeGen::FixMain::fix( *output, treep ? "../prelude/bootloader.c" : CFA_LIBDIR "/bootloader.c" );
@@ -393,5 +387,5 @@
 			break;
 		  case CtorInitFix:
-		  case 'c':
+		  case 'c':										// print after constructors and destructors are replaced
 			ctorinitp = true;
 			break;
@@ -450,10 +444,11 @@
 			validp = true;
 			break;
-		  case 'y':
+		  case 'y':										// dump AST on error
 			errorp = true;
 			break;
-		  case 'z':
+		  case 'z':										// dump as codegen rather than AST
 			codegenp = true;
-			case 'Z':
+			break;
+			case 'Z':									// prettyprint during codegen (i.e. print unmangled names, etc.)
 			prettycodegenp = true;
 			break;
@@ -501,5 +496,10 @@
 	} // if
 
-	printAll( decls, out );
+	// depending on commandline options, either generate code or dump the AST
+	if ( codegenp ) {
+		CodeGen::generate( decls, out, ! noprotop, prettycodegenp );
+	} else {
+		printAll( decls, out );
+	}
 	deleteAll( translationUnit );
 } // dump
Index: src/tests/.expect/memberCtors-ERR1.txt
===================================================================
--- src/tests/.expect/memberCtors-ERR1.txt	(revision 5c3632f9e52d9bac69b673206f30ebb020b31f3b)
+++ src/tests/.expect/memberCtors-ERR1.txt	(revision b3d70eba692f70343957b265f1ac7c17022df555)
@@ -1,2 +1,2 @@
-memberCtors.c:62 error: in void ?{}(struct B *b), field a2 used before being constructed
+memberCtors.c:62 error: in void ?{}(B *b), field a2 used before being constructed
 make: *** [memberCtors-ERR1] Error 1
Index: src/tests/sched_internal.c
===================================================================
--- src/tests/sched_internal.c	(revision b3d70eba692f70343957b265f1ac7c17022df555)
+++ src/tests/sched_internal.c	(revision b3d70eba692f70343957b265f1ac7c17022df555)
@@ -0,0 +1,56 @@
+#include <kernel>
+#include <monitor>
+#include <thread>
+
+monitor global_t {
+	int value;
+};
+
+global_t global;
+
+condition cond;
+
+thread Signalee {};
+thread Signaler {};
+
+void step1( global_t * mutex this ) {
+	this->value = 1;
+	wait( &cond );
+}
+
+void step2( global_t * mutex this ) {
+	if( this->value != 1) abort();
+
+	this->value = 2;
+	signal( &cond );
+}
+
+void step3( global_t * mutex this ) {
+	if( this->value != 2) abort();		// step2 must already have stored 2
+
+	this->value = 3;			// final value; main() aborts unless global.value == 3
+	signal( &cond );
+}
+
+void main( Signalee* this ) {
+	step1( &global );
+	step3( &global );
+}
+
+void main( Signaler* this ) {
+	for(int i = 0; i < 10_000; i++) {
+		asm volatile ("" : : : "memory");
+	}
+
+	step2( &global );
+}
+
+int main(int argc, char* argv[]) {
+	assert( global.__mon.entry_queue.tail != NULL );
+	processor p;
+	{
+		Signalee a;
+		Signaler b;
+	}
+	if( global.value != 3) abort();
+}
