Index: src/libcfa/Makefile.am
===================================================================
--- src/libcfa/Makefile.am	(revision 3351cc0e1e27b4c8378091ffd05c8b3a1eaf0c9e)
+++ src/libcfa/Makefile.am	(revision ea7d2b051267e571f113e8dabae0d886eda94432)
@@ -95,5 +95,16 @@
 
 cfa_includedir = $(CFA_INCDIR)
-nobase_cfa_include_HEADERS = ${headers} ${stdhdr} math gmp concurrency/invoke.h
+nobase_cfa_include_HEADERS = 	\
+	${headers} 			\
+	${stdhdr} 			\
+	math 				\
+	gmp 				\
+	bits/defs.h 		\
+	bits/locks.h 		\
+	concurrency/invoke.h 	\
+	libhdr.h 			\
+	libhdr/libalign.h 	\
+	libhdr/libdebug.h 	\
+	libhdr/libtools.h
 
 CLEANFILES = libcfa-prelude.c
Index: src/libcfa/Makefile.in
===================================================================
--- src/libcfa/Makefile.in	(revision 3351cc0e1e27b4c8378091ffd05c8b3a1eaf0c9e)
+++ src/libcfa/Makefile.in	(revision ea7d2b051267e571f113e8dabae0d886eda94432)
@@ -264,5 +264,7 @@
 	containers/result containers/vector concurrency/coroutine \
 	concurrency/thread concurrency/kernel concurrency/monitor \
-	${shell echo stdhdr/*} math gmp concurrency/invoke.h
+	${shell echo stdhdr/*} math gmp bits/defs.h bits/locks.h \
+	concurrency/invoke.h libhdr.h libhdr/libalign.h \
+	libhdr/libdebug.h libhdr/libtools.h
 HEADERS = $(nobase_cfa_include_HEADERS)
 am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
@@ -430,5 +432,17 @@
 stdhdr = ${shell echo stdhdr/*}
 cfa_includedir = $(CFA_INCDIR)
-nobase_cfa_include_HEADERS = ${headers} ${stdhdr} math gmp concurrency/invoke.h
+nobase_cfa_include_HEADERS = \
+	${headers} 			\
+	${stdhdr} 			\
+	math 				\
+	gmp 				\
+	bits/defs.h 		\
+	bits/locks.h 		\
+	concurrency/invoke.h 	\
+	libhdr.h 			\
+	libhdr/libalign.h 	\
+	libhdr/libdebug.h 	\
+	libhdr/libtools.h
+
 CLEANFILES = libcfa-prelude.c
 all: all-am
Index: src/libcfa/bits/containers.h
===================================================================
--- src/libcfa/bits/containers.h	(revision ea7d2b051267e571f113e8dabae0d886eda94432)
+++ src/libcfa/bits/containers.h	(revision ea7d2b051267e571f113e8dabae0d886eda94432)
@@ -0,0 +1,132 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// bits/containers.h -- Intrusive generic containers.
+//
+// Author           : Thierry Delisle
+// Created On       : Tue Oct 31 16:38:50 2017
+// Last Modified By : --
+// Last Modified On : --
+// Update Count     : 0
+
+#pragma once
+
+#include <stddef.h>
+
+#include "libhdr.h"
+
+//-----------------------------------------------------------------------------
+// Node Base
+//-----------------------------------------------------------------------------
+
+#ifdef __CFORALL__
+	trait is_node(dtype T) {		// requirement placed on element types by the containers below
+		T*& get_next( T& );		// returns a reference to the node's own next-pointer (the intrusive link)
+	};
+#endif
+
+//-----------------------------------------------------------------------------
+// Stack
+//-----------------------------------------------------------------------------
+#ifdef __CFORALL__	// intrusive LIFO stack of is_node elements
+	forall(dtype TYPE | is_node(TYPE))
+	#define T TYPE			// under Cforall the element type is the polymorphic parameter
+#else
+	#define T void			// plain C sees an untyped pointer
+#endif	// NOTE: T is not #undef'd in this header, so the macro stays visible to the code that follows
+struct __stack {
+	T * top;			// most recently pushed node; the constructor sets it to NULL
+};
+
+#ifdef __CFORALL__	// __stack_t(T): spelling of the stack type usable from both Cforall and C
+#define __stack_t(T) __stack(T)		// Cforall: instantiate the generic stack with T
+#else
+#define __stack_t(T) struct __stack	// C: the argument is ignored, the struct holds a void pointer
+#endif
+
+#ifdef __CFORALL__
+	forall(dtype T | is_node(T))	// constructor: start with an empty stack
+	void ?{}( __stack(T) & this ) {
+		this.top = NULL;
+	}
+
+	forall(dtype T | is_node(T) | sized(T))	// push: link val in as the new top of the stack
+	void push( __stack(T) & this, T * val ) {
+		verify( !get_next( *val ) );	// val must not already be linked (its next-pointer must be null)
+		get_next( *val ) = this.top;	// the old top becomes val's successor
+		this.top = val;
+	}
+
+	forall(dtype T | is_node(T) | sized(T))	// pop: detach and return the top node, or NULL when the stack is empty
+	T * pop( __stack(T) & this ) {
+		T * top = this.top;
+		if( top ) {
+			this.top = get_next( *top );	// the successor becomes the new top
+			get_next( *top ) = NULL;	// clear the link; push() verifies a null link on entry
+		}
+		return top;
+	}
+#endif
+
+//-----------------------------------------------------------------------------
+// Queue
+//-----------------------------------------------------------------------------
+#ifdef __CFORALL__	// intrusive FIFO queue of is_node elements
+	forall(dtype TYPE | is_node(TYPE))	// parameter is spelled TYPE so it matches the T -> TYPE mapping on the next line
+	#define T TYPE			// under Cforall the element type is the polymorphic parameter
+#else
+	#define T void			// plain C sees untyped pointers
+#endif
+struct __queue {
+	T * head;			// first node; NULL when the queue is empty
+	T ** tail;			// address of the link that the next append will fill
+};
+
+#ifdef __CFORALL__
+	forall(dtype T | is_node(T))	// constructor: start with an empty queue
+	void ?{}( __queue(T) & this ) {
+		this.head = NULL;
+		this.tail = &this.head;	// with no nodes, the next append writes straight into head
+	}
+
+	forall(dtype T | is_node(T) | sized(T))	// append: add val at the back of the queue
+	void append( __queue(T) & this, T * val ) {
+		verify(this.tail != NULL);
+		*this.tail = val;		// fills head when the queue is empty, otherwise the last node's next link
+		this.tail = &get_next( *val );	// val's next link is the new insertion point
+	}
+
+	forall(dtype T | is_node(T) | sized(T))	// pop_head: detach and return the first node, or NULL when the queue is empty
+	T * pop_head( __queue(T) & this ) {
+		T * head = this.head;
+		if( head ) {
+			this.head = get_next( *head );
+			if( !get_next( *head ) ) {	// the removed node had no successor, i.e. it was the only node
+				this.tail = &this.head;	// tail pointed into that node; point it back at head
+			}
+			get_next( *head ) = NULL;	// hand back an unlinked node
+		}
+		return head;
+	}
+
+	forall(dtype T | is_node(T) | sized(T))	// remove: unlink and return the node referenced through *it
+	T * remove( __queue(T) & this, T ** it ) {	// `it` is presumably &head or the address of a predecessor's next link — confirm with callers
+		T * val = *it;
+		verify( val );
+
+		(*it) = get_next( *val );	// bypass val in the link that referenced it
+
+		if( this.tail == &get_next( *val ) ) {	// val was the last node
+			this.tail = it;			// the link that referenced val is now the insertion point
+		}
+
+		get_next( *val ) = NULL;	// hand back an unlinked node
+
+		verify( (this.head == NULL) == (&this.head == this.tail) );	// the queue is empty exactly when tail points at head
+		verify( *this.tail == NULL );	// the link tail points at must be null
+		return val;
+	}
+#endif
Index: src/libcfa/bits/defs.h
===================================================================
--- src/libcfa/bits/defs.h	(revision ea7d2b051267e571f113e8dabae0d886eda94432)
+++ src/libcfa/bits/defs.h	(revision ea7d2b051267e571f113e8dabae0d886eda94432)
@@ -0,0 +1,23 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// bits/defs.h -- Common macros and standard type headers shared by the libcfa headers.
+//
+// Author           : Thierry Delisle
+// Created On       : Thu Nov 09 13:24:10 2017
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#define unlikely(x)    __builtin_expect(!!(x), 0)	// branch hint: x is expected to be false; evaluates to !!(x)
+#define likely(x)      __builtin_expect(!!(x), 1)	// branch hint: x is expected to be true; no space before '(' so this stays a function-like macro
+#define thread_local _Thread_local			// short alias for the C11 _Thread_local storage class
Index: src/libcfa/bits/locks.h
===================================================================
--- src/libcfa/bits/locks.h	(revision ea7d2b051267e571f113e8dabae0d886eda94432)
+++ src/libcfa/bits/locks.h	(revision ea7d2b051267e571f113e8dabae0d886eda94432)
@@ -0,0 +1,121 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// bits/locks.h -- Fast internal locks.
+//
+// Author           : Thierry Delisle
+// Created On       : Tue Oct 31 15:14:38 2017
+// Last Modified By : --
+// Last Modified On : --
+// Update Count     : 0
+//
+
+#pragma once
+
+#include "bits/defs.h"
+
+#include "libhdr.h"
+
+// Pause(): short delay issued inside spin loops to prevent excess processor bus usage
+#if defined( __sparc )
+	#define Pause() __asm__ __volatile__ ( "rd %ccr,%g0" )	// SPARC: read the condition-code register into %g0
+#elif defined( __i386 ) || defined( __x86_64 )
+	#define Pause() __asm__ __volatile__ ( "pause" : : : )	// x86: the PAUSE spin-wait hint instruction
+#else
+	#error unsupported architecture
+#endif
+
+#if defined( __i386 ) || defined( __x86_64 )	// __ALIGN__: alignment attribute applied to __spinlock_t and its lock word
+	// 128-byte alignment, following Intel's recommendation
+	#define __ALIGN__ __attribute__(( aligned (128) ))
+#elif defined( __sparc )
+	#define __ALIGN__ CALIGN	// CALIGN is not defined in this header — presumably supplied by libhdr.h; confirm
+#else
+	#error unsupported architecture
+#endif
+
+#if defined( __x86_64 )	// lock-word primitives; the builtin suffix (_8 / _4) matches the size of uintptr_t on each target
+	#define __lock_test_and_test_and_set( lock ) ( (lock) == 0 && __sync_lock_test_and_set_8( &(lock), 1 ) == 0 )	// plain read first, atomic swap only if it looked free; true when acquired
+	#define __lock_release( lock ) __sync_lock_release_8( &(lock) )	// no trailing ';' so the macro is a single expression; the caller supplies it
+#elif defined( __i386 )
+	#define __lock_test_and_test_and_set( lock ) ( (lock) == 0 && __sync_lock_test_and_set_4( &(lock), 1 ) == 0 )	// plain read first, atomic swap only if it looked free; true when acquired
+	#define __lock_release( lock ) __sync_lock_release_4( &(lock) )	// no trailing ';' so the macro is a single expression; the caller supplies it
+#else	// NOTE: __sparc is accepted by Pause()/__ALIGN__ in this header but has no lock primitives here
+	#error unsupported architecture
+#endif
+
+struct __spinlock_t {	// spinlock word plus, in debug builds, a record of the last acquirer
+	__ALIGN__ volatile uintptr_t lock;	// constructed as 0; the acquire primitive stores 1
+	#ifdef __CFA_DEBUG__
+		const char * prev_name;		// debug only: `caller` value stored by the acquire functions
+		void* prev_thrd;		// debug only: this_thread value stored by the acquire functions
+	#endif
+} __ALIGN__;
+
+#ifdef __CFORALL__
+	extern void yield( unsigned int );	// defined elsewhere; called by lock_yield between acquisition attempts
+	extern thread_local struct thread_desc *    volatile this_thread;	// defined elsewhere; stored into prev_thrd by the debug bookkeeping
+
+	static inline void ?{}( __spinlock_t & this ) {	// constructor: the lock starts out released
+		this.lock = 0;
+	}
+
+	// Try to take the spinlock without waiting; returns true if acquired, false if it was already held
+	static inline _Bool try_lock  ( __spinlock_t & this DEBUG_CTX_PARAM2 ) {
+		_Bool result = __lock_test_and_test_and_set( this.lock );
+		LIB_DEBUG_DO(		// `caller` is presumably introduced by DEBUG_CTX_PARAM2 — confirm in libhdr
+			if( result ) {	// only a successful acquire updates the holder record
+				this.prev_name = caller;
+				this.prev_thrd = this_thread;
+			}
+		)
+		return result;
+	}
+
+	// Lock the spinlock; if already acquired, busy-wait with a growing delay (a single Pause() per attempt when NOEXPBACK is defined)
+	static inline void lock( __spinlock_t & this DEBUG_CTX_PARAM2 ) {
+		#ifndef NOEXPBACK
+			enum { SPIN_START = 4, SPIN_END = 64 * 1024, };	// bounds on the number of Pause() calls after a failed attempt
+			unsigned int spin = SPIN_START;
+		#endif
+
+		for ( unsigned int i = 1;; i += 1 ) {	// i counts acquisition attempts
+			if ( __lock_test_and_test_and_set( this.lock ) ) break;
+			#ifndef NOEXPBACK
+				// exponential spin
+				for ( volatile unsigned int s = 0; s < spin; s += 1 ) Pause();
+
+				// slowly increase by powers of 2
+				if ( i % 64 == 0 ) spin += spin;	// double the delay on every 64th attempt
+
+				// prevent overflow
+				if ( spin > SPIN_END ) spin = SPIN_START;	// past the cap, restart from the shortest delay
+			#else
+				Pause();
+			#endif
+		}
+		LIB_DEBUG_DO(		// record the new holder; `caller` is presumably introduced by DEBUG_CTX_PARAM2 — confirm in libhdr
+			this.prev_name = caller;
+			this.prev_thrd = this_thread;
+		)
+	}
+
+	// Lock the spinlock, yield repeatedly if already acquired
+	static inline void lock_yield( __spinlock_t & this DEBUG_CTX_PARAM2 ) {
+		for ( unsigned int i = 1;; i += 1 ) {	// i counts acquisition attempts
+			if ( __lock_test_and_test_and_set( this.lock ) ) break;
+			yield( i );	// the attempt count is handed to the external yield( unsigned int ); its meaning is defined there
+		}
+		LIB_DEBUG_DO(		// record the new holder; `caller` is presumably introduced by DEBUG_CTX_PARAM2 — confirm in libhdr
+			this.prev_name = caller;
+			this.prev_thrd = this_thread;
+		)
+	}
+
+	static inline void unlock( __spinlock_t & this ) {	// release the spinlock; the debug holder record is left as-is
+		__lock_release( this.lock );
+	}
+#endif
Index: src/libcfa/concurrency/alarm.c
===================================================================
--- src/libcfa/concurrency/alarm.c	(revision 3351cc0e1e27b4c8378091ffd05c8b3a1eaf0c9e)
+++ src/libcfa/concurrency/alarm.c	(revision ea7d2b051267e571f113e8dabae0d886eda94432)
@@ -186,5 +186,5 @@
 
 	disable_interrupts();
-	lock( &event_kernel->lock DEBUG_CTX2 );
+	lock( event_kernel->lock DEBUG_CTX2 );
 	{
 		verify( validate( alarms ) );
@@ -196,5 +196,5 @@
 		}
 	}
-	unlock( &event_kernel->lock );
+	unlock( event_kernel->lock );
 	this->set = true;
 	enable_interrupts( DEBUG_CTX );
@@ -203,10 +203,10 @@
 void unregister_self( alarm_node_t * this ) {
 	disable_interrupts();
-	lock( &event_kernel->lock DEBUG_CTX2 );
+	lock( event_kernel->lock DEBUG_CTX2 );
 	{
 		verify( validate( &event_kernel->alarms ) );
 		remove( &event_kernel->alarms, this );
 	}
-	unlock( &event_kernel->lock );
+	unlock( event_kernel->lock );
 	enable_interrupts( DEBUG_CTX );
 	this->set = false;
Index: src/libcfa/concurrency/invoke.h
===================================================================
--- src/libcfa/concurrency/invoke.h	(revision 3351cc0e1e27b4c8378091ffd05c8b3a1eaf0c9e)
+++ src/libcfa/concurrency/invoke.h	(revision ea7d2b051267e571f113e8dabae0d886eda94432)
@@ -14,6 +14,6 @@
 //
 
-#include <stdbool.h>
-#include <stdint.h>
+#include "bits/defs.h"
+#include "bits/locks.h"
 
 #ifdef __CFORALL__
@@ -25,17 +25,6 @@
 #define _INVOKE_H_
 
-	#define unlikely(x)    __builtin_expect(!!(x), 0)
-	#define thread_local _Thread_local
-
 	typedef void (*fptr_t)();
 	typedef int_fast16_t __lock_size_t;
-
-	struct spinlock {
-		volatile int lock;
-		#ifdef __CFA_DEBUG__
-			const char * prev_name;
-			void* prev_thrd;
-		#endif
-	};
 
 	struct __thread_queue_t {
@@ -58,7 +47,4 @@
 		void push( struct __condition_stack_t &, struct __condition_criterion_t * );
 		struct __condition_criterion_t * pop( struct __condition_stack_t & );
-
-		void  ?{}(spinlock & this);
-		void ^?{}(spinlock & this);
 	}
 	#endif
@@ -122,5 +108,5 @@
 	struct monitor_desc {
 		// spinlock to protect internal data
-		struct spinlock lock;
+		struct __spinlock_t lock;
 
 		// current owner of the monitor
Index: src/libcfa/concurrency/kernel
===================================================================
--- src/libcfa/concurrency/kernel	(revision 3351cc0e1e27b4c8378091ffd05c8b3a1eaf0c9e)
+++ src/libcfa/concurrency/kernel	(revision ea7d2b051267e571f113e8dabae0d886eda94432)
@@ -26,18 +26,18 @@
 //-----------------------------------------------------------------------------
 // Locks
-// Lock the spinlock, spin if already acquired
-void lock      ( spinlock * DEBUG_CTX_PARAM2 );
+// // Lock the spinlock, spin if already acquired
+// void lock      ( spinlock * DEBUG_CTX_PARAM2 );
 
-// Lock the spinlock, yield repeatedly if already acquired
-void lock_yield( spinlock * DEBUG_CTX_PARAM2 );
+// // Lock the spinlock, yield repeatedly if already acquired
+// void lock_yield( spinlock * DEBUG_CTX_PARAM2 );
 
-// Lock the spinlock, return false if already acquired
-bool try_lock  ( spinlock * DEBUG_CTX_PARAM2 );
+// // Lock the spinlock, return false if already acquired
+// bool try_lock  ( spinlock * DEBUG_CTX_PARAM2 );
 
-// Unlock the spinlock
-void unlock    ( spinlock * );
+// // Unlock the spinlock
+// void unlock    ( spinlock * );
 
 struct semaphore {
-	spinlock lock;
+	__spinlock_t lock;
 	int count;
 	__thread_queue_t waiting;
@@ -54,5 +54,5 @@
 struct cluster {
 	// Ready queue locks
-	spinlock ready_queue_lock;
+	__spinlock_t ready_queue_lock;
 
 	// Ready queue for threads
@@ -74,6 +74,6 @@
 	FinishOpCode action_code;
 	thread_desc * thrd;
-	spinlock * lock;
-	spinlock ** locks;
+	__spinlock_t * lock;
+	__spinlock_t ** locks;
 	unsigned short lock_count;
 	thread_desc ** thrds;
Index: src/libcfa/concurrency/kernel.c
===================================================================
--- src/libcfa/concurrency/kernel.c	(revision 3351cc0e1e27b4c8378091ffd05c8b3a1eaf0c9e)
+++ src/libcfa/concurrency/kernel.c	(revision ea7d2b051267e571f113e8dabae0d886eda94432)
@@ -242,5 +242,5 @@
 void finishRunning(processor * this) {
 	if( this->finish.action_code == Release ) {
-		unlock( this->finish.lock );
+		unlock( *this->finish.lock );
 	}
 	else if( this->finish.action_code == Schedule ) {
@@ -248,15 +248,15 @@
 	}
 	else if( this->finish.action_code == Release_Schedule ) {
-		unlock( this->finish.lock );
+		unlock( *this->finish.lock );
 		ScheduleThread( this->finish.thrd );
 	}
 	else if( this->finish.action_code == Release_Multi ) {
 		for(int i = 0; i < this->finish.lock_count; i++) {
-			unlock( this->finish.locks[i] );
+			unlock( *this->finish.locks[i] );
 		}
 	}
 	else if( this->finish.action_code == Release_Multi_Schedule ) {
 		for(int i = 0; i < this->finish.lock_count; i++) {
-			unlock( this->finish.locks[i] );
+			unlock( *this->finish.locks[i] );
 		}
 		for(int i = 0; i < this->finish.thrd_count; i++) {
@@ -334,7 +334,7 @@
 	verifyf( thrd->next == NULL, "Expected null got %p", thrd->next );
 
-	lock(   &this_processor->cltr->ready_queue_lock DEBUG_CTX2 );
+	lock(   this_processor->cltr->ready_queue_lock DEBUG_CTX2 );
 	append( this_processor->cltr->ready_queue, thrd );
-	unlock( &this_processor->cltr->ready_queue_lock );
+	unlock( this_processor->cltr->ready_queue_lock );
 
 	verify( disable_preempt_count > 0 );
@@ -343,7 +343,7 @@
 thread_desc * nextThread(cluster * this) {
 	verify( disable_preempt_count > 0 );
-	lock( &this->ready_queue_lock DEBUG_CTX2 );
+	lock( this->ready_queue_lock DEBUG_CTX2 );
 	thread_desc * head = pop_head( this->ready_queue );
-	unlock( &this->ready_queue_lock );
+	unlock( this->ready_queue_lock );
 	verify( disable_preempt_count > 0 );
 	return head;
@@ -358,5 +358,5 @@
 }
 
-void BlockInternal( spinlock * lock ) {
+void BlockInternal( __spinlock_t * lock ) {
 	disable_interrupts();
 	this_processor->finish.action_code = Release;
@@ -384,5 +384,5 @@
 }
 
-void BlockInternal( spinlock * lock, thread_desc * thrd ) {
+void BlockInternal( __spinlock_t * lock, thread_desc * thrd ) {
 	assert(thrd);
 	disable_interrupts();
@@ -398,5 +398,5 @@
 }
 
-void BlockInternal(spinlock * locks [], unsigned short count) {
+void BlockInternal(__spinlock_t * locks [], unsigned short count) {
 	disable_interrupts();
 	this_processor->finish.action_code = Release_Multi;
@@ -411,5 +411,5 @@
 }
 
-void BlockInternal(spinlock * locks [], unsigned short lock_count, thread_desc * thrds [], unsigned short thrd_count) {
+void BlockInternal(__spinlock_t * locks [], unsigned short lock_count, thread_desc * thrds [], unsigned short thrd_count) {
 	disable_interrupts();
 	this_processor->finish.action_code = Release_Multi_Schedule;
@@ -426,5 +426,5 @@
 }
 
-void LeaveThread(spinlock * lock, thread_desc * thrd) {
+void LeaveThread(__spinlock_t * lock, thread_desc * thrd) {
 	verify( disable_preempt_count > 0 );
 	this_processor->finish.action_code = thrd ? Release_Schedule : Release;
@@ -516,6 +516,6 @@
 }
 
-static spinlock kernel_abort_lock;
-static spinlock kernel_debug_lock;
+static __spinlock_t kernel_abort_lock;
+static __spinlock_t kernel_debug_lock;
 static bool kernel_abort_called = false;
 
@@ -523,13 +523,13 @@
 	// abort cannot be recursively entered by the same or different processors because all signal handlers return when
 	// the globalAbort flag is true.
-	lock( &kernel_abort_lock DEBUG_CTX2 );
+	lock( kernel_abort_lock DEBUG_CTX2 );
 
 	// first task to abort ?
 	if ( !kernel_abort_called ) {			// not first task to abort ?
 		kernel_abort_called = true;
-		unlock( &kernel_abort_lock );
+		unlock( kernel_abort_lock );
 	}
 	else {
-		unlock( &kernel_abort_lock );
+		unlock( kernel_abort_lock );
 
 		sigset_t mask;
@@ -561,9 +561,9 @@
 extern "C" {
 	void __lib_debug_acquire() {
-		lock( &kernel_debug_lock DEBUG_CTX2 );
+		lock( kernel_debug_lock DEBUG_CTX2 );
 	}
 
 	void __lib_debug_release() {
-		unlock( &kernel_debug_lock );
+		unlock( kernel_debug_lock );
 	}
 }
@@ -574,41 +574,4 @@
 //-----------------------------------------------------------------------------
 // Locks
-void ?{}( spinlock & this ) {
-	this.lock = 0;
-}
-void ^?{}( spinlock & this ) {
-
-}
-
-bool try_lock( spinlock * this DEBUG_CTX_PARAM2 ) {
-	return this->lock == 0 && __sync_lock_test_and_set_4( &this->lock, 1 ) == 0;
-}
-
-void lock( spinlock * this DEBUG_CTX_PARAM2 ) {
-	for ( unsigned int i = 1;; i += 1 ) {
-		if ( this->lock == 0 && __sync_lock_test_and_set_4( &this->lock, 1 ) == 0 ) { break; }
-	}
-	LIB_DEBUG_DO(
-		this->prev_name = caller;
-		this->prev_thrd = this_thread;
-	)
-}
-
-void lock_yield( spinlock * this DEBUG_CTX_PARAM2 ) {
-	for ( unsigned int i = 1;; i += 1 ) {
-		if ( this->lock == 0 && __sync_lock_test_and_set_4( &this->lock, 1 ) == 0 ) { break; }
-		yield();
-	}
-	LIB_DEBUG_DO(
-		this->prev_name = caller;
-		this->prev_thrd = this_thread;
-	)
-}
-
-
-void unlock( spinlock * this ) {
-	__sync_lock_release_4( &this->lock );
-}
-
 void  ?{}( semaphore & this, int count = 1 ) {
 	(this.lock){};
@@ -619,5 +582,5 @@
 
 void P(semaphore & this) {
-	lock( &this.lock DEBUG_CTX2 );
+	lock( this.lock DEBUG_CTX2 );
 	this.count -= 1;
 	if ( this.count < 0 ) {
@@ -629,5 +592,5 @@
 	}
 	else {
-	    unlock( &this.lock );
+	    unlock( this.lock );
 	}
 }
@@ -635,5 +598,5 @@
 void V(semaphore & this) {
 	thread_desc * thrd = NULL;
-	lock( &this.lock DEBUG_CTX2 );
+	lock( this.lock DEBUG_CTX2 );
 	this.count += 1;
 	if ( this.count <= 0 ) {
@@ -642,5 +605,5 @@
 	}
 
-	unlock( &this.lock );
+	unlock( this.lock );
 
 	// make new owner
Index: src/libcfa/concurrency/kernel_private.h
===================================================================
--- src/libcfa/concurrency/kernel_private.h	(revision 3351cc0e1e27b4c8378091ffd05c8b3a1eaf0c9e)
+++ src/libcfa/concurrency/kernel_private.h	(revision ea7d2b051267e571f113e8dabae0d886eda94432)
@@ -45,10 +45,10 @@
 //Block current thread and release/wake-up the following resources
 void BlockInternal(void);
-void BlockInternal(spinlock * lock);
+void BlockInternal(__spinlock_t * lock);
 void BlockInternal(thread_desc * thrd);
-void BlockInternal(spinlock * lock, thread_desc * thrd);
-void BlockInternal(spinlock * locks [], unsigned short count);
-void BlockInternal(spinlock * locks [], unsigned short count, thread_desc * thrds [], unsigned short thrd_count);
-void LeaveThread(spinlock * lock, thread_desc * thrd);
+void BlockInternal(__spinlock_t * lock, thread_desc * thrd);
+void BlockInternal(__spinlock_t * locks [], unsigned short count);
+void BlockInternal(__spinlock_t * locks [], unsigned short count, thread_desc * thrds [], unsigned short thrd_count);
+void LeaveThread(__spinlock_t * lock, thread_desc * thrd);
 
 //-----------------------------------------------------------------------------
@@ -66,5 +66,5 @@
 struct event_kernel_t {
 	alarm_list_t alarms;
-	spinlock lock;
+	__spinlock_t lock;
 };
 
Index: src/libcfa/concurrency/monitor.c
===================================================================
--- src/libcfa/concurrency/monitor.c	(revision 3351cc0e1e27b4c8378091ffd05c8b3a1eaf0c9e)
+++ src/libcfa/concurrency/monitor.c	(revision ea7d2b051267e571f113e8dabae0d886eda94432)
@@ -34,11 +34,11 @@
 static inline bool is_accepted( monitor_desc * this, const __monitor_group_t & monitors );
 
-static inline void lock_all  ( spinlock * locks [], __lock_size_t count );
-static inline void lock_all  ( monitor_desc * source [], spinlock * /*out*/ locks [], __lock_size_t count );
-static inline void unlock_all( spinlock * locks [], __lock_size_t count );
+static inline void lock_all  ( __spinlock_t * locks [], __lock_size_t count );
+static inline void lock_all  ( monitor_desc * source [], __spinlock_t * /*out*/ locks [], __lock_size_t count );
+static inline void unlock_all( __spinlock_t * locks [], __lock_size_t count );
 static inline void unlock_all( monitor_desc * locks [], __lock_size_t count );
 
-static inline void save   ( monitor_desc * ctx [], __lock_size_t count, spinlock * locks [], unsigned int /*out*/ recursions [], __waitfor_mask_t /*out*/ masks [] );
-static inline void restore( monitor_desc * ctx [], __lock_size_t count, spinlock * locks [], unsigned int /*in */ recursions [], __waitfor_mask_t /*in */ masks [] );
+static inline void save   ( monitor_desc * ctx [], __lock_size_t count, __spinlock_t * locks [], unsigned int /*out*/ recursions [], __waitfor_mask_t /*out*/ masks [] );
+static inline void restore( monitor_desc * ctx [], __lock_size_t count, __spinlock_t * locks [], unsigned int /*in */ recursions [], __waitfor_mask_t /*in */ masks [] );
 
 static inline void init     ( __lock_size_t count, monitor_desc * monitors [], __condition_node_t & waiter, __condition_criterion_t criteria [] );
@@ -71,5 +71,5 @@
 	unsigned int recursions[ count ];                         /* Save the current recursion levels to restore them later                             */ \
 	__waitfor_mask_t masks [ count ];                         /* Save the current waitfor masks to restore them later                                */ \
-	spinlock *   locks     [ count ];                         /* We need to pass-in an array of locks to BlockInternal                               */ \
+	__spinlock_t *   locks [ count ];                         /* We need to pass-in an array of locks to BlockInternal                               */ \
 
 #define monitor_save    save   ( monitors, count, locks, recursions, masks )
@@ -85,5 +85,5 @@
 	static void __enter_monitor_desc( monitor_desc * this, const __monitor_group_t & group ) {
 		// Lock the monitor spinlock, lock_yield to reduce contention
-		lock_yield( &this->lock DEBUG_CTX2 );
+		lock_yield( this->lock DEBUG_CTX2 );
 		thread_desc * thrd = this_thread;
 
@@ -127,5 +127,5 @@
 
 		// Release the lock and leave
-		unlock( &this->lock );
+		unlock( this->lock );
 		return;
 	}
@@ -133,5 +133,5 @@
 	static void __enter_monitor_dtor( monitor_desc * this, fptr_t func ) {
 		// Lock the monitor spinlock, lock_yield to reduce contention
-		lock_yield( &this->lock DEBUG_CTX2 );
+		lock_yield( this->lock DEBUG_CTX2 );
 		thread_desc * thrd = this_thread;
 
@@ -145,5 +145,5 @@
 			set_owner( this, thrd );
 
-			unlock( &this->lock );
+			unlock( this->lock );
 			return;
 		}
@@ -197,5 +197,5 @@
 	void __leave_monitor_desc( monitor_desc * this ) {
 		// Lock the monitor spinlock, lock_yield to reduce contention
-		lock_yield( &this->lock DEBUG_CTX2 );
+		lock_yield( this->lock DEBUG_CTX2 );
 
 		LIB_DEBUG_PRINT_SAFE("Kernel : %10p Leaving mon %p (%p)\n", this_thread, this, this->owner);
@@ -210,5 +210,5 @@
 		if( this->recursion != 0) {
 			LIB_DEBUG_PRINT_SAFE("Kernel :  recursion still %d\n", this->recursion);
-			unlock( &this->lock );
+			unlock( this->lock );
 			return;
 		}
@@ -218,5 +218,5 @@
 
 		// We can now let other threads in safely
-		unlock( &this->lock );
+		unlock( this->lock );
 
 		//We need to wake-up the thread
@@ -243,5 +243,5 @@
 
 		// Lock the monitor now
-		lock_yield( &this->lock DEBUG_CTX2 );
+		lock_yield( this->lock DEBUG_CTX2 );
 
 		disable_interrupts();
@@ -730,21 +730,21 @@
 }
 
-static inline void lock_all( spinlock * locks [], __lock_size_t count ) {
+static inline void lock_all( __spinlock_t * locks [], __lock_size_t count ) {
 	for( __lock_size_t i = 0; i < count; i++ ) {
-		lock_yield( locks[i] DEBUG_CTX2 );
-	}
-}
-
-static inline void lock_all( monitor_desc * source [], spinlock * /*out*/ locks [], __lock_size_t count ) {
+		lock_yield( *locks[i] DEBUG_CTX2 );
+	}
+}
+
+static inline void lock_all( monitor_desc * source [], __spinlock_t * /*out*/ locks [], __lock_size_t count ) {
 	for( __lock_size_t i = 0; i < count; i++ ) {
-		spinlock * l = &source[i]->lock;
-		lock_yield( l DEBUG_CTX2 );
+		__spinlock_t * l = &source[i]->lock;
+		lock_yield( *l DEBUG_CTX2 );
 		if(locks) locks[i] = l;
 	}
 }
 
-static inline void unlock_all( spinlock * locks [], __lock_size_t count ) {
+static inline void unlock_all( __spinlock_t * locks [], __lock_size_t count ) {
 	for( __lock_size_t i = 0; i < count; i++ ) {
-		unlock( locks[i] );
+		unlock( *locks[i] );
 	}
 }
@@ -752,5 +752,5 @@
 static inline void unlock_all( monitor_desc * locks [], __lock_size_t count ) {
 	for( __lock_size_t i = 0; i < count; i++ ) {
-		unlock( &locks[i]->lock );
+		unlock( locks[i]->lock );
 	}
 }
@@ -759,5 +759,5 @@
 	monitor_desc * ctx [],
 	__lock_size_t count,
-	__attribute((unused)) spinlock * locks [],
+	__attribute((unused)) __spinlock_t * locks [],
 	unsigned int /*out*/ recursions [],
 	__waitfor_mask_t /*out*/ masks []
@@ -772,5 +772,5 @@
 	monitor_desc * ctx [],
 	__lock_size_t count,
-	spinlock * locks [],
+	__spinlock_t * locks [],
 	unsigned int /*out*/ recursions [],
 	__waitfor_mask_t /*out*/ masks []
Index: src/libcfa/concurrency/preemption.c
===================================================================
--- src/libcfa/concurrency/preemption.c	(revision 3351cc0e1e27b4c8378091ffd05c8b3a1eaf0c9e)
+++ src/libcfa/concurrency/preemption.c	(revision ea7d2b051267e571f113e8dabae0d886eda94432)
@@ -355,7 +355,7 @@
 		case SI_KERNEL:
 			// LIB_DEBUG_PRINT_SAFE("Kernel : Preemption thread tick\n");
-			lock( &event_kernel->lock DEBUG_CTX2 );
+			lock( event_kernel->lock DEBUG_CTX2 );
 			tick_preemption();
-			unlock( &event_kernel->lock );
+			unlock( event_kernel->lock );
 			break;
 		// Signal was not sent by the kernel but by an other thread
