Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision b2fc7ad9ed2e454a52ddb04a94ed1a63293acd03)
+++ libcfa/src/concurrency/kernel.cfa	(revision e8a7ca25cfb09b62921f637a538511cf43dc27c4)
@@ -163,5 +163,5 @@
 	#if !defined(__CFA_NO_STATISTICS__)
 		if( this->print_halts ) {
-			__cfaabi_bits_print_safe( STDOUT_FILENO, "Processor : %d - %s (%p)\n", this->id, this->name, (void*)this);
+			__cfaabi_bits_print_safe( STDOUT_FILENO, "Processor : %d - %s (%p)\n", this->unique_id, this->name, (void*)this);
 		}
 	#endif
@@ -223,5 +223,5 @@
 				#if !defined(__CFA_NO_STATISTICS__)
 					if(this->print_halts) {
-						__cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->id, rdtscl());
+						__cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->unique_id, rdtscl());
 					}
 				#endif
@@ -236,5 +236,5 @@
 				#if !defined(__CFA_NO_STATISTICS__)
 					if(this->print_halts) {
-						__cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->id, rdtscl());
+						__cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->unique_id, rdtscl());
 					}
 				#endif
@@ -390,5 +390,4 @@
 
 	post( this->terminated );
-
 
 	if(this == mainProcessor) {
@@ -553,5 +552,4 @@
 static void __schedule_thread( $thread * thrd ) {
 	/* paranoid */ verify( ! __preemption_enabled() );
-	/* paranoid */ verify( kernelTLS().this_proc_id );
 	/* paranoid */ verify( ready_schedule_islocked());
 	/* paranoid */ verify( thrd );
@@ -611,5 +609,4 @@
 static inline $thread * __next_thread(cluster * this) with( *this ) {
 	/* paranoid */ verify( ! __preemption_enabled() );
-	/* paranoid */ verify( kernelTLS().this_proc_id );
 
 	ready_schedule_lock();
@@ -617,5 +614,4 @@
 	ready_schedule_unlock();
 
-	/* paranoid */ verify( kernelTLS().this_proc_id );
 	/* paranoid */ verify( ! __preemption_enabled() );
 	return thrd;
@@ -625,5 +621,4 @@
 static inline $thread * __next_thread_slow(cluster * this) with( *this ) {
 	/* paranoid */ verify( ! __preemption_enabled() );
-	/* paranoid */ verify( kernelTLS().this_proc_id );
 
 	ready_schedule_lock();
@@ -638,5 +633,4 @@
 	ready_schedule_unlock();
 
-	/* paranoid */ verify( kernelTLS().this_proc_id );
 	/* paranoid */ verify( ! __preemption_enabled() );
 	return thrd;
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision b2fc7ad9ed2e454a52ddb04a94ed1a63293acd03)
+++ libcfa/src/concurrency/kernel.hfa	(revision e8a7ca25cfb09b62921f637a538511cf43dc27c4)
@@ -49,11 +49,5 @@
 
 // Processor id, required for scheduling threads
-struct __processor_id_t {
-	unsigned id:24;
-
-	#if !defined(__CFA_NO_STATISTICS__)
-		struct __stats_t * stats;
-	#endif
-};
+
 
 coroutine processorCtx_t {
@@ -63,7 +57,4 @@
 // Wrapper around kernel threads
 struct __attribute__((aligned(128))) processor {
-	// Main state
-	inline __processor_id_t;
-
 	// Cluster from which to get threads
 	struct cluster * cltr;
@@ -89,4 +80,7 @@
 	// Handle to pthreads
 	pthread_t kernel_thread;
+
+	// Unique id for the processor (not per cluster)
+	unsigned unique_id;
 
 	struct {
Index: libcfa/src/concurrency/kernel/fwd.hfa
===================================================================
--- libcfa/src/concurrency/kernel/fwd.hfa	(revision b2fc7ad9ed2e454a52ddb04a94ed1a63293acd03)
+++ libcfa/src/concurrency/kernel/fwd.hfa	(revision e8a7ca25cfb09b62921f637a538511cf43dc27c4)
@@ -38,6 +38,5 @@
 			struct $thread          * volatile this_thread;
 			struct processor        * volatile this_processor;
-			struct __processor_id_t * volatile this_proc_id;
-			struct __stats_t        * volatile this_stats;
+			volatile bool sched_lock;
 
 			struct {
@@ -56,4 +55,13 @@
 				uint64_t bck_seed;
 			} ready_rng;
+
+			struct __stats_t        * volatile this_stats;
+
+
+			#ifdef __CFA_WITH_VERIFY__
+				// Debug, check if the rwlock is owned for reading
+				bool in_sched_lock;
+				unsigned sched_id;
+			#endif
 		} __cfaabi_tls __attribute__ ((tls_model ( "initial-exec" )));
 
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision b2fc7ad9ed2e454a52ddb04a94ed1a63293acd03)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision e8a7ca25cfb09b62921f637a538511cf43dc27c4)
@@ -77,4 +77,6 @@
 static void doregister( struct cluster & cltr );
 static void unregister( struct cluster & cltr );
+static void register_tls( processor * this );
+static void unregister_tls( processor * this );
 static void ?{}( $coroutine & this, current_stack_info_t * info);
 static void ?{}( $thread & this, current_stack_info_t * info);
@@ -123,7 +125,13 @@
 	NULL,												// cannot use 0p
 	NULL,
+	false,
+	{ 1, false, false },
+	0,
+	{ 0, 0 },
 	NULL,
-	NULL,
-	{ 1, false, false },
+	#ifdef __CFA_WITH_VERIFY__
+		false,
+		0,
+	#endif
 };
 
@@ -210,7 +218,8 @@
 	(*mainProcessor){};
 
+	register_tls( mainProcessor );
+
 	//initialize the global state variables
 	__cfaabi_tls.this_processor = mainProcessor;
-	__cfaabi_tls.this_proc_id   = (__processor_id_t*)mainProcessor;
 	__cfaabi_tls.this_thread    = mainThread;
 
@@ -273,4 +282,6 @@
 	#endif
 
+	unregister_tls( mainProcessor );
+
 	// Destroy the main processor and its context in reverse order of construction
 	// These were manually constructed so we need manually destroy them
@@ -316,7 +327,9 @@
 	processor * proc = (processor *) arg;
 	__cfaabi_tls.this_processor = proc;
-	__cfaabi_tls.this_proc_id   = (__processor_id_t*)proc;
 	__cfaabi_tls.this_thread    = 0p;
 	__cfaabi_tls.preemption_state.[enabled, disable_count] = [false, 1];
+
+	register_tls( proc );
+
 	// SKULLDUGGERY: We want to create a context for the processor coroutine
 	// which is needed for the 2-step context switch. However, there is no reason
@@ -355,4 +368,6 @@
 		#endif
 	#endif
+
+	unregister_tls( proc );
 
 	return 0p;
@@ -496,15 +511,4 @@
 	#endif
 
-	// Register and Lock the RWlock so no-one pushes/pops while we are changing the queue
-	uint_fast32_t last_size = ready_mutate_register((__processor_id_t*)&this);
-		this.cltr->procs.total += 1u;
-		insert_last(this.cltr->procs.actives, this);
-
-		// Adjust the ready queue size
-		ready_queue_grow( cltr );
-
-	// Unlock the RWlock
-	ready_mutate_unlock( last_size );
-
 	__cfadbg_print_safe(runtime_core, "Kernel : core %p created\n", &this);
 }
@@ -512,15 +516,4 @@
 // Not a ctor, it just preps the destruction but should not destroy members
 static void deinit(processor & this) {
-	// Lock the RWlock so no-one pushes/pops while we are changing the queue
-	uint_fast32_t last_size = ready_mutate_lock();
-		this.cltr->procs.total -= 1u;
-		remove(this);
-
-		// Adjust the ready queue size
-		ready_queue_shrink( this.cltr );
-
-	// Unlock the RWlock and unregister: we don't need the read_lock any more
-	ready_mutate_unregister((__processor_id_t*)&this, last_size );
-
 	close(this.idle);
 }
@@ -656,4 +649,37 @@
 	cltr->nthreads -= 1;
 	unlock(cltr->thread_list_lock);
+}
+
+static void register_tls( processor * this ) {
+	// Register and Lock the RWlock so no-one pushes/pops while we are changing the queue
+	uint_fast32_t last_size;
+	[this->unique_id, last_size] = ready_mutate_register();
+
+		this->cltr->procs.total += 1u;
+		insert_last(this->cltr->procs.actives, *this);
+
+		// Adjust the ready queue size
+		ready_queue_grow( this->cltr );
+
+	// Unlock the RWlock
+	ready_mutate_unlock( last_size );
+}
+
+
+static void unregister_tls( processor * this ) {
+	// Lock the RWlock so no-one pushes/pops while we are changing the queue
+	uint_fast32_t last_size = ready_mutate_lock();
+		this->cltr->procs.total -= 1u;
+		remove(*this);
+
+		// clear the cluster so nothing gets pushed to local queues
+		cluster * cltr = this->cltr;
+		this->cltr = 0p;
+
+		// Adjust the ready queue size
+		ready_queue_shrink( cltr );
+
+	// Unlock the RWlock and unregister: we don't need the read_lock any more
+	ready_mutate_unregister( this->unique_id, last_size );
 }
 
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision b2fc7ad9ed2e454a52ddb04a94ed1a63293acd03)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision e8a7ca25cfb09b62921f637a538511cf43dc27c4)
@@ -25,5 +25,4 @@
 // Scheduler
 
-struct __attribute__((aligned(128))) __scheduler_lock_id_t;
 
 extern "C" {
@@ -80,8 +79,8 @@
 // Lock-Free registering/unregistering of threads
 // Register a processor to a given cluster and get its unique id in return
-void register_proc_id( struct __processor_id_t * );
+unsigned register_proc_id( void );
 
 // Unregister a processor from a given cluster using its id, getting back the original pointer
-void unregister_proc_id( struct __processor_id_t * proc );
+void unregister_proc_id( unsigned );
 
 //=======================================================================
@@ -112,21 +111,7 @@
 }
 
-// Cells use by the reader writer lock
-// while not generic it only relies on a opaque pointer
-struct __attribute__((aligned(128))) __scheduler_lock_id_t {
-	// Spin lock used as the underlying lock
-	volatile bool lock;
-
-	// Handle pointing to the proc owning this cell
-	// Used for allocating cells and debugging
-	__processor_id_t * volatile handle;
-
-	#ifdef __CFA_WITH_VERIFY__
-		// Debug, check if this is owned for reading
-		bool owned;
-	#endif
-};
-
-static_assert( sizeof(struct __scheduler_lock_id_t) <= __alignof(struct __scheduler_lock_id_t));
+
+
+
 
 //-----------------------------------------------------------------------
@@ -147,8 +132,8 @@
 
 	// writer lock
-	volatile bool lock;
+	volatile bool write_lock;
 
 	// data pointer
-	__scheduler_lock_id_t * data;
+	volatile bool * volatile * data;
 };
 
@@ -163,12 +148,10 @@
 static inline void ready_schedule_lock(void) with(*__scheduler_lock) {
 	/* paranoid */ verify( ! __preemption_enabled() );
-	/* paranoid */ verify( kernelTLS().this_proc_id );
-
-	unsigned iproc = kernelTLS().this_proc_id->id;
-	/*paranoid*/ verify(data[iproc].handle == kernelTLS().this_proc_id);
-	/*paranoid*/ verify(iproc < ready);
+	/* paranoid */ verify( ! kernelTLS().in_sched_lock );
+	/* paranoid */ verify( data[kernelTLS().sched_id] == &kernelTLS().sched_lock );
+	/* paranoid */ verify( !kernelTLS().this_processor || kernelTLS().this_processor->unique_id == kernelTLS().sched_id );
 
 	// Step 1 : make sure no writer are in the middle of the critical section
-	while(__atomic_load_n(&lock, (int)__ATOMIC_RELAXED))
+	while(__atomic_load_n(&write_lock, (int)__ATOMIC_RELAXED))
 		Pause();
 
@@ -179,10 +162,10 @@
 
 	// Step 2 : acquire our local lock
-	__atomic_acquire( &data[iproc].lock );
-	/*paranoid*/ verify(data[iproc].lock);
+	__atomic_acquire( &kernelTLS().sched_lock );
+	/*paranoid*/ verify(kernelTLS().sched_lock);
 
 	#ifdef __CFA_WITH_VERIFY__
 		// Debug, check if this is owned for reading
-		data[iproc].owned = true;
+		kernelTLS().in_sched_lock = true;
 	#endif
 }
@@ -190,16 +173,13 @@
 static inline void ready_schedule_unlock(void) with(*__scheduler_lock) {
 	/* paranoid */ verify( ! __preemption_enabled() );
-	/* paranoid */ verify( kernelTLS().this_proc_id );
-
-	unsigned iproc = kernelTLS().this_proc_id->id;
-	/*paranoid*/ verify(data[iproc].handle == kernelTLS().this_proc_id);
-	/*paranoid*/ verify(iproc < ready);
-	/*paranoid*/ verify(data[iproc].lock);
-	/*paranoid*/ verify(data[iproc].owned);
+	/* paranoid */ verify( data[kernelTLS().sched_id] == &kernelTLS().sched_lock );
+	/* paranoid */ verify( !kernelTLS().this_processor || kernelTLS().this_processor->unique_id == kernelTLS().sched_id );
+	/* paranoid */ verify( kernelTLS().sched_lock );
+	/* paranoid */ verify( kernelTLS().in_sched_lock );
 	#ifdef __CFA_WITH_VERIFY__
 		// Debug, check if this is owned for reading
-		data[iproc].owned = false;
+		kernelTLS().in_sched_lock = false;
 	#endif
-	__atomic_unlock(&data[iproc].lock);
+	__atomic_unlock(&kernelTLS().sched_lock);
 }
 
@@ -207,11 +187,10 @@
 	static inline bool ready_schedule_islocked(void) {
 		/* paranoid */ verify( ! __preemption_enabled() );
-		/*paranoid*/ verify( kernelTLS().this_proc_id );
-		__processor_id_t * proc = kernelTLS().this_proc_id;
-		return __scheduler_lock->data[proc->id].owned;
+		/* paranoid */ verify( (!kernelTLS().in_sched_lock) || kernelTLS().sched_lock );
+		return kernelTLS().sched_lock;
 	}
 
 	static inline bool ready_mutate_islocked() {
-		return __scheduler_lock->lock;
+		return __scheduler_lock->write_lock;
 	}
 #endif
@@ -228,14 +207,15 @@
 // Register a processor to a given cluster and get its unique id in return
 // For convenience, also acquires the lock
-static inline uint_fast32_t ready_mutate_register( struct __processor_id_t * proc ) {
-	register_proc_id( proc );
-	return ready_mutate_lock();
+static inline [unsigned, uint_fast32_t] ready_mutate_register() {
+	unsigned id = register_proc_id();
+	uint_fast32_t last = ready_mutate_lock();
+	return [id, last];
 }
 
 // Unregister a processor from a given cluster using its id, getting back the original pointer
 // assumes the lock is acquired
-static inline void ready_mutate_unregister( struct __processor_id_t * proc, uint_fast32_t last_s ) {
+static inline void ready_mutate_unregister( unsigned id, uint_fast32_t last_s ) {
 	ready_mutate_unlock( last_s );
-	unregister_proc_id( proc );
+	unregister_proc_id( id );
 }
 
Index: libcfa/src/concurrency/preemption.cfa
===================================================================
--- libcfa/src/concurrency/preemption.cfa	(revision b2fc7ad9ed2e454a52ddb04a94ed1a63293acd03)
+++ libcfa/src/concurrency/preemption.cfa	(revision e8a7ca25cfb09b62921f637a538511cf43dc27c4)
@@ -687,8 +687,5 @@
 // Waits on SIGALRM and send SIGUSR1 to whom ever needs it
 static void * alarm_loop( __attribute__((unused)) void * args ) {
-	__processor_id_t id;
-	register_proc_id(&id);
-	__cfaabi_tls.this_proc_id = &id;
-
+	unsigned id = register_proc_id();
 
 	// Block sigalrms to control when they arrive
@@ -749,5 +746,5 @@
 EXIT:
 	__cfaabi_dbg_print_safe( "Kernel : Preemption thread stopping\n" );
-	register_proc_id(&id);
+	unregister_proc_id(id);
 
 	return 0p;
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision b2fc7ad9ed2e454a52ddb04a94ed1a63293acd03)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision e8a7ca25cfb09b62921f637a538511cf43dc27c4)
@@ -93,9 +93,7 @@
 	this.alloc = 0;
 	this.ready = 0;
-	this.lock  = false;
 	this.data  = alloc(this.max);
-
-	/*paranoid*/ verify( 0 == (((uintptr_t)(this.data    )) % 64) );
-	/*paranoid*/ verify( 0 == (((uintptr_t)(this.data + 1)) % 64) );
+	this.write_lock  = false;
+
 	/*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.alloc), &this.alloc));
 	/*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.ready), &this.ready));
@@ -106,16 +104,10 @@
 }
 
-void ?{}( __scheduler_lock_id_t & this, __processor_id_t * proc ) {
-	this.handle = proc;
-	this.lock   = false;
-	#ifdef __CFA_WITH_VERIFY__
-		this.owned  = false;
-	#endif
-}
 
 //=======================================================================
 // Lock-Free registering/unregistering of threads
-void register_proc_id( struct __processor_id_t * proc ) with(*__scheduler_lock) {
+unsigned register_proc_id( void ) with(*__scheduler_lock) {
-	__cfadbg_print_safe(ready_queue, "Kernel : Registering proc %p for RW-Lock\n", proc);
+	__cfadbg_print_safe(ready_queue, "Kernel : Registering proc %p for RW-Lock\n", (void*)&kernelTLS().sched_lock);
+	bool * handle = (bool *)&kernelTLS().sched_lock;
 
 	// Step - 1 : check if there is already space in the data
@@ -124,12 +116,13 @@
 	// Check among all the ready
 	for(uint_fast32_t i = 0; i < s; i++) {
-		__processor_id_t * null = 0p; // Re-write every loop since compare thrashes it
-		if( __atomic_load_n(&data[i].handle, (int)__ATOMIC_RELAXED) == null
-			&& __atomic_compare_exchange_n( &data[i].handle, &null, proc, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-			/*paranoid*/ verify(i < ready);
-			/*paranoid*/ verify(0 == (__alignof__(data[i]) % cache_line_size));
-			/*paranoid*/ verify((((uintptr_t)&data[i]) % cache_line_size) == 0);
-			proc->id = i;
-			return;
+		bool * volatile * cell = (bool * volatile *)&data[i]; // Cforall is bugged and the double volatiles causes problems
+		/* paranoid */ verify( handle != *cell );
+
+		bool * null = 0p; // Re-write every loop since compare thrashes it
+		if( __atomic_load_n(cell, (int)__ATOMIC_RELAXED) == null
+			&& __atomic_compare_exchange_n( cell, &null, handle, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+			/* paranoid */ verify(i < ready);
+			/* paranoid */ verify( (kernelTLS().sched_id = i, true) );
+			return i;
 		}
 	}
@@ -142,6 +135,5 @@
 
 	// Step - 3 : Mark space as used and then publish it.
-	__scheduler_lock_id_t * storage = (__scheduler_lock_id_t *)&data[n];
-	(*storage){ proc };
+	data[n] = handle;
 	while() {
 		unsigned copy = n;
@@ -155,15 +147,17 @@
 
 	// Return new spot.
-	/*paranoid*/ verify(n < ready);
-	/*paranoid*/ verify(__alignof__(data[n]) == (2 * cache_line_size));
-	/*paranoid*/ verify((((uintptr_t)&data[n]) % cache_line_size) == 0);
-	proc->id = n;
-}
-
-void unregister_proc_id( struct __processor_id_t * proc ) with(*__scheduler_lock) {
-	unsigned id = proc->id;
-	/*paranoid*/ verify(id < ready);
-	/*paranoid*/ verify(proc == __atomic_load_n(&data[id].handle, __ATOMIC_RELAXED));
-	__atomic_store_n(&data[id].handle, 0p, __ATOMIC_RELEASE);
+	/* paranoid */ verify(n < ready);
+	/* paranoid */ verify( (kernelTLS().sched_id = n, true) );
+	return n;
+}
+
+void unregister_proc_id( unsigned id ) with(*__scheduler_lock) {
+	/* paranoid */ verify(id < ready);
+	/* paranoid */ verify(id == kernelTLS().sched_id);
+	/* paranoid */ verify(data[id] == &kernelTLS().sched_lock);
+
+	bool * volatile * cell = (bool * volatile *)&data[id]; // Cforall is bugged and the double volatiles causes problems
+
+	__atomic_store_n(cell, 0p, __ATOMIC_RELEASE);
 
-	__cfadbg_print_safe(ready_queue, "Kernel : Unregister proc %p\n", proc);
+	__cfadbg_print_safe(ready_queue, "Kernel : Unregister proc %u\n", id);
@@ -175,9 +169,10 @@
 uint_fast32_t ready_mutate_lock( void ) with(*__scheduler_lock) {
 	/* paranoid */ verify( ! __preemption_enabled() );
+	/* paranoid */ verify( ! kernelTLS().sched_lock );
 
 	// Step 1 : lock global lock
 	// It is needed to avoid processors that register mid Critical-Section
 	//   to simply lock their own lock and enter.
-	__atomic_acquire( &lock );
+	__atomic_acquire( &write_lock );
 
 	// Step 2 : lock per-proc lock
@@ -187,5 +182,6 @@
 	uint_fast32_t s = ready;
 	for(uint_fast32_t i = 0; i < s; i++) {
-		__atomic_acquire( &data[i].lock );
+		volatile bool * llock = data[i];
+		if(llock) __atomic_acquire( llock );
 	}
 
@@ -204,11 +200,11 @@
 	// Alternative solution : return s in write_lock and pass it to write_unlock
 	for(uint_fast32_t i = 0; i < last_s; i++) {
-		verify(data[i].lock);
-		__atomic_store_n(&data[i].lock, (bool)false, __ATOMIC_RELEASE);
+		volatile bool * llock = data[i];
+		if(llock) __atomic_store_n(llock, (bool)false, __ATOMIC_RELEASE);
 	}
 
 	// Step 2 : release global lock
-	/*paranoid*/ assert(true == lock);
-	__atomic_store_n(&lock, (bool)false, __ATOMIC_RELEASE);
+	/*paranoid*/ assert(true == write_lock);
+	__atomic_store_n(&write_lock, (bool)false, __ATOMIC_RELEASE);
 
 	/* paranoid */ verify( ! __preemption_enabled() );
