Index: libcfa/src/concurrency/kernel/fwd.hfa
===================================================================
--- libcfa/src/concurrency/kernel/fwd.hfa	(revision fb0be05ecf8eb6fa3fe9d0583c7d196e16198bfc)
+++ libcfa/src/concurrency/kernel/fwd.hfa	(revision c993b1521e96d36bed6b395f8eec5284bc85f5f5)
@@ -38,6 +38,5 @@
 			struct $thread          * volatile this_thread;
 			struct processor        * volatile this_processor;
-			struct __processor_id_t * volatile this_proc_id;
-			struct __stats_t        * volatile this_stats;
+			volatile bool sched_lock;
 
 			struct {
@@ -56,4 +55,13 @@
 				uint64_t bck_seed;
 			} ready_rng;
+
+			struct __stats_t        * volatile this_stats;
+
+
+			#ifdef __CFA_WITH_VERIFY__
+				// Debug: check whether the rwlock is currently owned for reading by this thread
+				bool in_sched_lock;
+				unsigned sched_id;
+			#endif
 		} __cfaabi_tls __attribute__ ((tls_model ( "initial-exec" )));
 
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision fb0be05ecf8eb6fa3fe9d0583c7d196e16198bfc)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision c993b1521e96d36bed6b395f8eec5284bc85f5f5)
@@ -77,4 +77,6 @@
 static void doregister( struct cluster & cltr );
 static void unregister( struct cluster & cltr );
+static void register_tls( processor * this );
+static void unregister_tls( processor * this );
 static void ?{}( $coroutine & this, current_stack_info_t * info);
 static void ?{}( $thread & this, current_stack_info_t * info);
@@ -123,7 +125,13 @@
 	NULL,												// cannot use 0p
 	NULL,
+	false,
+	{ 1, false, false },
+	0,
+	{ 0, 0 },
 	NULL,
-	NULL,
-	{ 1, false, false },
+	#ifdef __CFA_WITH_VERIFY__
+		false,
+		0,
+	#endif
 };
 
@@ -210,7 +218,8 @@
 	(*mainProcessor){};
 
+	register_tls( mainProcessor );
+
 	//initialize the global state variables
 	__cfaabi_tls.this_processor = mainProcessor;
-	__cfaabi_tls.this_proc_id   = (__processor_id_t*)mainProcessor;
 	__cfaabi_tls.this_thread    = mainThread;
 
@@ -273,4 +282,6 @@
 	#endif
 
+	unregister_tls( mainProcessor );
+
 	// Destroy the main processor and its context in reverse order of construction
 	// These were manually constructed so we need manually destroy them
@@ -316,7 +327,9 @@
 	processor * proc = (processor *) arg;
 	__cfaabi_tls.this_processor = proc;
-	__cfaabi_tls.this_proc_id   = (__processor_id_t*)proc;
 	__cfaabi_tls.this_thread    = 0p;
 	__cfaabi_tls.preemption_state.[enabled, disable_count] = [false, 1];
+
+	register_tls( proc );
+
 	// SKULLDUGGERY: We want to create a context for the processor coroutine
 	// which is needed for the 2-step context switch. However, there is no reason
@@ -355,4 +368,6 @@
 		#endif
 	#endif
+
+	unregister_tls( proc );
 
 	return 0p;
@@ -496,15 +511,4 @@
 	#endif
 
-	// Register and Lock the RWlock so no-one pushes/pops while we are changing the queue
-	uint_fast32_t last_size = ready_mutate_register((__processor_id_t*)&this);
-		this.cltr->procs.total += 1u;
-		insert_last(this.cltr->procs.actives, this);
-
-		// Adjust the ready queue size
-		ready_queue_grow( cltr );
-
-	// Unlock the RWlock
-	ready_mutate_unlock( last_size );
-
 	__cfadbg_print_safe(runtime_core, "Kernel : core %p created\n", &this);
 }
@@ -512,15 +516,4 @@
 // Not a ctor, it just preps the destruction but should not destroy members
 static void deinit(processor & this) {
-	// Lock the RWlock so no-one pushes/pops while we are changing the queue
-	uint_fast32_t last_size = ready_mutate_lock();
-		this.cltr->procs.total -= 1u;
-		remove(this);
-
-		// Adjust the ready queue size
-		ready_queue_shrink( this.cltr );
-
-	// Unlock the RWlock and unregister: we don't need the read_lock any more
-	ready_mutate_unregister((__processor_id_t*)&this, last_size );
-
 	close(this.idle);
 }
@@ -656,4 +649,37 @@
 	cltr->nthreads -= 1;
 	unlock(cltr->thread_list_lock);
+}
+
+static void register_tls( processor * this ) {
+	// Register and Lock the RWlock so no-one pushes/pops while we are changing the queue
+	uint_fast32_t last_size;
+	[this->unique_id, last_size] = ready_mutate_register();
+
+		this->cltr->procs.total += 1u;
+		insert_last(this->cltr->procs.actives, *this);
+
+		// Adjust the ready queue size
+		ready_queue_grow( this->cltr );
+
+	// Unlock the RWlock
+	ready_mutate_unlock( last_size );
+}
+
+
+static void unregister_tls( processor * this ) {
+	// Lock the RWlock so no-one pushes/pops while we are changing the queue
+	uint_fast32_t last_size = ready_mutate_lock();
+		this->cltr->procs.total -= 1u;
+		remove(*this);
+
+		// clear the cluster so nothing gets pushed to local queues
+		cluster * cltr = this->cltr;
+		this->cltr = 0p;
+
+		// Adjust the ready queue size
+		ready_queue_shrink( cltr );
+
+	// Unlock the RWlock and unregister: we don't need the read_lock any more
+	ready_mutate_unregister( this->unique_id, last_size );
 }
 
