Index: libcfa/src/concurrency/coroutine.cfa
===================================================================
--- libcfa/src/concurrency/coroutine.cfa	(revision 48a91e270c570a6f6635a022d7dfafbbd208229b)
+++ libcfa/src/concurrency/coroutine.cfa	(revision 708ae384ef22961b8c1c67282cb1dc6024011687)
@@ -27,5 +27,5 @@
 #include <unwind.h>
 
-#include "kernel_private.hfa"
+#include "kernel/private.hfa"
 #include "exception.hfa"
 #include "math.hfa"
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision 48a91e270c570a6f6635a022d7dfafbbd208229b)
+++ libcfa/src/concurrency/io.cfa	(revision 708ae384ef22961b8c1c67282cb1dc6024011687)
@@ -41,5 +41,5 @@
 	#include "kernel.hfa"
 	#include "kernel/fwd.hfa"
-	#include "kernel_private.hfa"
+	#include "kernel/private.hfa"
 	#include "io/types.hfa"
 
Index: libcfa/src/concurrency/io/setup.cfa
===================================================================
--- libcfa/src/concurrency/io/setup.cfa	(revision 48a91e270c570a6f6635a022d7dfafbbd208229b)
+++ libcfa/src/concurrency/io/setup.cfa	(revision 708ae384ef22961b8c1c67282cb1dc6024011687)
@@ -59,5 +59,5 @@
 	#include "bitmanip.hfa"
 	#include "fstream.hfa"
-	#include "kernel_private.hfa"
+	#include "kernel/private.hfa"
 	#include "thread.hfa"
 #pragma GCC diagnostic pop
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 48a91e270c570a6f6635a022d7dfafbbd208229b)
+++ libcfa/src/concurrency/kernel.cfa	(revision 708ae384ef22961b8c1c67282cb1dc6024011687)
@@ -35,5 +35,5 @@
 
 //CFA Includes
-#include "kernel_private.hfa"
+#include "kernel/private.hfa"
 #include "preemption.hfa"
 #include "strstream.hfa"
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 48a91e270c570a6f6635a022d7dfafbbd208229b)
+++ libcfa/src/concurrency/kernel.hfa	(revision 708ae384ef22961b8c1c67282cb1dc6024011687)
@@ -227,9 +227,9 @@
 
 		struct {
- 			// Number of I/O subqueues
- 			volatile size_t count;
-
-			// Time since subqueues were processed
+ 			// Time since subqueues were processed
 			__timestamp_t * volatile tscs;
+
+			// Number of I/O subqueues
+ 			size_t count;
 		} io;
 
Index: libcfa/src/concurrency/kernel/cluster.cfa
===================================================================
--- libcfa/src/concurrency/kernel/cluster.cfa	(revision 48a91e270c570a6f6635a022d7dfafbbd208229b)
+++ libcfa/src/concurrency/kernel/cluster.cfa	(revision 708ae384ef22961b8c1c67282cb1dc6024011687)
@@ -5,9 +5,8 @@
 // file "LICENCE" distributed with Cforall.
 //
-// cluster.cfa.cfa -- file that includes helpers for subsystem that need
-//				cluster wide support
+// cluster.cfa -- file that includes helpers for subsystems that need cluster wide support
 //
 // Author           : Thierry Delisle
-// Created On       : Fri 03 11 12:39:24 2022
+// Created On       : Fri Mar 11 12:39:24 2022
 // Last Modified By :
 // Last Modified On :
@@ -20,5 +19,6 @@
 #include "bits/defs.hfa"
 #include "device/cpu.hfa"
-#include "kernel_private.hfa"
+#include "kernel/cluster.hfa"
+#include "kernel/private.hfa"
 
 #include "stdlib.hfa"
@@ -247,8 +247,8 @@
 // fixes the list so that the pointers back to anchors aren't left dangling
 static inline void fix(__intrusive_lane_t & ll) {
-			if(is_empty(ll)) {
-				verify(ll.anchor.next == 0p);
-				ll.prev = mock_head(ll);
-			}
+	if(is_empty(ll)) {
+		verify(ll.anchor.next == 0p);
+		ll.prev = mock_head(ll);
+	}
 }
 
@@ -321,4 +321,8 @@
 	}
 
+	// Fix the io times
+	cltr->sched.io.count = target;
+	fix_times(cltr->sched.io.tscs, cltr->sched.io.count);
+
 	// realloc the caches
 	cltr->sched.caches = alloc( target, cltr->sched.caches`realloc );
@@ -406,5 +410,7 @@
 	cltr->sched.caches = alloc( target, cltr->sched.caches`realloc );
 
-
+	// Fix the io times
+	cltr->sched.io.count = target;
+	fix_times(cltr->sched.io.tscs, cltr->sched.io.count);
 
 	reassign_cltr_id(cltr);
Index: libcfa/src/concurrency/kernel/cluster.hfa
===================================================================
--- libcfa/src/concurrency/kernel/cluster.hfa	(revision 708ae384ef22961b8c1c67282cb1dc6024011687)
+++ libcfa/src/concurrency/kernel/cluster.hfa	(revision 708ae384ef22961b8c1c67282cb1dc6024011687)
@@ -0,0 +1,79 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2022 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// cluster.hfa -- file that includes helpers for subsystems that need cluster wide support
+//
+// Author           : Thierry Delisle
+// Created On       : Tue Mar 15 16:40:12 2022
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+#pragma once
+#include "kernel/private.hfa"
+
+//-----------------------------------------------------------------------
+// Calc moving average: blend the elapsed time (currtsc - instsc) into old_avg, weighting the new sample 4/16 and the old average 12/16.
+static inline unsigned long long moving_average(unsigned long long currtsc, unsigned long long instsc, unsigned long long old_avg) {
+	/* paranoid */ verifyf( currtsc < 45000000000000000, "Suspiciously large current time: %'llu (%llx)\n", currtsc, currtsc );
+	/* paranoid */ verifyf( instsc  < 45000000000000000, "Suspiciously large insert time: %'llu (%llx)\n", instsc, instsc );
+	/* paranoid */ verifyf( old_avg < 15000000000000, "Suspiciously large previous average: %'llu (%llx)\n", old_avg, old_avg );
+
+	const unsigned long long new_val = currtsc > instsc ? currtsc - instsc : 0; // clamp to 0 instead of wrapping when currtsc <= instsc
+	const unsigned long long total_weight = 16;
+	const unsigned long long new_weight   = 4;
+	const unsigned long long old_weight = total_weight - new_weight; // 12
+	const unsigned long long ret = ((new_weight * new_val) + (old_weight * old_avg)) / total_weight; // integer division truncates
+	return ret;
+}
+
+//-----------------------------------------------------------------------
+// Calc age a timestamp should be before needing help: 3/2 of the largest moving average over the shard_factor entries starting at proc->rdq.id.
+forall(Data_t * | { unsigned long long ts(Data_t & this); })
+static inline unsigned long long calc_cutoff(
+	const unsigned long long ctsc,
+	const processor * proc,
+	size_t count,
+	Data_t * data,
+	__timestamp_t * tscs,
+	const unsigned shard_factor
+) {
+	unsigned start = proc->rdq.id;
+	unsigned long long max = 0;
+	for(i; shard_factor) {
+		unsigned long long ptsc = ts(data[start + i]);
+		if(ptsc != -1ull) { // entries whose timestamp is -1ull do not contribute
+			/* paranoid */ verify( start + i < count ); // NOTE(review): checked only after data[start + i] was already read above
+			unsigned long long tsc = moving_average(ctsc, ptsc, tscs[start + i].ma);
+			if(tsc > max) max = tsc;
+		}
+	}
+	return (max + 2 * max) / 2; // i.e. 3 * max / 2; max stays 0 if every entry was skipped
+}
+
+static inline unsigned cache_id(struct cluster * cltr, unsigned idx) with (cltr->sched) { // returns cpu_info.llc_map[cpu].cache for the current cpu, recording it in caches[idx].id
+	// Figure out the current cpu and make sure it is valid
+	const int cpu = __kernel_getcpu();
+	/* paranoid */ verify(cpu >= 0);
+	/* paranoid */ verify(cpu < cpu_info.hthrd_count);
+	unsigned this_cache = cpu_info.llc_map[cpu].cache;
+
+	// Super important: don't write the same value over and over again
+	// We want to maximise our chances that this particular value stays in cache
+	if(caches[idx].id != this_cache)
+		__atomic_store_n(&caches[idx].id, this_cache, __ATOMIC_RELAXED);
+
+	return this_cache;
+}
+
+static struct {
+	const unsigned readyq; // NOTE(review): presumably the shard_factor handed to calc_cutoff for the ready queue — confirm at call sites
+} __shard_factor = { 2 }; // readyq = 2
+
+// Local Variables: //
+// mode: c //
+// tab-width: 4 //
+// End: //
Index: libcfa/src/concurrency/kernel/private.hfa
===================================================================
--- libcfa/src/concurrency/kernel/private.hfa	(revision 708ae384ef22961b8c1c67282cb1dc6024011687)
+++ libcfa/src/concurrency/kernel/private.hfa	(revision 708ae384ef22961b8c1c67282cb1dc6024011687)
@@ -0,0 +1,374 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// kernel/private.hfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Mon Feb 13 12:27:26 2017
+// Last Modified By : Peter A. Buhr
+// Last Modified On : Wed Aug 12 08:21:33 2020
+// Update Count     : 9
+//
+
+#pragma once
+
+#if !defined(__cforall_thread__)
+	#error kernel/private.hfa should only be included in libcfathread source
+#endif
+
+#include "kernel.hfa"
+#include "thread.hfa"
+
+#include "alarm.hfa"
+#include "stats.hfa"
+
+extern "C" {
+#if   defined(CFA_HAVE_LINUX_LIBRSEQ)
+	#include <rseq/rseq.h>
+#elif defined(CFA_HAVE_LINUX_RSEQ_H)
+	#include <linux/rseq.h>
+#else
+	#ifndef _GNU_SOURCE
+	#error kernel/private requires gnu_source
+	#endif
+	#include <sched.h>
+#endif
+}
+
+// Defines whether or not we *want* to use io_uring_enter as the idle_sleep blocking call
+#define CFA_WANT_IO_URING_IDLE
+
+// Defines whether or not we *can* use io_uring_enter as the idle_sleep blocking call
+#if defined(CFA_WANT_IO_URING_IDLE) && defined(CFA_HAVE_LINUX_IO_URING_H)
+	#if defined(CFA_HAVE_IORING_OP_READ) || (defined(CFA_HAVE_READV) && defined(CFA_HAVE_IORING_OP_READV))
+		#define CFA_WITH_IO_URING_IDLE
+	#endif
+#endif
+
+//-----------------------------------------------------------------------------
+// Scheduler
+extern "C" {
+	void disable_interrupts() OPTIONAL_THREAD;
+	void enable_interrupts( bool poll = true );
+}
+
+void schedule_thread$( thread$ *, unpark_hint hint ) __attribute__((nonnull (1)));
+
+extern bool __preemption_enabled();
+
+enum {
+	PREEMPT_NORMAL    = 0,
+	PREEMPT_TERMINATE = 1,
+	PREEMPT_IO = 2,
+};
+
+static inline void __disable_interrupts_checked() { // disable_interrupts() with verify() checks: preemption enabled before, disabled after
+	/* paranoid */ verify( __preemption_enabled() );
+	disable_interrupts();
+	/* paranoid */ verify( ! __preemption_enabled() );
+}
+
+static inline void __enable_interrupts_checked( bool poll = true ) { // enable_interrupts( poll ) with verify() checks: preemption disabled before, enabled after
+	/* paranoid */ verify( ! __preemption_enabled() );
+	enable_interrupts( poll );
+	/* paranoid */ verify( __preemption_enabled() );
+}
+
+//release/wake-up the following resources
+void __thread_finish( thread$ * thrd );
+
+//-----------------------------------------------------------------------------
+// Hardware
+
+#if   defined(CFA_HAVE_LINUX_LIBRSEQ)
+	// No data needed
+#elif defined(CFA_HAVE_LINUX_RSEQ_H)
+	extern "Cforall" {
+		extern __attribute__((aligned(128))) thread_local volatile struct rseq __cfaabi_rseq;
+	}
+#else
+	// No data needed
+#endif
+
+static inline int __kernel_getcpu() { // cpu id of the caller, read via librseq, the raw rseq area, or sched_getcpu() depending on the CFA_HAVE_* macros
+	/* paranoid */ verify( ! __preemption_enabled() );
+#if   defined(CFA_HAVE_LINUX_LIBRSEQ)
+	return rseq_current_cpu();
+#elif defined(CFA_HAVE_LINUX_RSEQ_H)
+	int r = __cfaabi_rseq.cpu_id;
+	/* paranoid */ verify( r >= 0 );
+	return r;
+#else
+	return sched_getcpu();
+#endif
+}
+
+//-----------------------------------------------------------------------------
+// Processor
+void main(processorCtx_t *);
+
+void * __create_pthread( pthread_t *, void * (*)(void *), void * );
+void __destroy_pthread( pthread_t pthread, void * stack, void ** retval );
+
+extern cluster * mainCluster;
+
+//-----------------------------------------------------------------------------
+// Threads
+extern "C" {
+      void __cfactx_invoke_thread(void (*main)(void *), void * this);
+}
+
+__cfaabi_dbg_debug_do(
+	extern void __cfaabi_dbg_thread_register  ( thread$ * thrd );
+	extern void __cfaabi_dbg_thread_unregister( thread$ * thrd );
+)
+
+#define TICKET_BLOCKED (-1) // thread is blocked
+#define TICKET_RUNNING ( 0) // thread is running
+#define TICKET_UNBLOCK ( 1) // thread should ignore next block
+
+//-----------------------------------------------------------------------------
+// Utils
+void doregister( struct cluster * cltr, struct thread$ & thrd );
+void unregister( struct cluster * cltr, struct thread$ & thrd );
+
+//-----------------------------------------------------------------------------
+// I/O
+$io_arbiter * create(void);
+void destroy($io_arbiter *);
+
+//=======================================================================
+// Cluster lock API
+//=======================================================================
+// Lock-Free registering/unregistering of threads
+// Register a processor and get its unique id in return (no cluster argument is taken)
+unsigned register_proc_id( void );
+
+// Unregister a processor using its id (nothing is returned)
+void unregister_proc_id( unsigned );
+
+//=======================================================================
+// Reader-writer lock implementation
+// Concurrent with doregister/unregister,
+//    i.e., threads can be added at any point during or between the entry/exit
+
+//-----------------------------------------------------------------------
+// simple spinlock underlying the RWLock
+// Blocking acquire: exchange in true; while the flag was already set, spin on relaxed loads until it reads false, then retry
+static inline void __atomic_acquire(volatile bool * ll) {
+	while( __builtin_expect(__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST), false) ) {
+		while(__atomic_load_n(ll, (int)__ATOMIC_RELAXED))
+			Pause();
+	}
+	/* paranoid */ verify(*ll);
+}
+
+// Non-Blocking acquire: one exchange, returns true only if the flag was previously false
+static inline bool __atomic_try_acquire(volatile bool * ll) {
+	return !__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST);
+}
+
+// Release: store false with release ordering (verify() expects the flag to be set on entry)
+static inline void __atomic_unlock(volatile bool * ll) {
+	/* paranoid */ verify(*ll);
+	__atomic_store_n(ll, (bool)false, __ATOMIC_RELEASE);
+}
+
+//-----------------------------------------------------------------------
+// Reader-Writer lock protecting the ready-queues;
+// while this lock is mostly generic, some aspects
+// have been hard-coded for the ready-queue for
+// simplicity and performance
+struct __scheduler_RWLock_t {
+	// total cachelines allocated
+	unsigned int max;
+
+	// cachelines currently in use
+	volatile unsigned int alloc;
+
+	// cachelines ready to iterate over
+	// (!= to alloc when thread is in second half of doregister)
+	volatile unsigned int ready;
+
+	// writer lock
+	volatile bool write_lock;
+
+	// data pointer
+	volatile bool * volatile * data;
+};
+
+void  ?{}(__scheduler_RWLock_t & this);
+void ^?{}(__scheduler_RWLock_t & this);
+
+extern __scheduler_RWLock_t * __scheduler_lock;
+
+//-----------------------------------------------------------------------
+// Reader side : acquire when using the ready queue to schedule but not
+//  creating/destroying queues
+static inline void ready_schedule_lock(void) with(*__scheduler_lock) {
+	/* paranoid */ verify( ! __preemption_enabled() );
+	/* paranoid */ verify( ! kernelTLS().in_sched_lock );
+	/* paranoid */ verify( data[kernelTLS().sched_id] == &kernelTLS().sched_lock );
+	/* paranoid */ verify( !kernelTLS().this_processor || kernelTLS().this_processor->unique_id == kernelTLS().sched_id );
+
+	// Step 1 : make sure no writer is in the middle of the critical section
+	while(__atomic_load_n(&write_lock, (int)__ATOMIC_RELAXED))
+		Pause();
+
+	// Fence needed because we don't want to start trying to acquire the lock
+	// before we read a false.
+	// Not needed on x86
+	// std::atomic_thread_fence(std::memory_order_seq_cst);
+
+	// Step 2 : acquire our local lock
+	__atomic_acquire( &kernelTLS().sched_lock );
+	/* paranoid */ verify(kernelTLS().sched_lock);
+
+	#ifdef __CFA_WITH_VERIFY__
+		// Debug, check if this is owned for reading
+		kernelTLS().in_sched_lock = true;
+	#endif
+}
+
+static inline void ready_schedule_unlock(void) with(*__scheduler_lock) { // reader side release: clears kernelTLS().sched_lock taken by ready_schedule_lock()
+	/* paranoid */ verify( ! __preemption_enabled() );
+	/* paranoid */ verify( data[kernelTLS().sched_id] == &kernelTLS().sched_lock );
+	/* paranoid */ verify( !kernelTLS().this_processor || kernelTLS().this_processor->unique_id == kernelTLS().sched_id );
+	/* paranoid */ verify( kernelTLS().sched_lock );
+	/* paranoid */ verify( kernelTLS().in_sched_lock );
+	#ifdef __CFA_WITH_VERIFY__
+		// Debug, check if this is owned for reading
+		kernelTLS().in_sched_lock = false;
+	#endif
+	__atomic_unlock(&kernelTLS().sched_lock);
+}
+
+#ifdef __CFA_WITH_VERIFY__ // the two predicates below exist only when __CFA_WITH_VERIFY__ is defined
+	static inline bool ready_schedule_islocked(void) { // current value of the reader-side flag kernelTLS().sched_lock
+		/* paranoid */ verify( ! __preemption_enabled() );
+		/* paranoid */ verify( (!kernelTLS().in_sched_lock) || kernelTLS().sched_lock );
+		return kernelTLS().sched_lock;
+	}
+
+	static inline bool ready_mutate_islocked() { // current value of the writer flag __scheduler_lock->write_lock
+		return __scheduler_lock->write_lock;
+	}
+#endif
+
+//-----------------------------------------------------------------------
+// Writer side : acquire when changing the ready queue, e.g. adding more
+//  queues or removing them.
+uint_fast32_t ready_mutate_lock( void );
+
+void ready_mutate_unlock( uint_fast32_t /* value returned by lock */ );
+
+//-----------------------------------------------------------------------
+// Convenience wrapper: obtain a unique id from register_proc_id(),
+// then acquire the writer lock with ready_mutate_lock()
+// returns [unique id, value returned by ready_mutate_lock()]
+static inline [unsigned, uint_fast32_t] ready_mutate_register() {
+	unsigned id = register_proc_id();
+	uint_fast32_t last = ready_mutate_lock();
+	return [id, last];
+}
+
+// Release the writer lock (last_s is passed to ready_mutate_unlock()), then unregister the processor id; nothing is returned
+// assumes the lock is acquired
+static inline void ready_mutate_unregister( unsigned id, uint_fast32_t last_s ) {
+	ready_mutate_unlock( last_s );
+	unregister_proc_id( id );
+}
+
+//-----------------------------------------------------------------------
+// Cluster idle lock/unlock
+static inline void lock(__cluster_proc_list & this) { // takes the reader-side schedule lock first, then this.lock
+	/* paranoid */ verify( ! __preemption_enabled() );
+
+	// Start by locking the global RWlock so that we know no-one is
+	// adding/removing processors while we mess with the idle lock
+	ready_schedule_lock();
+
+	lock( this.lock __cfaabi_dbg_ctx2 );
+
+	/* paranoid */ verify( ! __preemption_enabled() );
+}
+
+static inline bool try_lock(__cluster_proc_list & this) { // true: both locks are held; false: the reader-side lock was released again
+	/* paranoid */ verify( ! __preemption_enabled() );
+
+	// Start by locking the global RWlock so that we know no-one is
+	// adding/removing processors while we mess with the idle lock
+	ready_schedule_lock();
+
+	if(try_lock( this.lock __cfaabi_dbg_ctx2 )) {
+		// success
+		/* paranoid */ verify( ! __preemption_enabled() );
+		return true;
+	}
+
+	// failed to lock
+	ready_schedule_unlock();
+
+	/* paranoid */ verify( ! __preemption_enabled() );
+	return false;
+}
+
+static inline void unlock(__cluster_proc_list & this) { // releases in the reverse order of lock(): this.lock, then the reader-side schedule lock
+	/* paranoid */ verify( ! __preemption_enabled() );
+
+	unlock(this.lock);
+
+	// Release the global lock, which we acquired when locking
+	ready_schedule_unlock();
+
+	/* paranoid */ verify( ! __preemption_enabled() );
+}
+
+//=======================================================================
+// Ready-Queue API
+//-----------------------------------------------------------------------
+// push thread onto a ready queue for a cluster
+// returns nothing (the declaration is void)
+__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint);
+
+//-----------------------------------------------------------------------
+// pop thread from the local queues of a cluster
+// returns 0p if empty
+// May return 0p spuriously
+__attribute__((hot)) struct thread$ * pop_fast(struct cluster * cltr);
+
+//-----------------------------------------------------------------------
+// pop thread from any ready queue of a cluster
+// returns 0p if empty
+// May return 0p spuriously
+__attribute__((hot)) struct thread$ * pop_slow(struct cluster * cltr);
+
+//-----------------------------------------------------------------------
+// search all ready queues of a cluster for any thread
+// returns 0p if empty
+// guaranteed to find any threads added before this call
+__attribute__((hot)) struct thread$ * pop_search(struct cluster * cltr);
+
+//-----------------------------------------------------------------------
+// get preferred ready for new thread
+unsigned ready_queue_new_preferred();
+
+//-----------------------------------------------------------------------
+// Increase the width of the ready queue (number of lanes) by 4
+void ready_queue_grow  (struct cluster * cltr);
+
+//-----------------------------------------------------------------------
+// Decrease the width of the ready queue (number of lanes) by 4
+void ready_queue_shrink(struct cluster * cltr);
+
+//-----------------------------------------------------------------------
+// Close the ready queue of a cluster (NOTE(review): exact effect not visible here; the previous text duplicated ready_queue_shrink)
+void ready_queue_close(struct cluster * cltr);
+
+// Local Variables: //
+// mode: c //
+// tab-width: 4 //
+// End: //
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision 48a91e270c570a6f6635a022d7dfafbbd208229b)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision 708ae384ef22961b8c1c67282cb1dc6024011687)
@@ -32,5 +32,5 @@
 
 // CFA Includes
-#include "kernel_private.hfa"
+#include "kernel/private.hfa"
 #include "startup.hfa"					// STARTUP_PRIORITY_XXX
 #include "limits.hfa"
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 48a91e270c570a6f6635a022d7dfafbbd208229b)
+++ 	(revision )
@@ -1,417 +1,0 @@
-//
-// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
-//
-// The contents of this file are covered under the licence agreement in the
-// file "LICENCE" distributed with Cforall.
-//
-// kernel_private.hfa --
-//
-// Author           : Thierry Delisle
-// Created On       : Mon Feb 13 12:27:26 2017
-// Last Modified By : Peter A. Buhr
-// Last Modified On : Wed Aug 12 08:21:33 2020
-// Update Count     : 9
-//
-
-#pragma once
-
-#if !defined(__cforall_thread__)
-	#error kernel_private.hfa should only be included in libcfathread source
-#endif
-
-#include "kernel.hfa"
-#include "thread.hfa"
-
-#include "alarm.hfa"
-#include "stats.hfa"
-
-extern "C" {
-#if   defined(CFA_HAVE_LINUX_LIBRSEQ)
-	#include <rseq/rseq.h>
-#elif defined(CFA_HAVE_LINUX_RSEQ_H)
-	#include <linux/rseq.h>
-#else
-	#ifndef _GNU_SOURCE
-	#error kernel_private requires gnu_source
-	#endif
-	#include <sched.h>
-#endif
-}
-
-// Defines whether or not we *want* to use io_uring_enter as the idle_sleep blocking call
-#define CFA_WANT_IO_URING_IDLE
-
-// Defines whether or not we *can* use io_uring_enter as the idle_sleep blocking call
-#if defined(CFA_WANT_IO_URING_IDLE) && defined(CFA_HAVE_LINUX_IO_URING_H)
-	#if defined(CFA_HAVE_IORING_OP_READ) || (defined(CFA_HAVE_READV) && defined(CFA_HAVE_IORING_OP_READV))
-		#define CFA_WITH_IO_URING_IDLE
-	#endif
-#endif
-
-//-----------------------------------------------------------------------------
-// Scheduler
-extern "C" {
-	void disable_interrupts() OPTIONAL_THREAD;
-	void enable_interrupts( bool poll = true );
-}
-
-void schedule_thread$( thread$ *, unpark_hint hint ) __attribute__((nonnull (1)));
-
-extern bool __preemption_enabled();
-
-enum {
-	PREEMPT_NORMAL    = 0,
-	PREEMPT_TERMINATE = 1,
-	PREEMPT_IO = 2,
-};
-
-static inline void __disable_interrupts_checked() {
-	/* paranoid */ verify( __preemption_enabled() );
-	disable_interrupts();
-	/* paranoid */ verify( ! __preemption_enabled() );
-}
-
-static inline void __enable_interrupts_checked( bool poll = true ) {
-	/* paranoid */ verify( ! __preemption_enabled() );
-	enable_interrupts( poll );
-	/* paranoid */ verify( __preemption_enabled() );
-}
-
-//release/wake-up the following resources
-void __thread_finish( thread$ * thrd );
-
-//-----------------------------------------------------------------------------
-// Hardware
-
-#if   defined(CFA_HAVE_LINUX_LIBRSEQ)
-	// No data needed
-#elif defined(CFA_HAVE_LINUX_RSEQ_H)
-	extern "Cforall" {
-		extern __attribute__((aligned(128))) thread_local volatile struct rseq __cfaabi_rseq;
-	}
-#else
-	// No data needed
-#endif
-
-static inline int __kernel_getcpu() {
-	/* paranoid */ verify( ! __preemption_enabled() );
-#if   defined(CFA_HAVE_LINUX_LIBRSEQ)
-	return rseq_current_cpu();
-#elif defined(CFA_HAVE_LINUX_RSEQ_H)
-	int r = __cfaabi_rseq.cpu_id;
-	/* paranoid */ verify( r >= 0 );
-	return r;
-#else
-	return sched_getcpu();
-#endif
-}
-
-//-----------------------------------------------------------------------------
-// Processor
-void main(processorCtx_t *);
-
-void * __create_pthread( pthread_t *, void * (*)(void *), void * );
-void __destroy_pthread( pthread_t pthread, void * stack, void ** retval );
-
-extern cluster * mainCluster;
-
-//-----------------------------------------------------------------------------
-// Threads
-extern "C" {
-      void __cfactx_invoke_thread(void (*main)(void *), void * this);
-}
-
-__cfaabi_dbg_debug_do(
-	extern void __cfaabi_dbg_thread_register  ( thread$ * thrd );
-	extern void __cfaabi_dbg_thread_unregister( thread$ * thrd );
-)
-
-#define TICKET_BLOCKED (-1) // thread is blocked
-#define TICKET_RUNNING ( 0) // thread is running
-#define TICKET_UNBLOCK ( 1) // thread should ignore next block
-
-//-----------------------------------------------------------------------------
-// Utils
-void doregister( struct cluster * cltr, struct thread$ & thrd );
-void unregister( struct cluster * cltr, struct thread$ & thrd );
-
-//-----------------------------------------------------------------------------
-// I/O
-$io_arbiter * create(void);
-void destroy($io_arbiter *);
-
-//=======================================================================
-// Cluster lock API
-//=======================================================================
-// Lock-Free registering/unregistering of threads
-// Register a processor to a given cluster and get its unique id in return
-unsigned register_proc_id( void );
-
-// Unregister a processor from a given cluster using its id, getting back the original pointer
-void unregister_proc_id( unsigned );
-
-//=======================================================================
-// Reader-writer lock implementation
-// Concurrent with doregister/unregister,
-//    i.e., threads can be added at any point during or between the entry/exit
-
-//-----------------------------------------------------------------------
-// simple spinlock underlying the RWLock
-// Blocking acquire
-static inline void __atomic_acquire(volatile bool * ll) {
-	while( __builtin_expect(__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST), false) ) {
-		while(__atomic_load_n(ll, (int)__ATOMIC_RELAXED))
-			Pause();
-	}
-	/* paranoid */ verify(*ll);
-}
-
-// Non-Blocking acquire
-static inline bool __atomic_try_acquire(volatile bool * ll) {
-	return !__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST);
-}
-
-// Release
-static inline void __atomic_unlock(volatile bool * ll) {
-	/* paranoid */ verify(*ll);
-	__atomic_store_n(ll, (bool)false, __ATOMIC_RELEASE);
-}
-
-//-----------------------------------------------------------------------
-// Reader-Writer lock protecting the ready-queues
-// while this lock is mostly generic some aspects
-// have been hard-coded to for the ready-queue for
-// simplicity and performance
-struct __scheduler_RWLock_t {
-	// total cachelines allocated
-	unsigned int max;
-
-	// cachelines currently in use
-	volatile unsigned int alloc;
-
-	// cachelines ready to itereate over
-	// (!= to alloc when thread is in second half of doregister)
-	volatile unsigned int ready;
-
-	// writer lock
-	volatile bool write_lock;
-
-	// data pointer
-	volatile bool * volatile * data;
-};
-
-void  ?{}(__scheduler_RWLock_t & this);
-void ^?{}(__scheduler_RWLock_t & this);
-
-extern __scheduler_RWLock_t * __scheduler_lock;
-
-//-----------------------------------------------------------------------
-// Reader side : acquire when using the ready queue to schedule but not
-//  creating/destroying queues
-static inline void ready_schedule_lock(void) with(*__scheduler_lock) {
-	/* paranoid */ verify( ! __preemption_enabled() );
-	/* paranoid */ verify( ! kernelTLS().in_sched_lock );
-	/* paranoid */ verify( data[kernelTLS().sched_id] == &kernelTLS().sched_lock );
-	/* paranoid */ verify( !kernelTLS().this_processor || kernelTLS().this_processor->unique_id == kernelTLS().sched_id );
-
-	// Step 1 : make sure no writer are in the middle of the critical section
-	while(__atomic_load_n(&write_lock, (int)__ATOMIC_RELAXED))
-		Pause();
-
-	// Fence needed because we don't want to start trying to acquire the lock
-	// before we read a false.
-	// Not needed on x86
-	// std::atomic_thread_fence(std::memory_order_seq_cst);
-
-	// Step 2 : acquire our local lock
-	__atomic_acquire( &kernelTLS().sched_lock );
-	/*paranoid*/ verify(kernelTLS().sched_lock);
-
-	#ifdef __CFA_WITH_VERIFY__
-		// Debug, check if this is owned for reading
-		kernelTLS().in_sched_lock = true;
-	#endif
-}
-
-static inline void ready_schedule_unlock(void) with(*__scheduler_lock) {
-	/* paranoid */ verify( ! __preemption_enabled() );
-	/* paranoid */ verify( data[kernelTLS().sched_id] == &kernelTLS().sched_lock );
-	/* paranoid */ verify( !kernelTLS().this_processor || kernelTLS().this_processor->unique_id == kernelTLS().sched_id );
-	/* paranoid */ verify( kernelTLS().sched_lock );
-	/* paranoid */ verify( kernelTLS().in_sched_lock );
-	#ifdef __CFA_WITH_VERIFY__
-		// Debug, check if this is owned for reading
-		kernelTLS().in_sched_lock = false;
-	#endif
-	__atomic_unlock(&kernelTLS().sched_lock);
-}
-
-#ifdef __CFA_WITH_VERIFY__
-	static inline bool ready_schedule_islocked(void) {
-		/* paranoid */ verify( ! __preemption_enabled() );
-		/* paranoid */ verify( (!kernelTLS().in_sched_lock) || kernelTLS().sched_lock );
-		return kernelTLS().sched_lock;
-	}
-
-	static inline bool ready_mutate_islocked() {
-		return __scheduler_lock->write_lock;
-	}
-#endif
-
-//-----------------------------------------------------------------------
-// Writer side : acquire when changing the ready queue, e.g. adding more
-//  queues or removing them.
-uint_fast32_t ready_mutate_lock( void );
-
-void ready_mutate_unlock( uint_fast32_t /* value returned by lock */ );
-
-//-----------------------------------------------------------------------
-// Lock-Free registering/unregistering of threads
-// Register a processor to a given cluster and get its unique id in return
-// For convenience, also acquires the lock
-static inline [unsigned, uint_fast32_t] ready_mutate_register() {
-	unsigned id = register_proc_id();
-	uint_fast32_t last = ready_mutate_lock();
-	return [id, last];
-}
-
-// Unregister a processor from a given cluster using its id, getting back the original pointer
-// assumes the lock is acquired
-static inline void ready_mutate_unregister( unsigned id, uint_fast32_t last_s ) {
-	ready_mutate_unlock( last_s );
-	unregister_proc_id( id );
-}
-
-//-----------------------------------------------------------------------
-// Cluster idle lock/unlock
-static inline void lock(__cluster_proc_list & this) {
-	/* paranoid */ verify( ! __preemption_enabled() );
-
-	// Start by locking the global RWlock so that we know no-one is
-	// adding/removing processors while we mess with the idle lock
-	ready_schedule_lock();
-
-	lock( this.lock __cfaabi_dbg_ctx2 );
-
-	/* paranoid */ verify( ! __preemption_enabled() );
-}
-
-static inline bool try_lock(__cluster_proc_list & this) {
-	/* paranoid */ verify( ! __preemption_enabled() );
-
-	// Start by locking the global RWlock so that we know no-one is
-	// adding/removing processors while we mess with the idle lock
-	ready_schedule_lock();
-
-	if(try_lock( this.lock __cfaabi_dbg_ctx2 )) {
-		// success
-		/* paranoid */ verify( ! __preemption_enabled() );
-		return true;
-	}
-
-	// failed to lock
-	ready_schedule_unlock();
-
-	/* paranoid */ verify( ! __preemption_enabled() );
-	return false;
-}
-
-static inline void unlock(__cluster_proc_list & this) {
-	/* paranoid */ verify( ! __preemption_enabled() );
-
-	unlock(this.lock);
-
-	// Release the global lock, which we acquired when locking
-	ready_schedule_unlock();
-
-	/* paranoid */ verify( ! __preemption_enabled() );
-}
-
-//=======================================================================
-// Ready-Queue API
-//-----------------------------------------------------------------------
-// push thread onto a ready queue for a cluster
-// returns true if the list was previously empty, false otherwise
-__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint);
-
-//-----------------------------------------------------------------------
-// pop thread from the local queues of a cluster
-// returns 0p if empty
-// May return 0p spuriously
-__attribute__((hot)) struct thread$ * pop_fast(struct cluster * cltr);
-
-//-----------------------------------------------------------------------
-// pop thread from any ready queue of a cluster
-// returns 0p if empty
-// May return 0p spuriously
-__attribute__((hot)) struct thread$ * pop_slow(struct cluster * cltr);
-
-//-----------------------------------------------------------------------
-// search all ready queues of a cluster for any thread
-// returns 0p if empty
-// guaranteed to find any threads added before this call
-__attribute__((hot)) struct thread$ * pop_search(struct cluster * cltr);
-
-//-----------------------------------------------------------------------
-// get preferred ready for new thread
-unsigned ready_queue_new_preferred();
-
-//-----------------------------------------------------------------------
-// Increase the width of the ready queue (number of lanes) by 4
-void ready_queue_grow  (struct cluster * cltr);
-
-//-----------------------------------------------------------------------
-// Decrease the width of the ready queue (number of lanes) by 4
-void ready_queue_shrink(struct cluster * cltr);
-
-//-----------------------------------------------------------------------
-// Decrease the width of the ready queue (number of lanes) by 4
-void ready_queue_close(struct cluster * cltr);
-
-//-----------------------------------------------------------------------
-// Calc moving average based on existing average, before and current time.
-static inline unsigned long long moving_average(unsigned long long currtsc, unsigned long long instsc, unsigned long long old_avg) {
-	/* paranoid */ verifyf( currtsc < 45000000000000000, "Suspiciously large current time: %'llu (%llx)\n", currtsc, currtsc );
-	/* paranoid */ verifyf( instsc  < 45000000000000000, "Suspiciously large insert time: %'llu (%llx)\n", instsc, instsc );
-	/* paranoid */ verifyf( old_avg < 15000000000000, "Suspiciously large previous average: %'llu (%llx)\n", old_avg, old_avg );
-
-	const unsigned long long new_val = currtsc > instsc ? currtsc - instsc : 0;
-	const unsigned long long total_weight = 16;
-	const unsigned long long new_weight   = 4;
-	const unsigned long long old_weight = total_weight - new_weight;
-	const unsigned long long ret = ((new_weight * new_val) + (old_weight * old_avg)) / total_weight;
-	return ret;
-}
-
-//-----------------------------------------------------------------------
-// Calc age a timestamp should be before needing help.
-forall(Data_t * | { unsigned long long ts(Data_t & this); })
-static inline unsigned long long calc_cutoff(
-	const unsigned long long ctsc,
-	const processor * proc,
-	size_t count,
-	Data_t * data,
-	__timestamp_t * tscs,
-	const unsigned shard_factor
-) {
-	unsigned start = proc->rdq.id;
-	unsigned long long max = 0;
-	for(i; shard_factor) {
-		unsigned long long ptsc = ts(data[start + i]);
-		if(ptsc != -1ull) {
-			/* paranoid */ verify( start + i < count );
-			unsigned long long tsc = moving_average(ctsc, ptsc, tscs[start + i].ma);
-			if(tsc > max) max = tsc;
-		}
-	}
-	return (max + 2 * max) / 2;
-}
-
-static struct {
-	const unsigned readyq;
-} __shard_factor __attribute__((unused)) = { 2 };
-
-// Local Variables: //
-// mode: c //
-// tab-width: 4 //
-// End: //
Index: libcfa/src/concurrency/locks.cfa
===================================================================
--- libcfa/src/concurrency/locks.cfa	(revision 48a91e270c570a6f6635a022d7dfafbbd208229b)
+++ libcfa/src/concurrency/locks.cfa	(revision 708ae384ef22961b8c1c67282cb1dc6024011687)
@@ -19,5 +19,5 @@
 
 #include "locks.hfa"
-#include "kernel_private.hfa"
+#include "kernel/private.hfa"
 
 #include <kernel.hfa>
Index: libcfa/src/concurrency/monitor.cfa
===================================================================
--- libcfa/src/concurrency/monitor.cfa	(revision 48a91e270c570a6f6635a022d7dfafbbd208229b)
+++ libcfa/src/concurrency/monitor.cfa	(revision 708ae384ef22961b8c1c67282cb1dc6024011687)
@@ -22,5 +22,5 @@
 #include <inttypes.h>
 
-#include "kernel_private.hfa"
+#include "kernel/private.hfa"
 
 #include "bits/algorithm.hfa"
Index: libcfa/src/concurrency/mutex.cfa
===================================================================
--- libcfa/src/concurrency/mutex.cfa	(revision 48a91e270c570a6f6635a022d7dfafbbd208229b)
+++ libcfa/src/concurrency/mutex.cfa	(revision 708ae384ef22961b8c1c67282cb1dc6024011687)
@@ -21,5 +21,5 @@
 #include "mutex.hfa"
 
-#include "kernel_private.hfa"
+#include "kernel/private.hfa"
 
 //-----------------------------------------------------------------------------
Index: libcfa/src/concurrency/preemption.cfa
===================================================================
--- libcfa/src/concurrency/preemption.cfa	(revision 48a91e270c570a6f6635a022d7dfafbbd208229b)
+++ libcfa/src/concurrency/preemption.cfa	(revision 708ae384ef22961b8c1c67282cb1dc6024011687)
@@ -31,5 +31,5 @@
 #include "bits/debug.hfa"
 #include "bits/signal.hfa"
-#include "kernel_private.hfa"
+#include "kernel/private.hfa"
 
 
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision 48a91e270c570a6f6635a022d7dfafbbd208229b)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision 708ae384ef22961b8c1c67282cb1dc6024011687)
@@ -24,5 +24,6 @@
 #include "bits/defs.hfa"
 #include "device/cpu.hfa"
-#include "kernel_private.hfa"
+#include "kernel/cluster.hfa"
+#include "kernel/private.hfa"
 
 #include "limits.hfa"
@@ -122,15 +123,6 @@
 	__cfadbg_print_safe(ready_queue, "Kernel : pop from %u\n", this);
 
-	// Figure out the current cpu and make sure it is valid
-	const int cpu = __kernel_getcpu();
-	/* paranoid */ verify(cpu >= 0);
-	/* paranoid */ verify(cpu < cpu_info.hthrd_count);
-	unsigned this_cache = cpu_info.llc_map[cpu].cache;
-
-	// Super important: don't write the same value over and over again
-	// We want to maximise our chances that his particular values stays in cache
-	if(caches[this / __shard_factor.readyq].id != this_cache)
-		__atomic_store_n(&caches[this / __shard_factor.readyq].id, this_cache, __ATOMIC_RELAXED);
-
+	// Figure out what the current cache is
+	const unsigned this_cache = cache_id(cltr, this / __shard_factor.readyq);
 	const unsigned long long ctsc = rdtscl();
 
Index: libcfa/src/concurrency/thread.cfa
===================================================================
--- libcfa/src/concurrency/thread.cfa	(revision 48a91e270c570a6f6635a022d7dfafbbd208229b)
+++ libcfa/src/concurrency/thread.cfa	(revision 708ae384ef22961b8c1c67282cb1dc6024011687)
@@ -19,5 +19,5 @@
 #include "thread.hfa"
 
-#include "kernel_private.hfa"
+#include "kernel/private.hfa"
 #include "exception.hfa"