Index: libcfa/src/common.hfa
===================================================================
--- libcfa/src/common.hfa	(revision b5f17e14065831c1bd56680005fa1f733098d29f)
+++ libcfa/src/common.hfa	(revision 038a0bd82f1bc2ad25f823b8d9291fb043a0a201)
@@ -1,10 +1,10 @@
-// 
+//
 // Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
 //
 // The contents of this file are covered under the licence agreement in the
 // file "LICENCE" distributed with Cforall.
-// 
-// common -- 
-// 
+//
+// common.hfa --
+//
 // Author           : Peter A. Buhr
 // Created On       : Wed Jul 11 17:54:36 2018
@@ -12,5 +12,5 @@
 // Last Modified On : Wed May  5 14:02:04 2021
 // Update Count     : 18
-// 
+//
 
 #pragma once
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision b5f17e14065831c1bd56680005fa1f733098d29f)
+++ libcfa/src/concurrency/io.cfa	(revision 038a0bd82f1bc2ad25f823b8d9291fb043a0a201)
@@ -144,34 +144,38 @@
 		__ioarbiter_flush( ctx );
 
-		__STATS__( true, io.calls.flush++; )
-		int ret = syscall( __NR_io_uring_enter, ctx.fd, ctx.sq.to_submit, min_comp, min_comp > 0 ? IORING_ENTER_GETEVENTS : 0, (sigset_t *)0p, _NSIG / 8);
-		if( ret < 0 ) {
-			switch((int)errno) {
-			case EAGAIN:
-			case EINTR:
-			case EBUSY:
-				// Update statistics
-				__STATS__( false, io.calls.errors.busy ++; )
-				return false;
-			default:
-				abort( "KERNEL ERROR: IO_URING SYSCALL - (%d) %s\n", (int)errno, strerror(errno) );
+		if(ctx.sq.to_submit != 0 || min_comp > 0) {
+
+			__STATS__( true, io.calls.flush++; )
+			int ret = syscall( __NR_io_uring_enter, ctx.fd, ctx.sq.to_submit, min_comp, min_comp > 0 ? IORING_ENTER_GETEVENTS : 0, (sigset_t *)0p, _NSIG / 8);
+			if( ret < 0 ) {
+				switch((int)errno) {
+				case EAGAIN:
+				case EINTR:
+				case EBUSY:
+					// Update statistics
+					__STATS__( false, io.calls.errors.busy ++; )
+					return false;
+				default:
+					abort( "KERNEL ERROR: IO_URING SYSCALL - (%d) %s\n", (int)errno, strerror(errno) );
+				}
 			}
-		}
-
-		__cfadbg_print_safe(io, "Kernel I/O : %u submitted to io_uring %d\n", ret, ctx.fd);
-		__STATS__( true, io.calls.submitted += ret; )
-		/* paranoid */ verify( ctx.sq.to_submit <= *ctx.sq.num );
-		/* paranoid */ verify( ctx.sq.to_submit >= ret );
-
-		ctx.sq.to_submit -= ret;
-
-		/* paranoid */ verify( ctx.sq.to_submit <= *ctx.sq.num );
-
-		// Release the consumed SQEs
-		__release_sqes( ctx );
-
-		/* paranoid */ verify( ! __preemption_enabled() );
-
-		ctx.proc->io.pending = false;
+
+			__cfadbg_print_safe(io, "Kernel I/O : %u submitted to io_uring %d\n", ret, ctx.fd);
+			__STATS__( true, io.calls.submitted += ret; )
+			/* paranoid */ verify( ctx.sq.to_submit <= *ctx.sq.num );
+			/* paranoid */ verify( ctx.sq.to_submit >= ret );
+
+			ctx.sq.to_submit -= ret;
+
+			/* paranoid */ verify( ctx.sq.to_submit <= *ctx.sq.num );
+
+			// Release the consumed SQEs
+			__release_sqes( ctx );
+
+			/* paranoid */ verify( ! __preemption_enabled() );
+
+			ctx.proc->io.pending = false;
+		}
+
 		ready_schedule_lock();
 		bool ret = __cfa_io_drain( proc );
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision b5f17e14065831c1bd56680005fa1f733098d29f)
+++ libcfa/src/concurrency/kernel.hfa	(revision 038a0bd82f1bc2ad25f823b8d9291fb043a0a201)
@@ -68,5 +68,4 @@
 		unsigned last;
 		signed   cpu;
-		// unsigned long long int cutoff;
 	} rdq;
 
@@ -154,5 +153,5 @@
 };
 
-struct __attribute__((aligned(128))) __cache_id_t {
+struct __attribute__((aligned(16))) __cache_id_t {
 	volatile unsigned id;
 };
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision b5f17e14065831c1bd56680005fa1f733098d29f)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision 038a0bd82f1bc2ad25f823b8d9291fb043a0a201)
@@ -303,9 +303,4 @@
 			lanes.help[idx].dst = 0;
 			lanes.help[idx].tri = 0;
-		}
-
-		caches = alloc( cpu_info.llc_count );
-		for( idx; (size_t)cpu_info.llc_count ) {
-			(caches[idx]){};
 		}
 	#else
@@ -404,5 +399,9 @@
 		/* paranoid */ verify(cpu < cpu_info.hthrd_count);
 		unsigned this_cache = cpu_info.llc_map[cpu].cache;
-		__atomic_store_n(&lanes.caches[this / READYQ_SHARD_FACTOR].id, this_cache, __ATOMIC_RELAXED);
+
+		// Super important: don't write the same value over and over again
+		// We want to maximise our chances that this particular value stays in cache
+		if(lanes.caches[this / READYQ_SHARD_FACTOR].id != this_cache)
+			__atomic_store_n(&lanes.caches[this / READYQ_SHARD_FACTOR].id, this_cache, __ATOMIC_RELAXED);
 
 		const unsigned long long ctsc = rdtscl();
@@ -506,22 +505,4 @@
 	}
 
-	static inline int pop_getcpu(processor * proc, __ready_queue_caches_t * caches) {
-		const int prv = proc->rdq.cpu;
-		const int cpu = __kernel_getcpu();
-		if( prv != proc->rdq.cpu ) {
-			unsigned pidx = cpu_info.llc_map[prv].cache;
-			/* paranoid */ verify(pidx < cpu_info.llc_count);
-
-			unsigned nidx = cpu_info.llc_map[cpu].cache;
-			/* paranoid */ verify(pidx < cpu_info.llc_count);
-
-			depart(caches[pidx]);
-			arrive(caches[nidx]);
-
-			__STATS( /* cpu migs++ */ )
-		}
-		return proc->rdq.cpu = cpu;
-	}
-
 	// Pop from the ready queue from a given cluster
 	__attribute__((hot)) thread$ * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
@@ -530,6 +511,5 @@
 
 		processor * const proc = kernelTLS().this_processor;
-		const int cpu = pop_getcpu( proc, caches );
-		// const int cpu = __kernel_getcpu();
+		const int cpu = __kernel_getcpu();
 		/* paranoid */ verify(cpu >= 0);
 		/* paranoid */ verify(cpu < cpu_info.hthrd_count);
@@ -548,5 +528,5 @@
 			unsigned long long max = 0;
 			for(i; READYQ_SHARD_FACTOR) {
-				unsigned long long tsc = moving_average(ctsc - ts(lanes.data[start + i]), lanes.tscs[start + i].ma);
+				unsigned long long tsc = moving_average(ctsc, ts(lanes.data[start + i]), lanes.tscs[start + i].ma);
 				if(tsc > max) max = tsc;
 			}
@@ -569,5 +549,5 @@
 			unsigned long long max = 0;
 			for(i; READYQ_SHARD_FACTOR) {
-				unsigned long long tsc = moving_average(ctsc - ts(lanes.data[start + i]), lanes.tscs[start + i].ma);
+				unsigned long long tsc = moving_average(ctsc, ts(lanes.data[start + i]), lanes.tscs[start + i].ma);
 				if(tsc > max) max = tsc;
 			}
@@ -577,6 +557,5 @@
 				proc->rdq.target = MAX;
 				lanes.help[target / READYQ_SHARD_FACTOR].tri++;
-				if(moving_average(ctsc - lanes.tscs[target].tv, lanes.tscs[target].ma) > cutoff) {
-					__STATS( __tls_stats()->ready.pop.helped[target]++; )
+				if(moving_average(ctsc, lanes.tscs[target].tv, lanes.tscs[target].ma) > cutoff) {
 					thread$ * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help));
 					proc->rdq.last = target;
@@ -587,6 +566,5 @@
 
 			unsigned last = proc->rdq.last;
-			if(last != MAX && moving_average(ctsc - lanes.tscs[last].tv, lanes.tscs[last].ma) > cutoff) {
-				__STATS( __tls_stats()->ready.pop.helped[last]++; )
+			if(last != MAX && moving_average(ctsc, lanes.tscs[last].tv, lanes.tscs[last].ma) > cutoff) {
 				thread$ * t = try_pop(cltr, last __STATS(, __tls_stats()->ready.pop.help));
 				if(t) return t;