Index: libcfa/configure.ac
===================================================================
--- libcfa/configure.ac	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/configure.ac	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -131,4 +131,10 @@
 #io_uring 5.5 uses enum values
 #io_uring 5.6 and later uses probes
+
+AH_TEMPLATE([CFA_HAVE_LINUX_RSEQ_H],[Defined if rseq support is present when compiling libcfathread.])
+AC_CHECK_HEADERS([linux/rseq.h], [AC_DEFINE(CFA_HAVE_LINUX_RSEQ_H)])
+
+AH_TEMPLATE([CFA_HAVE_LINUX_LIBRSEQ],[Defined if librseq support is present when compiling libcfathread.])
+AC_CHECK_LIB([rseq], [rseq_available], [AC_DEFINE(CFA_HAVE_LINUX_LIBRSEQ)], [])
 
 AH_TEMPLATE([CFA_HAVE_LINUX_IO_URING_H],[Defined if io_uring support is present when compiling libcfathread.])
Index: libcfa/prelude/defines.hfa.in
===================================================================
--- libcfa/prelude/defines.hfa.in	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/prelude/defines.hfa.in	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -171,4 +171,10 @@
 #undef CFA_HAVE_LINUX_IO_URING_H
 
+/* Defined if librseq support is present when compiling libcfathread. */
+#undef CFA_HAVE_LINUX_LIBRSEQ
+
+/* Defined if rseq support is present when compiling libcfathread. */
+#undef CFA_HAVE_LINUX_RSEQ_H
+
 /* Defined if openat2 support is present when compiling libcfathread. */
 #undef CFA_HAVE_OPENAT2
@@ -205,4 +211,7 @@
 #undef HAVE_LINUX_IO_URING_H
 
+/* Define to 1 if you have the <linux/rseq.h> header file. */
+#undef HAVE_LINUX_RSEQ_H
+
 /* Define to 1 if you have the <memory.h> header file. */
 #undef HAVE_MEMORY_H
Index: libcfa/src/Makefile.am
===================================================================
--- libcfa/src/Makefile.am	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/Makefile.am	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -61,4 +61,5 @@
 	containers/queueLockFree.hfa \
 	containers/stackLockFree.hfa \
+	containers/vector2.hfa \
 	vec/vec.hfa \
 	vec/vec2.hfa \
@@ -69,5 +70,4 @@
 	common.hfa \
 	fstream.hfa \
-	strstream.hfa \
 	heap.hfa \
 	iostream.hfa \
@@ -78,4 +78,5 @@
 	rational.hfa \
 	stdlib.hfa \
+	strstream.hfa \
 	time.hfa \
 	bits/weakso_locks.hfa \
@@ -83,5 +84,6 @@
 	containers/pair.hfa \
 	containers/result.hfa \
-	containers/vector.hfa
+	containers/vector.hfa \
+	device/cpu.hfa
 
 libsrc = ${inst_headers_src} ${inst_headers_src:.hfa=.cfa} \
Index: libcfa/src/bits/signal.hfa
===================================================================
--- libcfa/src/bits/signal.hfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/bits/signal.hfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -20,7 +20,5 @@
 
 #include <errno.h>
-#define __USE_GNU
 #include <signal.h>
-#undef __USE_GNU
 #include <stdlib.h>
 #include <string.h>
Index: libcfa/src/concurrency/coroutine.cfa
===================================================================
--- libcfa/src/concurrency/coroutine.cfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/coroutine.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -15,4 +15,5 @@
 
 #define __cforall_thread__
+#define _GNU_SOURCE
 
 #include "coroutine.hfa"
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/io.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -15,4 +15,5 @@
 
 #define __cforall_thread__
+#define _GNU_SOURCE
 
 #if defined(__CFA_DEBUG__)
@@ -23,5 +24,4 @@
 
 #if defined(CFA_HAVE_LINUX_IO_URING_H)
-	#define _GNU_SOURCE         /* See feature_test_macros(7) */
 	#include <errno.h>
 	#include <signal.h>
Index: libcfa/src/concurrency/io/setup.cfa
===================================================================
--- libcfa/src/concurrency/io/setup.cfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/io/setup.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -15,5 +15,5 @@
 
 #define __cforall_thread__
-#define _GNU_SOURCE         /* See feature_test_macros(7) */
+#define _GNU_SOURCE
 
 #if defined(__CFA_DEBUG__)
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/kernel.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -15,4 +15,6 @@
 
 #define __cforall_thread__
+#define _GNU_SOURCE
+
 // #define __CFA_DEBUG_PRINT_RUNTIME_CORE__
 
@@ -278,5 +280,5 @@
 
 				// Spin a little on I/O, just in case
-					for(5) {
+				for(5) {
 					__maybe_io_drain( this );
 					readyThread = pop_fast( this->cltr );
@@ -285,5 +287,5 @@
 
 				// no luck, try stealing a few times
-					for(5) {
+				for(5) {
 					if( __maybe_io_drain( this ) ) {
 						readyThread = pop_fast( this->cltr );
@@ -422,4 +424,6 @@
 		__cfactx_switch( &proc_cor->context, &thrd_dst->context );
 		// when __cfactx_switch returns we are back in the processor coroutine
+
+
 
 		/* paranoid */ verify( 0x0D15EA5E0D15EA5Ep == thrd_dst->canary );
@@ -522,6 +526,6 @@
 
 	/* paranoid */ verify( ! __preemption_enabled() );
-	/* paranoid */ verifyf( ((uintptr_t)thrd_src->context.SP) < ((uintptr_t)__get_stack(thrd_src->curr_cor)->base ), "ERROR : Returning $thread %p has been corrupted.\n StackPointer too small.\n", thrd_src );
-	/* paranoid */ verifyf( ((uintptr_t)thrd_src->context.SP) > ((uintptr_t)__get_stack(thrd_src->curr_cor)->limit), "ERROR : Returning $thread %p has been corrupted.\n StackPointer too large.\n", thrd_src );
+	/* paranoid */ verifyf( ((uintptr_t)thrd_src->context.SP) < ((uintptr_t)__get_stack(thrd_src->curr_cor)->base ) || thrd_src->corctx_flag, "ERROR : Returning $thread %p has been corrupted.\n StackPointer too small.\n", thrd_src );
+	/* paranoid */ verifyf( ((uintptr_t)thrd_src->context.SP) > ((uintptr_t)__get_stack(thrd_src->curr_cor)->limit) || thrd_src->corctx_flag, "ERROR : Returning $thread %p has been corrupted.\n StackPointer too large.\n", thrd_src );
 }
 
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/kernel.hfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -66,4 +66,5 @@
 		unsigned id;
 		unsigned target;
+		unsigned last;
 		unsigned long long int cutoff;
 	} rdq;
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -15,11 +15,15 @@
 
 #define __cforall_thread__
+#define _GNU_SOURCE
 
 // C Includes
 #include <errno.h>              // errno
+#include <signal.h>
 #include <string.h>             // strerror
 #include <unistd.h>             // sysconf
+
 extern "C" {
       #include <limits.h>       // PTHREAD_STACK_MIN
+	#include <unistd.h>       // syscall
 	#include <sys/eventfd.h>  // eventfd
       #include <sys/mman.h>     // mprotect
@@ -136,4 +140,16 @@
 };
 
+#if   defined(CFA_HAVE_LINUX_LIBRSEQ)
+	// No data needed
+#elif defined(CFA_HAVE_LINUX_RSEQ_H)
+	extern "Cforall" {
+		__attribute__((aligned(128))) thread_local volatile struct rseq __cfaabi_rseq @= {
+			.cpu_id : RSEQ_CPU_ID_UNINITIALIZED,
+		};
+	}
+#else
+	// No data needed
+#endif
+
 //-----------------------------------------------------------------------------
 // Struct to steal stack
@@ -468,5 +484,5 @@
 	self_mon_p = &self_mon;
 	link.next = 0p;
-	link.ts   = 0;
+	link.ts   = -1llu;
 	preferred = -1u;
 	last_proc = 0p;
@@ -497,4 +513,5 @@
 	this.rdq.id  = -1u;
 	this.rdq.target = -1u;
+	this.rdq.last = -1u;
 	this.rdq.cutoff = 0ull;
 	do_terminate = false;
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -16,4 +16,8 @@
 #pragma once
 
+#if !defined(__cforall_thread__)
+	#error kernel_private.hfa should only be included in libcfathread source
+#endif
+
 #include "kernel.hfa"
 #include "thread.hfa"
@@ -22,8 +26,19 @@
 #include "stats.hfa"
 
+extern "C" {
+#if   defined(CFA_HAVE_LINUX_LIBRSEQ)
+	#include <rseq/rseq.h>
+#elif defined(CFA_HAVE_LINUX_RSEQ_H)
+	#include <linux/rseq.h>
+#else
+	#ifndef _GNU_SOURCE
+	#error kernel_private requires gnu_source
+	#endif
+	#include <sched.h>
+#endif
+}
+
 //-----------------------------------------------------------------------------
 // Scheduler
-
-
 extern "C" {
 	void disable_interrupts() OPTIONAL_THREAD;
@@ -39,4 +54,30 @@
 
 //-----------------------------------------------------------------------------
+// Hardware
+
+#if   defined(CFA_HAVE_LINUX_LIBRSEQ)
+	// No data needed
+#elif defined(CFA_HAVE_LINUX_RSEQ_H)
+	extern "Cforall" {
+		extern __attribute__((aligned(128))) thread_local volatile struct rseq __cfaabi_rseq;
+	}
+#else
+	// No data needed
+#endif
+
+static inline int __kernel_getcpu() {
+	/* paranoid */ verify( ! __preemption_enabled() );
+#if   defined(CFA_HAVE_LINUX_LIBRSEQ)
+	return rseq_current_cpu();
+#elif defined(CFA_HAVE_LINUX_RSEQ_H)
+	int r = __cfaabi_rseq.cpu_id;
+	/* paranoid */ verify( r >= 0 );
+	return r;
+#else
+	return sched_getcpu();
+#endif
+}
+
+//-----------------------------------------------------------------------------
 // Processor
 void main(processorCtx_t *);
@@ -44,6 +85,4 @@
 void * __create_pthread( pthread_t *, void * (*)(void *), void * );
 void __destroy_pthread( pthread_t pthread, void * stack, void ** retval );
-
-
 
 extern cluster * mainCluster;
Index: libcfa/src/concurrency/locks.cfa
===================================================================
--- libcfa/src/concurrency/locks.cfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/locks.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -16,4 +16,5 @@
 
 #define __cforall_thread__
+#define _GNU_SOURCE
 
 #include "locks.hfa"
Index: libcfa/src/concurrency/locks.hfa
===================================================================
--- libcfa/src/concurrency/locks.hfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/locks.hfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -24,4 +24,5 @@
 #include "containers/list.hfa"
 
+#include "limits.hfa"
 #include "thread.hfa"
 
@@ -87,4 +88,5 @@
 	bool tryP(BinaryBenaphore & this) {
 		ssize_t c = this.counter;
+		/* paranoid */ verify( c > MIN );
 		return (c >= 1) && __atomic_compare_exchange_n(&this.counter, &c, c-1, false, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
 	}
@@ -94,4 +96,5 @@
 		ssize_t c = 0;
 		for () {
+			/* paranoid */ verify( this.counter < MAX );
 			if (__atomic_compare_exchange_n(&this.counter, &c, c+1, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
 				if (c == 0) return true;
@@ -173,4 +176,6 @@
 	ThreadBenaphore sem;
 };
+
+static inline void ?{}(fast_lock & this) { this.owner = 0p; }
 
 static inline bool $try_lock(fast_lock & this, $thread * thrd) {
Index: libcfa/src/concurrency/monitor.cfa
===================================================================
--- libcfa/src/concurrency/monitor.cfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/monitor.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -15,4 +15,5 @@
 
 #define __cforall_thread__
+#define _GNU_SOURCE
 
 #include "monitor.hfa"
Index: libcfa/src/concurrency/mutex.cfa
===================================================================
--- libcfa/src/concurrency/mutex.cfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/mutex.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -17,4 +17,5 @@
 
 #define __cforall_thread__
+#define _GNU_SOURCE
 
 #include "mutex.hfa"
Index: libcfa/src/concurrency/preemption.cfa
===================================================================
--- libcfa/src/concurrency/preemption.cfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/preemption.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -15,4 +15,6 @@
 
 #define __cforall_thread__
+#define _GNU_SOURCE
+
 // #define __CFA_DEBUG_PRINT_PREEMPTION__
 
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -15,4 +15,6 @@
 
 #define __cforall_thread__
+#define _GNU_SOURCE
+
 // #define __CFA_DEBUG_PRINT_READY_QUEUE__
 
@@ -20,13 +22,19 @@
 #define USE_RELAXED_FIFO
 // #define USE_WORK_STEALING
+// #define USE_CPU_WORK_STEALING
 
 #include "bits/defs.hfa"
+#include "device/cpu.hfa"
 #include "kernel_private.hfa"
 
-#define _GNU_SOURCE
 #include "stdlib.hfa"
 #include "math.hfa"
 
+#include <errno.h>
 #include <unistd.h>
+
+extern "C" {
+	#include <sys/syscall.h>  // __NR_xxx
+}
 
 #include "ready_subqueue.hfa"
@@ -46,5 +54,7 @@
 #endif
 
-#if   defined(USE_RELAXED_FIFO)
+#if   defined(USE_CPU_WORK_STEALING)
+	#define READYQ_SHARD_FACTOR 2
+#elif defined(USE_RELAXED_FIFO)
 	#define BIAS 4
 	#define READYQ_SHARD_FACTOR 4
@@ -85,4 +95,23 @@
 }
 
+#if   defined(CFA_HAVE_LINUX_LIBRSEQ)
+	// No forward declaration needed
+	#define __kernel_rseq_register rseq_register_current_thread
+	#define __kernel_rseq_unregister rseq_unregister_current_thread
+#elif defined(CFA_HAVE_LINUX_RSEQ_H)
+	void __kernel_raw_rseq_register  (void);
+	void __kernel_raw_rseq_unregister(void);
+
+	#define __kernel_rseq_register __kernel_raw_rseq_register
+	#define __kernel_rseq_unregister __kernel_raw_rseq_unregister
+#else
+	// No forward declaration needed
+	// No initialization needed
+	static inline void noop(void) {}
+
+	#define __kernel_rseq_register noop
+	#define __kernel_rseq_unregister noop
+#endif
+
 //=======================================================================
 // Cluster wide reader-writer lock
@@ -107,4 +136,6 @@
 // Lock-Free registering/unregistering of threads
 unsigned register_proc_id( void ) with(*__scheduler_lock) {
+	__kernel_rseq_register();
+
 	__cfadbg_print_safe(ready_queue, "Kernel : Registering proc %p for RW-Lock\n", proc);
 	bool * handle = (bool *)&kernelTLS().sched_lock;
@@ -161,4 +192,6 @@
 
 	__cfadbg_print_safe(ready_queue, "Kernel : Unregister proc %p\n", proc);
+
+	__kernel_rseq_unregister();
 }
 
@@ -214,11 +247,25 @@
 //=======================================================================
 void ?{}(__ready_queue_t & this) with (this) {
-	lanes.data  = 0p;
-	lanes.tscs  = 0p;
-	lanes.count = 0;
+	#if defined(USE_CPU_WORK_STEALING)
+		lanes.count = cpu_info.hthrd_count * READYQ_SHARD_FACTOR;
+		lanes.data = alloc( lanes.count );
+		lanes.tscs = alloc( lanes.count );
+
+		for( idx; (size_t)lanes.count ) {
+			(lanes.data[idx]){};
+			lanes.tscs[idx].tv = rdtscl();
+		}
+	#else
+		lanes.data  = 0p;
+		lanes.tscs  = 0p;
+		lanes.count = 0;
+	#endif
 }
 
 void ^?{}(__ready_queue_t & this) with (this) {
-	verify( SEQUENTIAL_SHARD == lanes.count );
+	#if !defined(USE_CPU_WORK_STEALING)
+		verify( SEQUENTIAL_SHARD == lanes.count );
+	#endif
+
 	free(lanes.data);
 	free(lanes.tscs);
@@ -226,4 +273,143 @@
 
 //-----------------------------------------------------------------------
+#if defined(USE_CPU_WORK_STEALING)
+	__attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd, bool push_local) with (cltr->ready_queue) {
+		__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
+
+		processor * const proc = kernelTLS().this_processor;
+		const bool external = !push_local || (!proc) || (cltr != proc->cltr);
+
+		const int cpu = __kernel_getcpu();
+		/* paranoid */ verify(cpu >= 0);
+		/* paranoid */ verify(cpu < cpu_info.hthrd_count);
+		/* paranoid */ verify(cpu * READYQ_SHARD_FACTOR < lanes.count);
+
+		const cpu_map_entry_t & map = cpu_info.llc_map[cpu];
+		/* paranoid */ verify(map.start * READYQ_SHARD_FACTOR < lanes.count);
+		/* paranoid */ verify(map.self * READYQ_SHARD_FACTOR < lanes.count);
+		/* paranoid */ verifyf((map.start + map.count) * READYQ_SHARD_FACTOR <= lanes.count, "have %zu lanes but map can go up to %u", lanes.count, (map.start + map.count) * READYQ_SHARD_FACTOR);
+
+		const int start = map.self * READYQ_SHARD_FACTOR;
+		unsigned i;
+		do {
+			unsigned r;
+			if(unlikely(external)) { r = __tls_rand(); }
+			else { r = proc->rdq.its++; }
+			i = start + (r % READYQ_SHARD_FACTOR);
+			// If we can't lock it retry
+		} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+
+		// Actually push it
+		push(lanes.data[i], thrd);
+
+		// Unlock and return
+		__atomic_unlock( &lanes.data[i].lock );
+
+		#if !defined(__CFA_NO_STATISTICS__)
+			if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED);
+			else __tls_stats()->ready.push.local.success++;
+		#endif
+
+		__cfadbg_print_safe(ready_queue, "Kernel : Pushed %p on cluster %p (idx: %u, mask %llu, first %d)\n", thrd, cltr, i, used.mask[0], lane_first);
+
+	}
+
+	// Pop from the ready queue from a given cluster
+	__attribute__((hot)) $thread * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
+		/* paranoid */ verify( lanes.count > 0 );
+		/* paranoid */ verify( kernelTLS().this_processor );
+
+		const int cpu = __kernel_getcpu();
+		/* paranoid */ verify(cpu >= 0);
+		/* paranoid */ verify(cpu < cpu_info.hthrd_count);
+		/* paranoid */ verify(cpu * READYQ_SHARD_FACTOR < lanes.count);
+
+		const cpu_map_entry_t & map = cpu_info.llc_map[cpu];
+		/* paranoid */ verify(map.start * READYQ_SHARD_FACTOR < lanes.count);
+		/* paranoid */ verify(map.self * READYQ_SHARD_FACTOR < lanes.count);
+		/* paranoid */ verifyf((map.start + map.count) * READYQ_SHARD_FACTOR <= lanes.count, "have %zu lanes but map can go up to %u", lanes.count, (map.start + map.count) * READYQ_SHARD_FACTOR);
+
+		processor * const proc = kernelTLS().this_processor;
+		const int start = map.self * READYQ_SHARD_FACTOR;
+
+		// Did we already have a help target
+		if(proc->rdq.target == -1u) {
+			// if We don't have a
+			unsigned long long min = ts(lanes.data[start]);
+			for(i; READYQ_SHARD_FACTOR) {
+				unsigned long long tsc = ts(lanes.data[start + i]);
+				if(tsc < min) min = tsc;
+			}
+			proc->rdq.cutoff = min;
+
+			/* paranoid */ verify(lanes.count < 65536); // The following code assumes max 65536 cores.
+			/* paranoid */ verify(map.count < 65536); // The following code assumes max 65536 cores.
+			uint64_t chaos = __tls_rand();
+			uint64_t high_chaos = (chaos >> 32);
+			uint64_t  mid_chaos = (chaos >> 16) & 0xffff;
+			uint64_t  low_chaos = chaos & 0xffff;
+
+			unsigned me = map.self;
+			unsigned cpu_chaos = map.start + (mid_chaos % map.count);
+			bool global = cpu_chaos == me;
+
+			if(global) {
+				proc->rdq.target = high_chaos % lanes.count;
+			} else {
+				proc->rdq.target = (cpu_chaos * READYQ_SHARD_FACTOR) + (low_chaos % READYQ_SHARD_FACTOR);
+				/* paranoid */ verify(proc->rdq.target >= (map.start * READYQ_SHARD_FACTOR));
+				/* paranoid */ verify(proc->rdq.target <  ((map.start + map.count) * READYQ_SHARD_FACTOR));
+			}
+
+			/* paranoid */ verify(proc->rdq.target != -1u);
+		}
+		else {
+			const unsigned long long bias = 0; //2_500_000_000;
+			const unsigned long long cutoff = proc->rdq.cutoff > bias ? proc->rdq.cutoff - bias : proc->rdq.cutoff;
+			{
+				unsigned target = proc->rdq.target;
+				proc->rdq.target = -1u;
+				if(lanes.tscs[target].tv < cutoff && ts(lanes.data[target]) < cutoff) {
+					$thread * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help));
+					proc->rdq.last = target;
+					if(t) return t;
+				}
+			}
+
+			unsigned last = proc->rdq.last;
+			if(last != -1u && lanes.tscs[last].tv < cutoff && ts(lanes.data[last]) < cutoff) {
+				$thread * t = try_pop(cltr, last __STATS(, __tls_stats()->ready.pop.help));
+				if(t) return t;
+			}
+			else {
+				proc->rdq.last = -1u;
+			}
+		}
+
+		for(READYQ_SHARD_FACTOR) {
+			unsigned i = start + (proc->rdq.itr++ % READYQ_SHARD_FACTOR);
+			if($thread * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t;
+		}
+
+		// All lanes where empty return 0p
+		return 0p;
+	}
+
+	__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
+		processor * const proc = kernelTLS().this_processor;
+		unsigned last = proc->rdq.last;
+		if(last != -1u) {
+			struct $thread * t = try_pop(cltr, last __STATS(, __tls_stats()->ready.pop.steal));
+			if(t) return t;
+			proc->rdq.last = -1u;
+		}
+
+		unsigned i = __tls_rand() % lanes.count;
+		return try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.steal));
+	}
+	__attribute__((hot)) struct $thread * pop_search(struct cluster * cltr) {
+		return search(cltr);
+	}
+#endif
 #if defined(USE_RELAXED_FIFO)
 	//-----------------------------------------------------------------------
@@ -519,9 +705,9 @@
 					if(is_empty(sl)) {
 						assert( sl.anchor.next == 0p );
-						assert( sl.anchor.ts   == 0  );
+						assert( sl.anchor.ts   == -1llu );
 						assert( mock_head(sl)  == sl.prev );
 					} else {
 						assert( sl.anchor.next != 0p );
-						assert( sl.anchor.ts   != 0  );
+						assert( sl.anchor.ts   != -1llu );
 						assert( mock_head(sl)  != sl.prev );
 					}
@@ -573,134 +759,141 @@
 		lanes.tscs = alloc(lanes.count, lanes.tscs`realloc);
 		for(i; lanes.count) {
-			unsigned long long tsc = ts(lanes.data[i]);
-			lanes.tscs[i].tv = tsc != 0 ? tsc : rdtscl();
+			unsigned long long tsc1 = ts(lanes.data[i]);
+			unsigned long long tsc2 = rdtscl();
+			lanes.tscs[i].tv = min(tsc1, tsc2);
 		}
 	#endif
 }
 
-// Grow the ready queue
-void ready_queue_grow(struct cluster * cltr) {
-	size_t ncount;
-	int target = cltr->procs.total;
-
-	/* paranoid */ verify( ready_mutate_islocked() );
-	__cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue\n");
-
-	// Make sure that everything is consistent
-	/* paranoid */ check( cltr->ready_queue );
-
-	// grow the ready queue
-	with( cltr->ready_queue ) {
-		// Find new count
-		// Make sure we always have atleast 1 list
-		if(target >= 2) {
-			ncount = target * READYQ_SHARD_FACTOR;
-		} else {
-			ncount = SEQUENTIAL_SHARD;
-		}
-
-		// Allocate new array (uses realloc and memcpies the data)
-		lanes.data = alloc( ncount, lanes.data`realloc );
-
-		// Fix the moved data
-		for( idx; (size_t)lanes.count ) {
-			fix(lanes.data[idx]);
-		}
-
-		// Construct new data
-		for( idx; (size_t)lanes.count ~ ncount) {
-			(lanes.data[idx]){};
-		}
-
-		// Update original
-		lanes.count = ncount;
-	}
-
-	fix_times(cltr);
-
-	reassign_cltr_id(cltr);
-
-	// Make sure that everything is consistent
-	/* paranoid */ check( cltr->ready_queue );
-
-	__cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue done\n");
-
-	/* paranoid */ verify( ready_mutate_islocked() );
-}
-
-// Shrink the ready queue
-void ready_queue_shrink(struct cluster * cltr) {
-	/* paranoid */ verify( ready_mutate_islocked() );
-	__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue\n");
-
-	// Make sure that everything is consistent
-	/* paranoid */ check( cltr->ready_queue );
-
-	int target = cltr->procs.total;
-
-	with( cltr->ready_queue ) {
-		// Remember old count
-		size_t ocount = lanes.count;
-
-		// Find new count
-		// Make sure we always have atleast 1 list
-		lanes.count = target >= 2 ? target * READYQ_SHARD_FACTOR: SEQUENTIAL_SHARD;
-		/* paranoid */ verify( ocount >= lanes.count );
-		/* paranoid */ verify( lanes.count == target * READYQ_SHARD_FACTOR || target < 2 );
-
-		// for printing count the number of displaced threads
-		#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
-			__attribute__((unused)) size_t displaced = 0;
-		#endif
-
-		// redistribute old data
-		for( idx; (size_t)lanes.count ~ ocount) {
-			// Lock is not strictly needed but makes checking invariants much easier
-			__attribute__((unused)) bool locked = __atomic_try_acquire(&lanes.data[idx].lock);
-			verify(locked);
-
-			// As long as we can pop from this lane to push the threads somewhere else in the queue
-			while(!is_empty(lanes.data[idx])) {
-				struct $thread * thrd;
-				unsigned long long _;
-				[thrd, _] = pop(lanes.data[idx]);
-
-				push(cltr, thrd, true);
-
-				// for printing count the number of displaced threads
-				#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
-					displaced++;
-				#endif
-			}
-
-			// Unlock the lane
-			__atomic_unlock(&lanes.data[idx].lock);
-
-			// TODO print the queue statistics here
-
-			^(lanes.data[idx]){};
-		}
-
-		__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue displaced %zu threads\n", displaced);
-
-		// Allocate new array (uses realloc and memcpies the data)
-		lanes.data = alloc( lanes.count, lanes.data`realloc );
-
-		// Fix the moved data
-		for( idx; (size_t)lanes.count ) {
-			fix(lanes.data[idx]);
-		}
-	}
-
-	fix_times(cltr);
-
-	reassign_cltr_id(cltr);
-
-	// Make sure that everything is consistent
-	/* paranoid */ check( cltr->ready_queue );
-
-	__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue done\n");
-	/* paranoid */ verify( ready_mutate_islocked() );
-}
+#if defined(USE_CPU_WORK_STEALING)
+	// ready_queue size is fixed in this case
+	void ready_queue_grow(struct cluster * cltr) {}
+	void ready_queue_shrink(struct cluster * cltr) {}
+#else
+	// Grow the ready queue
+	void ready_queue_grow(struct cluster * cltr) {
+		size_t ncount;
+		int target = cltr->procs.total;
+
+		/* paranoid */ verify( ready_mutate_islocked() );
+		__cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue\n");
+
+		// Make sure that everything is consistent
+		/* paranoid */ check( cltr->ready_queue );
+
+		// grow the ready queue
+		with( cltr->ready_queue ) {
+			// Find new count
+			// Make sure we always have atleast 1 list
+			if(target >= 2) {
+				ncount = target * READYQ_SHARD_FACTOR;
+			} else {
+				ncount = SEQUENTIAL_SHARD;
+			}
+
+			// Allocate new array (uses realloc and memcpies the data)
+			lanes.data = alloc( ncount, lanes.data`realloc );
+
+			// Fix the moved data
+			for( idx; (size_t)lanes.count ) {
+				fix(lanes.data[idx]);
+			}
+
+			// Construct new data
+			for( idx; (size_t)lanes.count ~ ncount) {
+				(lanes.data[idx]){};
+			}
+
+			// Update original
+			lanes.count = ncount;
+		}
+
+		fix_times(cltr);
+
+		reassign_cltr_id(cltr);
+
+		// Make sure that everything is consistent
+		/* paranoid */ check( cltr->ready_queue );
+
+		__cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue done\n");
+
+		/* paranoid */ verify( ready_mutate_islocked() );
+	}
+
+	// Shrink the ready queue
+	void ready_queue_shrink(struct cluster * cltr) {
+		/* paranoid */ verify( ready_mutate_islocked() );
+		__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue\n");
+
+		// Make sure that everything is consistent
+		/* paranoid */ check( cltr->ready_queue );
+
+		int target = cltr->procs.total;
+
+		with( cltr->ready_queue ) {
+			// Remember old count
+			size_t ocount = lanes.count;
+
+			// Find new count
+			// Make sure we always have atleast 1 list
+			lanes.count = target >= 2 ? target * READYQ_SHARD_FACTOR: SEQUENTIAL_SHARD;
+			/* paranoid */ verify( ocount >= lanes.count );
+			/* paranoid */ verify( lanes.count == target * READYQ_SHARD_FACTOR || target < 2 );
+
+			// for printing count the number of displaced threads
+			#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
+				__attribute__((unused)) size_t displaced = 0;
+			#endif
+
+			// redistribute old data
+			for( idx; (size_t)lanes.count ~ ocount) {
+				// Lock is not strictly needed but makes checking invariants much easier
+				__attribute__((unused)) bool locked = __atomic_try_acquire(&lanes.data[idx].lock);
+				verify(locked);
+
+				// As long as we can pop from this lane to push the threads somewhere else in the queue
+				while(!is_empty(lanes.data[idx])) {
+					struct $thread * thrd;
+					unsigned long long _;
+					[thrd, _] = pop(lanes.data[idx]);
+
+					push(cltr, thrd, true);
+
+					// for printing count the number of displaced threads
+					#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
+						displaced++;
+					#endif
+				}
+
+				// Unlock the lane
+				__atomic_unlock(&lanes.data[idx].lock);
+
+				// TODO print the queue statistics here
+
+				^(lanes.data[idx]){};
+			}
+
+			__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue displaced %zu threads\n", displaced);
+
+			// Allocate new array (uses realloc and memcpies the data)
+			lanes.data = alloc( lanes.count, lanes.data`realloc );
+
+			// Fix the moved data
+			for( idx; (size_t)lanes.count ) {
+				fix(lanes.data[idx]);
+			}
+		}
+
+		fix_times(cltr);
+
+		reassign_cltr_id(cltr);
+
+		// Make sure that everything is consistent
+		/* paranoid */ check( cltr->ready_queue );
+
+		__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue done\n");
+		/* paranoid */ verify( ready_mutate_islocked() );
+	}
+#endif
 
 #if !defined(__CFA_NO_STATISTICS__)
@@ -710,2 +903,59 @@
 	}
 #endif
+
+
+#if   defined(CFA_HAVE_LINUX_LIBRSEQ)
+	// No definition needed
+#elif defined(CFA_HAVE_LINUX_RSEQ_H)
+
+	#if defined( __x86_64 ) || defined( __i386 )
+		#define RSEQ_SIG	0x53053053
+	#elif defined( __ARM_ARCH )
+		#ifdef __ARMEB__
+		#define RSEQ_SIG    0xf3def5e7      /* udf    #24035    ; 0x5de3 (ARMv6+) */
+		#else
+		#define RSEQ_SIG    0xe7f5def3      /* udf    #24035    ; 0x5de3 */
+		#endif
+	#endif
+
+	extern void __disable_interrupts_hard();
+	extern void __enable_interrupts_hard();
+
+	void __kernel_raw_rseq_register  (void) {
+		/* paranoid */ verify( __cfaabi_rseq.cpu_id == RSEQ_CPU_ID_UNINITIALIZED );
+
+		// int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), 0, (sigset_t *)0p, _NSIG / 8);
+		int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), 0, RSEQ_SIG);
+		if(ret != 0) {
+			int e = errno;
+			switch(e) {
+			case EINVAL: abort("KERNEL ERROR: rseq register invalid argument");
+			case ENOSYS: abort("KERNEL ERROR: rseq register not supported");
+			case EFAULT: abort("KERNEL ERROR: rseq register with invalid argument");
+			case EBUSY : abort("KERNEL ERROR: rseq register already registered");
+			case EPERM : abort("KERNEL ERROR: rseq register sig  argument  on unregistration does not match the signature received on registration");
+			default: abort("KERNEL ERROR: rseq register unexpected return %d", e);
+			}
+		}
+	}
+
+	void __kernel_raw_rseq_unregister(void) {
+		/* paranoid */ verify( __cfaabi_rseq.cpu_id >= 0 );
+
+		// int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, (sigset_t *)0p, _NSIG / 8);
+		int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
+		if(ret != 0) {
+			int e = errno;
+			switch(e) {
+			case EINVAL: abort("KERNEL ERROR: rseq unregister invalid argument");
+			case ENOSYS: abort("KERNEL ERROR: rseq unregister not supported");
+			case EFAULT: abort("KERNEL ERROR: rseq unregister with invalid argument");
+			case EBUSY : abort("KERNEL ERROR: rseq unregister already registered");
+			case EPERM : abort("KERNEL ERROR: rseq unregister sig  argument  on unregistration does not match the signature received on registration");
+			default: abort("KERNEL ERROR: rseq unregister unexpected return %d", e);
+			}
+		}
+	}
+#else
+	// No definition needed
+#endif
Index: libcfa/src/concurrency/ready_subqueue.hfa
===================================================================
--- libcfa/src/concurrency/ready_subqueue.hfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/ready_subqueue.hfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -32,5 +32,5 @@
 	this.prev = mock_head(this);
 	this.anchor.next = 0p;
-	this.anchor.ts   = 0;
+	this.anchor.ts   = -1llu;
 	#if !defined(__CFA_NO_STATISTICS__)
 		this.cnt  = 0;
@@ -44,5 +44,5 @@
 	/* paranoid */ verify( &mock_head(this)->link.ts   == &this.anchor.ts   );
 	/* paranoid */ verify( mock_head(this)->link.next == 0p );
-	/* paranoid */ verify( mock_head(this)->link.ts   == 0  );
+	/* paranoid */ verify( mock_head(this)->link.ts   == -1llu  );
 	/* paranoid */ verify( mock_head(this) == this.prev );
 	/* paranoid */ verify( __alignof__(__intrusive_lane_t) == 128 );
@@ -55,5 +55,5 @@
 	// Make sure the list is empty
 	/* paranoid */ verify( this.anchor.next == 0p );
-	/* paranoid */ verify( this.anchor.ts   == 0  );
+	/* paranoid */ verify( this.anchor.ts   == -1llu );
 	/* paranoid */ verify( mock_head(this)  == this.prev );
 }
@@ -64,13 +64,15 @@
 	/* paranoid */ verify( this.lock );
 	/* paranoid */ verify( node->link.next == 0p );
-	/* paranoid */ verify( node->link.ts   == 0  );
+	/* paranoid */ verify( node->link.ts   == -1llu  );
 	/* paranoid */ verify( this.prev->link.next == 0p );
-	/* paranoid */ verify( this.prev->link.ts   == 0  );
+	/* paranoid */ verify( this.prev->link.ts   == -1llu  );
 	if( this.anchor.next == 0p ) {
 		/* paranoid */ verify( this.anchor.next == 0p );
-		/* paranoid */ verify( this.anchor.ts   == 0  );
+		/* paranoid */ verify( this.anchor.ts   == -1llu );
+		/* paranoid */ verify( this.anchor.ts   != 0  );
 		/* paranoid */ verify( this.prev == mock_head( this ) );
 	} else {
 		/* paranoid */ verify( this.anchor.next != 0p );
+		/* paranoid */ verify( this.anchor.ts   != -1llu );
 		/* paranoid */ verify( this.anchor.ts   != 0  );
 		/* paranoid */ verify( this.prev != mock_head( this ) );
@@ -92,4 +94,5 @@
 	/* paranoid */ verify( this.lock );
 	/* paranoid */ verify( this.anchor.next != 0p );
+	/* paranoid */ verify( this.anchor.ts   != -1llu );
 	/* paranoid */ verify( this.anchor.ts   != 0  );
 
@@ -99,7 +102,7 @@
 	this.anchor.next = node->link.next;
 	this.anchor.ts   = node->link.ts;
-	bool is_empty = this.anchor.ts == 0;
+	bool is_empty = this.anchor.next == 0p;
 	node->link.next = 0p;
-	node->link.ts   = 0;
+	node->link.ts   = -1llu;
 	#if !defined(__CFA_NO_STATISTICS__)
 		this.cnt--;
@@ -110,5 +113,7 @@
 
 	/* paranoid */ verify( node->link.next == 0p );
-	/* paranoid */ verify( node->link.ts   == 0  );
+	/* paranoid */ verify( node->link.ts   == -1llu  );
+	/* paranoid */ verify( node->link.ts   != 0  );
+	/* paranoid */ verify( this.anchor.ts  != 0  );
 	return [node, ts];
 }
@@ -116,5 +121,5 @@
 // Check whether or not list is empty
 static inline bool is_empty(__intrusive_lane_t & this) {
-	return this.anchor.ts == 0;
+	return this.anchor.next == 0p;
 }
 
@@ -122,4 +127,5 @@
 static inline unsigned long long ts(__intrusive_lane_t & this) {
 	// Cannot verify here since it may not be locked
+	/* paranoid */ verify(this.anchor.ts != 0);
 	return this.anchor.ts;
 }
Index: libcfa/src/concurrency/thread.cfa
===================================================================
--- libcfa/src/concurrency/thread.cfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/thread.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -15,4 +15,5 @@
 
 #define __cforall_thread__
+#define _GNU_SOURCE
 
 #include "thread.hfa"
@@ -39,5 +40,5 @@
 	curr_cluster = &cl;
 	link.next = 0p;
-	link.ts   = 0;
+	link.ts   = -1llu;
 	preferred = -1u;
 	last_proc = 0p;
Index: libcfa/src/containers/array.hfa
===================================================================
--- libcfa/src/containers/array.hfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/containers/array.hfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -1,13 +1,7 @@
 
 
-// a type whose size is n
-#define Z(n) char[n]
-
-// the inverse of Z(-)
-#define z(N) sizeof(N)
-
-forall( T & ) struct tag {};
+forall( __CFA_tysys_id_only_X & ) struct tag {};
 #define ttag(T) ((tag(T)){})
-#define ztag(n) ttag(Z(n))
+#define ztag(n) ttag(n)
 
 
@@ -18,5 +12,5 @@
 forall( [N], S & | sized(S), Timmed &, Tbase & ) {
     struct arpk {
-        S strides[z(N)];
+        S strides[N];
     };
 
@@ -56,14 +50,14 @@
 
     static inline size_t ?`len( arpk(N, S, Timmed, Tbase) & a ) {
-        return z(N);
+        return N;
     }
 
     // workaround #226 (and array relevance thereof demonstrated in mike102/otype-slow-ndims.cfa)
     static inline void ?{}( arpk(N, S, Timmed, Tbase) & this ) {
-        void ?{}( S (&inner)[z(N)] ) {}
+        void ?{}( S (&inner)[N] ) {}
         ?{}(this.strides);
     }
     static inline void ^?{}( arpk(N, S, Timmed, Tbase) & this ) {
-        void ^?{}( S (&inner)[z(N)] ) {}
+        void ^?{}( S (&inner)[N] ) {}
         ^?{}(this.strides);
     }
@@ -143,9 +137,15 @@
 // Base
 forall( [Nq], Sq & | sized(Sq), Tbase & )
-static inline tag(arpk(Nq, Sq, Tbase, Tbase)) enq_( tag(Tbase), tag(Nq), tag(Sq), tag(Tbase) ) {}
+static inline tag(arpk(Nq, Sq, Tbase, Tbase)) enq_( tag(Tbase), tag(Nq), tag(Sq), tag(Tbase) ) {
+    tag(arpk(Nq, Sq, Tbase, Tbase)) ret;
+    return ret;
+}
 
 // Rec
 forall( [Nq], Sq & | sized(Sq), [N], S & | sized(S), recq &, recr &, Tbase & | { tag(recr) enq_( tag(Tbase), tag(Nq), tag(Sq), tag(recq) ); } )
-static inline tag(arpk(N, S, recr, Tbase)) enq_( tag(Tbase), tag(Nq), tag(Sq), tag(arpk(N, S, recq, Tbase)) ) {}
+static inline tag(arpk(N, S, recr, Tbase)) enq_( tag(Tbase), tag(Nq), tag(Sq), tag(arpk(N, S, recq, Tbase)) ) {
+    tag(arpk(N, S, recr, Tbase)) ret;
+    return ret;
+}
 
 // Wrapper
Index: libcfa/src/containers/vector2.hfa
===================================================================
--- libcfa/src/containers/vector2.hfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
+++ libcfa/src/containers/vector2.hfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -0,0 +1,355 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// vector -- A growable array, with full-service iterators
+//
+// Author           : Michael Brooks
+// Created On       : Thu Jun 23 22:00:00 2021
+// Last Modified By : Michael Brooks
+// Last Modified On : Thu Jun 23 22:00:00 2021
+// Update Count     : 1
+//
+
+#include <stdlib.hfa>
+#include "list.hfa"
+
+forall( T ) {
+    struct vector;
+    
+    struct vector_transit {
+        vector(T) * col_$;
+        ptrdiff_t idx_$;
+    };
+
+    struct vector_exit {
+        vector(T) * invec_$;
+        T * item_$;
+    };
+
+    struct vector_permit {
+        vector(T) * invec_$;
+        T * item_$;
+        inline dlink(vector_permit(T));
+    };
+    P9_EMBEDDED(vector_permit(T), dlink(vector_permit(T)))
+
+    struct vector {
+        T * buffer_first_$;
+        T * buffer_end_$;
+        T * elems_first_$;
+        T * elems_end_$; // wrapped before storing, never == buffer_end_$
+        size_t exit_refcount_$;
+        dlist(vector_permit(T)) live_iters_$;
+    };
+}
+
+static inline
+forall( T ) {
+    
+    // vector
+
+    void ?{}( vector( T ) &, size_t capacity );
+    void ^?{}( vector( T ) & );
+
+    void ?{}( vector( T ) & ) = void;
+    void ?{}( vector( T ) &, vector( T ) & ) = void;
+    vector( T ) & ?=?( vector( T ) &, vector( T ) & ) = void;
+
+    // transit
+
+    void ?{}( vector_transit(T) & ) = void;
+    void ?{}( vector_transit(T) &, vector_transit(T) & );
+    void ^?{}( vector_transit(T) & );
+
+    T ?`val( vector_transit(T) & src );
+    void ?=?( vector_transit(T) & dst, T val );
+
+    // exit
+
+    void ?{}( vector_exit(T) & ) = void;
+    void ?{}( vector_exit(T) &, vector(T) * ) = void;
+
+    void ^?{}( vector_exit(T) & );
+    void ?{}( vector_exit(T) &, vector_transit(T) & );
+    void ?{}( vector_exit(T) &, vector_exit(T) & );
+
+    T ?`val( vector_exit(T) & src );
+    void ?=?( vector_exit(T) & dst, T val );
+    T & ?=?( T & dst, vector_exit(T) & src );
+    void ?*=?( T & dst, vector_exit(T) & src );
+
+    bool ?`moveNext( vector_exit(T) & it );
+
+    // permit
+
+    void ?{}( vector_permit(T) & ) = void;
+
+    void ^?{}( vector_permit(T) & );
+    void ?{}( vector_permit(T) &, vector_transit(T) & );
+    void ?{}( vector_permit(T) &, vector_exit(T) & );
+    void ?{}( vector_permit(T) &, vector_permit(T) & ) = void;
+
+    T ?`val( vector_permit(T) & src );
+
+    // api
+
+    vector_transit(T) push_last( vector( T ) & col, T val );
+    vector_transit(T) ?[?]( vector( T ) &, ptrdiff_t );
+    vector_exit(T) ?`origin( vector( T ) & );
+    size_t ?`capacity( vector(T) & );
+    size_t ?`length( vector(T) & );
+
+    void insert_before( vector( T ) & col, ptrdiff_t idx, T val );
+
+}
+
+static inline
+forall( T ) {
+
+    // vector
+
+    void ?{}( vector( T ) & this, size_t capacity ) {
+        (this.buffer_first_$){ aalloc( capacity ) };
+        (this.buffer_end_$){ this.buffer_first_$ + capacity};
+        (this.elems_first_$){ 0p };
+        (this.elems_end_$){ this.buffer_first_$ };
+        (this.exit_refcount_$){ 0 };
+        (this.live_iters_$){};
+    }
+
+    void ^?{}( vector( T ) & this ) {
+        assert( this.exit_refcount_$ == 0 );
+        free( this.buffer_first_$ );
+        this.buffer_first_$ = 0p;
+        this.buffer_end_$ = 0p;
+        this.elems_first_$ = 0p;
+        this.elems_end_$ = 0p;
+    }
+
+    // transit 
+
+    void ?{}( vector_transit(T) & this, vector_transit(T) & other ) {
+        // call autogen constructor deleted at end of hfa
+        (this){ other.col_$, other.idx_$ };
+    }
+
+    void ^?{}( vector_transit(T) & ) {}
+
+
+    vector_transit(T) ?[?]( vector( T ) & vec, ptrdiff_t idx ) {
+        // call autogen constructor deleted at end of hfa
+        vector_transit(T) ret = { & vec, idx };
+        return ret;
+    }
+
+    T & findElemMem_$( vector(T) & v, ptrdiff_t idx ) {
+        size_t len = v`length;
+        while (idx > len) idx -= len;
+        while (idx < 0  ) idx += len;
+        T * ret = v.elems_first_$ + idx;
+        if (ret < v.buffer_end_$) return *ret;
+        ret -= (v.buffer_end_$ - v.buffer_first_$);
+        assert( v.buffer_first_$ <= ret && ret < v.elems_end_$ );
+        return *ret;
+    }
+
+    T ?`val( vector_transit(T) & src ) {
+        T ret = findElemMem_$( *src.col_$, src.idx_$ );
+        return ret;
+    }
+
+    void ?=?( vector_transit(T) & src, T val ) {
+        findElemMem_$( *src.col_$, src.idx_$ ) = val;
+    }
+
+    // exit
+
+    void ?{}( vector_exit(T) & this, vector_transit(T) & src ) {
+        ( this.invec_$ ){ src.col_$ };
+        ( this.item_$ ){ & findElemMem_$( *src.col_$, src.idx_$ ) };
+
+        this.invec_$->exit_refcount_$ ++;
+    }
+    void ?{}( vector_exit(T) & this, vector_exit(T) & src ){
+        ( this.invec_$ ){ src.invec_$ };
+        ( this.item_$ ){ src.item_$ };
+
+        this.invec_$->exit_refcount_$ ++;
+    }
+
+    void ^?{}( vector_exit(T) & it ) {
+        it.invec_$->exit_refcount_$ --;
+    }
+
+    T ?`val( vector_exit(T) & src ) {
+        return *src.item_$;
+    }
+
+    void ?*=?( T & dst, vector_exit(T) & src ) {
+        dst = *src.item_$;
+    }
+
+    bool ?`moveNext( vector_exit(T) & it ) {
+        if (it.invec_$->elems_first_$ == 0p) {
+            // vector is empty
+            assert ( it.item_$ == 0p ); // it was at origin
+            return false;
+        }
+        assert( it.invec_$->elems_first_$ < it.invec_$->elems_end_$ && "can't handle wraparound yet" ); // temporary: must implement
+        if( it.item_$ == 0p ) {
+            // moving from origin
+            it.item_$ = it.invec_$->elems_first_$;
+        } else {
+            it.item_$ += 1;
+            if( it.item_$ > it.invec_$->buffer_end_$ )
+                it.item_$ = it.invec_$->buffer_first_$;
+        }
+        if ( it.item_$ >= it.invec_$->elems_end_$ ) {
+            // moving to origin
+            it.item_$ = 0p;
+            return false;
+        } else {
+            return true;
+        }
+    }
+
+    // permit
+
+    void ^?{}( vector_permit(T) & this ) {
+        remove(this);
+    }
+
+    void ?{}( vector_permit(T) & this, vector_transit(T) & src ) {
+        ( this.invec_$ ){ src.col_$ };
+        ( this.item_$ ){ & findElemMem_$( *src.col_$, src.idx_$ ) };
+        insert_first( src.col_$->live_iters_$, this );
+    }
+
+    void ?{}( vector_permit(T) & this, vector_exit(T) & src ) {
+        ( this.invec_$ ){ src.invec_$ };
+        ( this.item_$ ){ src.item_$ };
+        insert_first( src.invec_$->live_iters_$, this );
+    }
+
+    T ?`val( vector_permit(T) & src ){
+        return *src.item_$;
+    }
+
+    // vec internals
+
+    void grow( vector( T ) & this ) {
+        size_t newCapacity = 2 * (this.buffer_end_$ - this.buffer_first_$);
+        T * newItems = aalloc( newCapacity );
+        size_t elemCount = this`length;
+        for ( ptrdiff_t pos = 0; pos < elemCount; pos += 1 ) {
+            newItems[pos] = findElemMem_$(this, pos);
+        }
+
+        while ( vector_permit(T) & liveIter = this.live_iters_$`elems; liveIter`moveNext ) {
+            liveIter.item_$ += (newItems - this.buffer_first_$);
+        }
+
+        free( this.buffer_first_$ );
+        this.buffer_first_$ = newItems;
+        this.buffer_end_$ = newItems + newCapacity;
+        this.elems_first_$ = this.buffer_first_$;
+        this.elems_end_$ = this.buffer_first_$ + elemCount;
+        assert (this.elems_end_$ < this.buffer_end_$);
+    }
+
+    // vec api
+
+    vector_transit(T) push_last( vector( T ) & col, T val ) {
+        assert (col.exit_refcount_$ == 0);
+        if (col`length >= col`capacity) {
+            assert (col`length == col`capacity);
+            grow(col);
+        }
+        // call autogen constructor deleted at end of hfa
+        vector_transit(T) ret = { & col, col`length };
+        *col.elems_end_$ = val;
+        if (col.elems_first_$ == 0p) col.elems_first_$ = col.elems_end_$;
+        col.elems_end_$ += 1;
+        if (col.elems_end_$ >= col.buffer_end_$) col.elems_end_$ = col.buffer_first_$;
+        return ret;
+    }
+
+    vector_exit(T) ?`origin( vector( T ) & vec ) {
+
+        // private memberwise constructor, deleted from global namespace at end
+        // autogen constructor would not do the raii
+        void ?{}( vector_exit(T) & this, vector(T) * invec_$, T * item_$ ) {
+            ( this.invec_$ ){ invec_$ };
+            ( this.item_$ ){ item_$ };
+            this.invec_$->exit_refcount_$ ++;
+        }
+
+        vector_exit(T) ret = { &vec, 0p };
+        return ret;
+    }
+
+    bool inRange_$( T * query, T * from, T * to) {
+        if (from == to) return false;
+        if (from < to) return from <= query && query < to;
+        return query < to || from <= query;
+    }
+
+    void insert_before( vector( T ) & col, ptrdiff_t idx, T val ) {
+        assert (col.exit_refcount_$ == 0);
+        if (col`length >= col`capacity) {
+            assert (col`length == col`capacity);
+            grow(col);
+        }
+        
+        T & insertTargetR = findElemMem_$( col, idx );
+        T * insertTarget = & insertTargetR; // doesn't work in one line; must be a bug
+
+        // bubble toward back
+        if ( col.elems_end_$ < insertTarget ) {
+            // two phases of bubbling, to wrap around
+            for (T * tgt = col.elems_end_$; tgt > col.buffer_first_$; tgt--) {
+                *tgt = *(tgt-1);
+            }
+            *col.buffer_first_$ = *(col.buffer_end_$ - 1);
+            for (T * tgt = col.buffer_end_$ - 1; tgt > insertTarget; tgt--) {
+                *tgt = *(tgt-1);
+            }
+        } else {
+            for (T * tgt = col.elems_end_$; tgt > insertTarget; tgt--) {
+                *tgt = *(tgt-1);
+            }
+        }
+
+        col.elems_end_$ += 1;
+        if (col.elems_end_$ == col.buffer_end_$) col.elems_end_$ = col.buffer_first_$;
+
+        *insertTarget = val;
+
+        while ( vector_permit(T) & liveIter = col.live_iters_$`elems; liveIter`moveNext ) {
+            if ( inRange_$(liveIter.item_$, insertTarget, col.elems_end_$) ) {
+                liveIter.item_$ += 1;
+                if (liveIter.item_$ >= col.buffer_end_$) liveIter.item_$ = col.buffer_first_$;
+            }
+        }
+    }
+
+    size_t ?`capacity( vector(T) & v ) {
+        return v.buffer_end_$ - v.buffer_first_$;
+    }
+
+    size_t ?`length( vector(T) & v ) {
+        if (v.elems_first_$ == 0p) return 0;
+        if (v.elems_first_$ < v.elems_end_$ ) return v.elems_end_$ - v.elems_first_$;
+        return v.buffer_end_$ - v.elems_first_$ + v.elems_end_$ - v.buffer_first_$;
+    }
+
+
+} // forall T
+
+forall( T ) {
+    void ?{}( vector_exit(T) &, vector(T) *, T * ) = void;
+    void ?{}( vector_transit(T) & this, vector( T ) * col, ptrdiff_t idx ) = void;
+}
Index: libcfa/src/device/cpu.cfa
===================================================================
--- libcfa/src/device/cpu.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
+++ libcfa/src/device/cpu.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -0,0 +1,423 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2021 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// cpu.cfa -- read the data structure
+//
+// Author           : Thierry Delisle
+// Created On       : Thu Jun 10 16:13:07 2021
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#include "device/cpu.hfa"
+
+#include <math.hfa>
+#include <stdlib.hfa>
+
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+extern "C" {
+	#include <dirent.h>
+	#include <sys/types.h>
+	#include <sys/stat.h>
+	#include <fcntl.h>
+}
+
+// search a string for character 'character', looking at most at len
+// chars
+static const char * strnchr(const char * str, int character, size_t len) {
+	return (const char *)memchr(str, character, strnlen(str, len));
+}
+
+// Check if have string matches the want string
+// ignoring any characters that are longer than the want string
+static bool strmatch(const char * want, char * have) {
+	size_t w = strlen(want);
+	return strncmp(want, have, w) == 0;
+}
+
+typedef const char * idx_range_t;
+
+// read the value of a string and evaluate it
+// get the end pointer and make sure it is all evaluated
+static unsigned read_value(idx_range_t map, size_t len, const char ** end) {
+	unsigned long val = strtoul(map, (char**)end, 10);
+	/* paranoid */ __attribute__((unused)) size_t read = (*end - map);
+	/* paranoid */ verifyf(read <= len, "String '%s' passed with inconsistent length %zu", map, len);
+	/* paranoid */ verifyf(read == len, "String %.*s not entirely a number, %zu chars left", (int)len, map, len - read);
+	return val;
+}
+
+// Evaluate the width of a comma-separated list of idx
+// for example 'A-B,C-D,E,F' has a width of '(B-A) + (D-C) + 1 + 1'
+// Also has an (non-optional) end ptr like strtoul and friends
+//
+// FIXME : the current implementation only supports 1 comma
+static unsigned read_width(idx_range_t map, size_t len, const char ** end) {
+	// Do we have a comma
+	const char * comma = strnchr(map, ',', len);
+	if(comma != 0p) {
+		// We do! recurse and sum the widths
+		const char * _;
+		size_t split = comma - map;
+		unsigned lhs = read_width(map, split, &_);
+		unsigned rhs = read_width(comma + 1, len - split - 1, end);
+		return lhs + rhs;
+	}
+
+	// No commas, check for a range
+	const char * dash = strnchr(map, '-', len);
+	if(dash != 0p) {
+		const char * _;
+		size_t split = dash - map;
+		unsigned lhs = read_value(map, split, &_);
+		unsigned rhs = read_value(dash + 1, len - split - 1, end);
+		return rhs - lhs + 1;
+	}
+
+	// No range, no comma, just a single value
+	// Its width is 1 and we can consume everything
+	/* paranoid */ verifyf( ({strtoul(map, (char**)end, 10); *end == (map + len); }), "Value in range '%.*s' not a number", (int)len, map);
+	*end = map + len;
+	return 1;
+}
+
+// go through a directory calling fn on each file
+static int iterate_dir( const char * path, void (*fn)(struct dirent * ent) ) {
+	// open the directory
+	DIR *dir = opendir(path);
+	if(dir == 0p) { return ENOTDIR; }
+
+	// call fn for each
+	struct dirent * ent;
+	while ((ent = readdir(dir)) != 0p) {
+		fn( ent );
+	}
+
+	// no longer need this
+	closedir(dir);
+	return 0;
+}
+
+// count the number of directories with the specified prefix
+// the directories counted have the form '[prefix]N' where prefix is the parameter
+// and N is an base 10 integer.
+static int count_prefix_dirs(const char * path, const char * prefix) {
+	// read the directory and find the cpu count
+	// and make sure everything is as expected
+	int max = -1;
+	int count = 0;
+	void lambda(struct dirent * ent) {
+		// we are looking for prefixX, where X is a number
+		// check that it starts with the given prefix
+		char * s = strstr(ent->d_name, prefix);
+		if(s == 0p) { return; }
+		if(s != ent->d_name) { return; }
+
+		// check that the next part is a number
+		s += strlen(prefix);
+		char * end;
+		long int val = strtol(s, &end, 10);
+		if(*end != '\0' || val < 0) { return; }
+
+		// check that it's a directory
+		if(ent->d_type != DT_DIR) { return; }
+
+		// it's a match!
+		max = max(val, max);
+		count++;
+	}
+	iterate_dir(path, lambda);
+
+	/* paranoid */ verifyf(count == max + 1, "Inconsistent %s count, counted %d, but max %s was %d", prefix, count, prefix, (int)max);
+
+	return count;
+}
+
+// Count number of cpus in the system
+static int count_cpus(void) {
+	const char * fpath = "/sys/devices/system/cpu/present";
+	int fd = open(fpath, 0, O_RDONLY);
+	/* paranoid */ verifyf(fd >= 0, "Could not open file %s", fpath);
+
+	char buff[128];
+	ssize_t r = read(fd, buff, 128);
+	/* paranoid */ verifyf(r > 0, "Could not read file %s", fpath);
+	/* paranoid */ verify( buff[r-1] == '\n' );
+	buff[r-1] = '\0';
+
+	/* paranoid */ __attribute__((unused)) int ret =
+	close(fd);
+	/* paranoid */ verifyf(ret == 0, "Could not close file %s", fpath);
+
+	const char * _;
+	int cnt = read_width(buff, r - 1, &_);
+	/* paranoid */ verify(cnt == count_prefix_dirs("/sys/devices/system/cpu", "cpu"));
+	return cnt;
+}
+
+// Count number of cache *indexes* in the system
+// cache indexes are distinct from cache level as Data or Instruction cache
+// can share a level but not an index
+// PITFALL: assumes all cpus have the same indexes as cpu0
+static int count_cache_indexes(void) {
+	return count_prefix_dirs("/sys/devices/system/cpu/cpu0/cache", "index");
+}
+
+
+// read information about a specific cache index/cpu file into the output buffer
+static size_t read_cpuidxinfo_into(unsigned cpu, unsigned idx, const char * file, char * out, size_t out_len) {
+	// Pick the file we want and read it
+	char buf[128];
+	/* paranoid */ __attribute__((unused)) int len =
+	snprintf(buf, 128, "/sys/devices/system/cpu/cpu%u/cache/index%u/%s", cpu, idx, file);
+	/* paranoid */ verifyf(len > 0, "Could not generate '%s' filename for cpu %u, index %u", file, cpu, idx);
+
+	int fd = open(buf, 0, O_RDONLY);
+	/* paranoid */ verifyf(fd > 0, "Could not open file '%s'", buf);
+
+	ssize_t r = read(fd, out, out_len);
+	/* paranoid */ verifyf(r > 0, "Could not read file '%s'", buf);
+
+	/* paranoid */ __attribute__((unused)) int ret =
+	close(fd);
+	/* paranoid */ verifyf(ret == 0, "Could not close file '%s'", buf);
+	return r;
+}
+
+// Iterate over the cache indexes of a given cpu
+typedef void (*handle_func_t)(unsigned idx, unsigned char level, idx_range_t range, size_t len);
+static void foreach_cacheidx(unsigned cpu, unsigned idxs, handle_func_t handle) {
+	for(i; idxs) {
+		unsigned idx = idxs - 1 - i;
+		char buf[32];
+
+		// Type says what kind of cache this is,
+		// Options are: Unified, Data, Instruction
+		read_cpuidxinfo_into(cpu, idx, "type", buf, 32);
+		if((!strmatch("Unified", buf)) && (!strmatch("Data", buf))) {
+			// We don't care about instruction caches
+			continue;
+		}
+
+		// Level is the cache level: higher means bigger and slower
+		read_cpuidxinfo_into(cpu, idx, "level", buf, 32);
+		char * end;
+		unsigned long level = strtoul(buf, &end, 10);
+		/* paranoid */ verifyf(level <= 250, "Cpu %u has more than 250 levels of cache, this is not supported", cpu);
+
+		// shared_cpu_list is a range of cpus that share this particular cache
+		size_t n = read_cpuidxinfo_into(cpu, idx, "shared_cpu_list", buf, 32);
+		/* paranoid */ verify( buf[n-1] == '\n' );
+		buf[n-1] = '\0';
+
+		// Simply call the functor
+		handle(idx, level, buf, n - 1);
+	}
+}
+
+
+struct raw_cache_instance {
+	idx_range_t range;
+	unsigned width;
+	unsigned char level;
+	// FIXME add at least size and type
+};
+
+static void  ?{}(raw_cache_instance & this) { this.range = 0p;}
+static void ^?{}(raw_cache_instance & this) { free(this.range);}
+
+raw_cache_instance ** build_raw_cache_table(unsigned cpus, unsigned idxs, unsigned cache_levels)
+{
+	raw_cache_instance ** raw = alloc(cpus);
+	for(i; cpus) {
+		raw[i] = alloc(cache_levels);
+		void addcache(unsigned fidx, unsigned char level, idx_range_t range, size_t len) {
+			/* paranoid */ verifyf(level <= cache_levels, "Unexpected cache level %d on cpu %u index %u", (int)level, i, fidx);
+
+			unsigned idx = cache_levels - level;
+			raw_cache_instance & r = raw[i][idx];
+			r.range = strndup(range, len);
+			r.level = level;
+			const char * end;
+			r.width = read_width(range, len, &end);
+		}
+		foreach_cacheidx(i, idxs, addcache);
+	}
+
+	return raw;
+}
+
+struct llc_map_t {
+	raw_cache_instance * raw;
+	unsigned count;
+	unsigned start;
+};
+
+// returns an allocate list of all the different distinct last level caches
+static [*llc_map_t, size_t cnt] distinct_llcs(unsigned cpus, unsigned llc_idx, raw_cache_instance ** raw) {
+	// Allocate at least one element
+	llc_map_t* ranges = alloc();
+	size_t range_cnt = 1;
+
+	// Initialize with element 0
+	ranges->raw = &raw[0][llc_idx];
+	ranges->count = 0;
+	ranges->start = -1u;
+
+	// Go over all other cpus
+	CPU_LOOP: for(i; 1~cpus) {
+		// Check if the range is already there
+		raw_cache_instance * candidate = &raw[i][llc_idx];
+		for(j; range_cnt) {
+			llc_map_t & exist = ranges[j];
+			// If the range is already there just jump to the next cpu
+			if(0 == strcmp(candidate->range, exist.raw->range)) continue CPU_LOOP;
+		}
+
+		// The range wasn't there, add it to the list
+		ranges = alloc(range_cnt + 1, ranges`realloc);
+		ranges[range_cnt].raw = candidate;
+		ranges[range_cnt].count = 0;
+		ranges[range_cnt].start = -1u;
+		range_cnt++;
+	}
+
+	// return what we have
+	return [ranges, range_cnt];
+}
+
+struct cpu_pairing_t {
+	unsigned cpu;
+	unsigned id;
+};
+
+int ?<?( cpu_pairing_t lhs, cpu_pairing_t rhs ) {
+	return lhs.id < rhs.id;
+}
+
+static [[]cpu_pairing_t] get_cpu_pairings(unsigned cpus, raw_cache_instance ** raw, llc_map_t * maps, size_t map_cnt) {
+	cpu_pairing_t * pairings = alloc(cpus);
+
+	CPU_LOOP: for(i; cpus) {
+		pairings[i].cpu = i;
+		idx_range_t want = raw[i][0].range;
+		MAP_LOOP: for(j; map_cnt) {
+			if(0 != strcmp(want, maps[j].raw->range)) continue MAP_LOOP;
+
+			pairings[i].id = j;
+			continue CPU_LOOP;
+		}
+
+		/* paranoid */ verifyf( false, "Cpu %u map doesn't match", i );
+	}
+
+	return pairings;
+}
+
+#include <fstream.hfa>
+
+extern "C" {
+	void __cfaabi_device_startup( void ) {
+		int cpus = count_cpus();
+		int idxs = count_cache_indexes();
+
+		// Count actual cache levels
+		unsigned cache_levels = 0;
+		unsigned llc = 0;
+		{
+			unsigned char prev = -1u;
+			void first(unsigned idx, unsigned char level, const char * map, size_t len) {
+				/* paranoid */ verifyf(level < prev, "Index %u of cpu 0 has cache levels out of order: %u then %u", idx, (unsigned)prev, (unsigned)level);
+				llc = max(llc, level);
+				prev = level;
+				cache_levels++;
+			}
+			foreach_cacheidx(0, idxs, first);
+		}
+
+		// Read in raw data
+		raw_cache_instance ** raw = build_raw_cache_table(cpus, idxs, cache_levels);
+
+		// Find number of distinct cache instances
+		llc_map_t * maps;
+		size_t map_cnt;
+		[maps, map_cnt] =  distinct_llcs(cpus, cache_levels - llc, raw);
+
+		#if defined(__CFA_WITH_VERIFY__)
+		// Verify that the caches cover the all the cpus
+		{
+			unsigned width1 = 0;
+			unsigned width2 = 0;
+			for(i; map_cnt) {
+				const char * _;
+				width1 += read_width(maps[i].raw->range, strlen(maps[i].raw->range), &_);
+				width2 += maps[i].raw->width;
+			}
+			verify(width1 == cpus);
+			verify(width2 == cpus);
+		}
+		#endif
+
+		// Get mappings from cpu to cache instance
+		cpu_pairing_t * pairings = get_cpu_pairings(cpus, raw, maps, map_cnt);
+
+		// Sort by cache instance
+		qsort(pairings, cpus);
+
+		{
+			unsigned it = 0;
+			for(i; cpus) {
+				unsigned llc_id = pairings[i].id;
+				if(maps[llc_id].start == -1u) {
+					maps[llc_id].start = it;
+					it += maps[llc_id].raw->width;
+					/* paranoid */ verify(maps[llc_id].start < it);
+					/* paranoid */ verify(it != -1u);
+				}
+			}
+			/* paranoid */ verify(it == cpus);
+		}
+
+		// From the mappings build the actual cpu map we want
+		struct cpu_map_entry_t * entries = alloc(cpus);
+		for(i; cpus) { entries[i].count = 0; }
+		for(i; cpus) {
+			/* paranoid */ verify(pairings[i].id < map_cnt);
+			unsigned c = pairings[i].cpu;
+			unsigned llc_id = pairings[i].id;
+			unsigned width = maps[llc_id].raw->width;
+			unsigned start = maps[llc_id].start;
+			unsigned self  = start + (maps[llc_id].count++);
+			entries[c].count = width;
+			entries[c].start = start;
+			entries[c].self  = self;
+		}
+
+		// get rid of the temporary data
+		free(maps);
+		free(pairings);
+
+		for(i; cpus) {
+			for(j; cache_levels) {
+				^(raw[i][j]){};
+			}
+			free(raw[i]);
+		}
+		free(raw);
+
+		cpu_info.llc_map = entries;
+		cpu_info.hthrd_count = cpus;
+	}
+
+	void __cfaabi_device_shutdown( void ) {
+		free(cpu_info.llc_map);
+	}
+}
Index: libcfa/src/device/cpu.hfa
===================================================================
--- libcfa/src/device/cpu.hfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
+++ libcfa/src/device/cpu.hfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -0,0 +1,32 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2021 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// cpu.hfa -- read the data structure
+//
+// Author           : Thierry Delisle
+// Created On       : Fri Jun 11 15:22:23 2021
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#include <stddef.h>
+
+struct cpu_map_entry_t {
+	unsigned self;
+	unsigned start;
+	unsigned count;
+};
+
+struct cpu_info_t {
+	 // array of size [hthrd_count]
+	const cpu_map_entry_t * llc_map;
+
+	 // Number of _hardware_ threads present in the system
+	size_t hthrd_count;
+};
+
+cpu_info_t cpu_info;
Index: libcfa/src/exception.c
===================================================================
--- libcfa/src/exception.c	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/exception.c	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -27,15 +27,4 @@
 #include "stdhdr/assert.h"
 #include "virtual.h"
-
-#if defined( __ARM_ARCH )
-#warning FIX ME: temporary hack to keep ARM build working
-#ifndef _URC_FATAL_PHASE1_ERROR
-#define _URC_FATAL_PHASE1_ERROR 3
-#endif // ! _URC_FATAL_PHASE1_ERROR
-#ifndef _URC_FATAL_PHASE2_ERROR
-#define _URC_FATAL_PHASE2_ERROR 2
-#endif // ! _URC_FATAL_PHASE2_ERROR
-#endif // __ARM_ARCH
-
 #include "lsda.h"
 
@@ -267,6 +256,12 @@
 	// the whole stack.
 
+#if defined( __x86_64 ) || defined( __i386 )
 	// We did not simply reach the end of the stack without finding a handler. This is an error.
 	if ( ret != _URC_END_OF_STACK ) {
+#else // defined( __ARM_ARCH )
+	// The return code from _Unwind_RaiseException seems to be corrupt on ARM at end of stack.
+	// This workaround tries to keep default exception handling working.
+	if ( ret == _URC_FATAL_PHASE1_ERROR || ret == _URC_FATAL_PHASE2_ERROR ) {
+#endif
 		printf("UNWIND ERROR %d after raise exception\n", ret);
 		abort();
@@ -301,5 +296,5 @@
 }
 
-#if defined( __x86_64 ) || defined( __i386 )
+#if defined( __x86_64 ) || defined( __i386 ) || defined( __ARM_ARCH )
 // This is our personality routine. For every stack frame annotated with
 // ".cfi_personality 0x3,__gcfa_personality_v0" this function will be called twice when unwinding.
@@ -419,6 +414,5 @@
 				    _Unwind_GetCFA(unwind_context) + 24;
 #				elif defined( __ARM_ARCH )
-#				    warning FIX ME: check if anything needed for ARM
-				    42;
+				    _Unwind_GetCFA(unwind_context) + 40;
 #				endif
 				int (*matcher)(exception_t *) = *(int(**)(exception_t *))match_pos;
@@ -537,5 +531,9 @@
 	// HEADER
 	".LFECFA1:\n"
+#if defined( __x86_64 ) || defined( __i386 )
 	"	.globl	__gcfa_personality_v0\n"
+#else // defined( __ARM_ARCH )
+	"	.global	__gcfa_personality_v0\n"
+#endif
 	"	.section	.gcc_except_table,\"a\",@progbits\n"
 	// TABLE HEADER (important field is the BODY length at the end)
@@ -569,5 +567,9 @@
 	// No clue what this does specifically
 	"	.section	.data.rel.local.CFA.ref.__gcfa_personality_v0,\"awG\",@progbits,CFA.ref.__gcfa_personality_v0,comdat\n"
+#if defined( __x86_64 ) || defined( __i386 )
 	"	.align 8\n"
+#else // defined( __ARM_ARCH )
+	"	.align 3\n"
+#endif
 	"	.type CFA.ref.__gcfa_personality_v0, @object\n"
 	"	.size CFA.ref.__gcfa_personality_v0, 8\n"
@@ -575,6 +577,8 @@
 #if defined( __x86_64 )
 	"	.quad __gcfa_personality_v0\n"
-#else // then __i386
+#elif defined( __i386 )
 	"	.long __gcfa_personality_v0\n"
+#else // defined( __ARM_ARCH )
+	"	.xword __gcfa_personality_v0\n"
 #endif
 );
@@ -583,5 +587,9 @@
 	// HEADER
 	".LFECFA1:\n"
+#if defined( __x86_64 ) || defined( __i386 )
 	"	.globl	__gcfa_personality_v0\n"
+#else // defined( __ARM_ARCH )
+	"	.global	__gcfa_personality_v0\n"
+#endif
 	"	.section	.gcc_except_table,\"a\",@progbits\n"
 	// TABLE HEADER (important field is the BODY length at the end)
@@ -612,20 +620,5 @@
 #pragma GCC pop_options
 
-#elif defined( __ARM_ARCH )
-_Unwind_Reason_Code __gcfa_personality_v0(
-		int version,
-		_Unwind_Action actions,
-		unsigned long long exception_class,
-		struct _Unwind_Exception * unwind_exception,
-		struct _Unwind_Context * unwind_context) {
-	return _URC_CONTINUE_UNWIND;
-}
-
-__attribute__((noinline))
-void __cfaehm_try_terminate(void (*try_block)(),
-		void (*catch_block)(int index, exception_t * except),
-		__attribute__((unused)) int (*match_block)(exception_t * except)) {
-}
 #else
 	#error unsupported hardware architecture
-#endif // __x86_64 || __i386
+#endif // __x86_64 || __i386 || __ARM_ARCH
Index: libcfa/src/interpose.cfa
===================================================================
--- libcfa/src/interpose.cfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/interpose.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -95,5 +95,4 @@
 
 extern "C" {
-	void __cfaabi_interpose_startup(void)  __attribute__(( constructor( STARTUP_PRIORITY_CORE ) ));
 	void __cfaabi_interpose_startup( void ) {
 		const char *version = 0p;
Index: libcfa/src/startup.cfa
===================================================================
--- libcfa/src/startup.cfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/startup.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -20,6 +20,6 @@
 
 extern "C" {
-    void __cfaabi_appready_startup( void ) __attribute__(( constructor( STARTUP_PRIORITY_APPREADY ) ));
-    void __cfaabi_appready_startup( void ) {
+	void __cfaabi_appready_startup( void ) __attribute__(( constructor( STARTUP_PRIORITY_APPREADY ) ));
+	void __cfaabi_appready_startup( void ) {
 		tzset();										// initialize time global variables
 		setlocale( LC_NUMERIC, getenv("LANG") );
@@ -28,16 +28,32 @@
 		heapAppStart();
 		#endif // __CFA_DEBUG__
-    } // __cfaabi_appready_startup
+	} // __cfaabi_appready_startup
 
-    void __cfaabi_appready_shutdown( void ) __attribute__(( destructor( STARTUP_PRIORITY_APPREADY ) ));
-    void __cfaabi_appready_shutdown( void ) {
+	void __cfaabi_appready_shutdown( void ) __attribute__(( destructor( STARTUP_PRIORITY_APPREADY ) ));
+	void __cfaabi_appready_shutdown( void ) {
 		#ifdef __CFA_DEBUG__
 		extern void heapAppStop();
 		heapAppStop();
 		#endif // __CFA_DEBUG__
-    } // __cfaabi_appready_shutdown
+	} // __cfaabi_appready_shutdown
 
-    void disable_interrupts() __attribute__(( weak )) {}
-    void enable_interrupts() __attribute__(( weak )) {}
+	void disable_interrupts() __attribute__(( weak )) {}
+	void enable_interrupts() __attribute__(( weak )) {}
+
+
+	extern void __cfaabi_interpose_startup( void );
+	extern void __cfaabi_device_startup   ( void );
+	extern void __cfaabi_device_shutdown  ( void );
+
+	void __cfaabi_core_startup( void ) __attribute__(( constructor( STARTUP_PRIORITY_CORE ) ));
+	void __cfaabi_core_startup( void ) {
+		__cfaabi_interpose_startup();
+		__cfaabi_device_startup();
+	} // __cfaabi_core_startup
+
+	void __cfaabi_core_shutdown( void ) __attribute__(( destructor( STARTUP_PRIORITY_CORE ) ));
+	void __cfaabi_core_shutdown( void ) {
+		__cfaabi_device_shutdown();
+	} // __cfaabi_core_shutdown
 } // extern "C"
 
Index: libcfa/src/stdhdr/pthread.h
===================================================================
--- libcfa/src/stdhdr/pthread.h	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
+++ libcfa/src/stdhdr/pthread.h	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -0,0 +1,24 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+// 
+// pthread.h -- 
+// 
+// Author           : Peter A. Buhr
+// Created On       : Wed Jun 16 13:39:06 2021
+// Last Modified By : Peter A. Buhr
+// Last Modified On : Wed Jun 16 13:39:42 2021
+// Update Count     : 1
+// 
+
+extern "C" {
+#include_next <pthread.h>								// has internal check for multiple expansion
+} // extern "C"
+
+// Local Variables: //
+// tab-width: 4 //
+// mode: c++ //
+// compile-command: "make install" //
+// End: //
