Index: libcfa/src/concurrency/coroutine.cfa
===================================================================
--- libcfa/src/concurrency/coroutine.cfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/coroutine.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -15,4 +15,5 @@
 
 #define __cforall_thread__
+#define _GNU_SOURCE
 
 #include "coroutine.hfa"
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/io.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -15,4 +15,5 @@
 
 #define __cforall_thread__
+#define _GNU_SOURCE
 
 #if defined(__CFA_DEBUG__)
@@ -23,5 +24,4 @@
 
 #if defined(CFA_HAVE_LINUX_IO_URING_H)
-	#define _GNU_SOURCE         /* See feature_test_macros(7) */
 	#include <errno.h>
 	#include <signal.h>
Index: libcfa/src/concurrency/io/setup.cfa
===================================================================
--- libcfa/src/concurrency/io/setup.cfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/io/setup.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -15,5 +15,5 @@
 
 #define __cforall_thread__
-#define _GNU_SOURCE         /* See feature_test_macros(7) */
+#define _GNU_SOURCE
 
 #if defined(__CFA_DEBUG__)
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/kernel.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -15,4 +15,6 @@
 
 #define __cforall_thread__
+#define _GNU_SOURCE
+
 // #define __CFA_DEBUG_PRINT_RUNTIME_CORE__
 
@@ -278,5 +280,5 @@
 
 				// Spin a little on I/O, just in case
-					for(5) {
+				for(5) {
 					__maybe_io_drain( this );
 					readyThread = pop_fast( this->cltr );
@@ -285,5 +287,5 @@
 
 				// no luck, try stealing a few times
-					for(5) {
+				for(5) {
 					if( __maybe_io_drain( this ) ) {
 						readyThread = pop_fast( this->cltr );
@@ -422,4 +424,6 @@
 		__cfactx_switch( &proc_cor->context, &thrd_dst->context );
 		// when __cfactx_switch returns we are back in the processor coroutine
+
+
 
 		/* paranoid */ verify( 0x0D15EA5E0D15EA5Ep == thrd_dst->canary );
@@ -522,6 +526,6 @@
 
 	/* paranoid */ verify( ! __preemption_enabled() );
-	/* paranoid */ verifyf( ((uintptr_t)thrd_src->context.SP) < ((uintptr_t)__get_stack(thrd_src->curr_cor)->base ), "ERROR : Returning $thread %p has been corrupted.\n StackPointer too small.\n", thrd_src );
-	/* paranoid */ verifyf( ((uintptr_t)thrd_src->context.SP) > ((uintptr_t)__get_stack(thrd_src->curr_cor)->limit), "ERROR : Returning $thread %p has been corrupted.\n StackPointer too large.\n", thrd_src );
+	/* paranoid */ verifyf( ((uintptr_t)thrd_src->context.SP) < ((uintptr_t)__get_stack(thrd_src->curr_cor)->base ) || thrd_src->corctx_flag, "ERROR : Returning $thread %p has been corrupted.\n StackPointer too small.\n", thrd_src );
+	/* paranoid */ verifyf( ((uintptr_t)thrd_src->context.SP) > ((uintptr_t)__get_stack(thrd_src->curr_cor)->limit) || thrd_src->corctx_flag, "ERROR : Returning $thread %p has been corrupted.\n StackPointer too large.\n", thrd_src );
 }
 
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/kernel.hfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -66,4 +66,5 @@
 		unsigned id;
 		unsigned target;
+		unsigned last;
 		unsigned long long int cutoff;
 	} rdq;
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -15,11 +15,15 @@
 
 #define __cforall_thread__
+#define _GNU_SOURCE
 
 // C Includes
 #include <errno.h>              // errno
+#include <signal.h>
 #include <string.h>             // strerror
 #include <unistd.h>             // sysconf
+
 extern "C" {
       #include <limits.h>       // PTHREAD_STACK_MIN
+	#include <unistd.h>       // syscall
 	#include <sys/eventfd.h>  // eventfd
       #include <sys/mman.h>     // mprotect
@@ -136,4 +140,16 @@
 };
 
+#if   defined(CFA_HAVE_LINUX_LIBRSEQ)
+	// No data needed
+#elif defined(CFA_HAVE_LINUX_RSEQ_H)
+	extern "Cforall" {
+		__attribute__((aligned(128))) thread_local volatile struct rseq __cfaabi_rseq @= {
+			.cpu_id : RSEQ_CPU_ID_UNINITIALIZED,
+		};
+	}
+#else
+	// No data needed
+#endif
+
 //-----------------------------------------------------------------------------
 // Struct to steal stack
@@ -468,5 +484,5 @@
 	self_mon_p = &self_mon;
 	link.next = 0p;
-	link.ts   = 0;
+	link.ts   = -1llu;
 	preferred = -1u;
 	last_proc = 0p;
@@ -497,4 +513,5 @@
 	this.rdq.id  = -1u;
 	this.rdq.target = -1u;
+	this.rdq.last = -1u;
 	this.rdq.cutoff = 0ull;
 	do_terminate = false;
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -16,4 +16,8 @@
 #pragma once
 
+#if !defined(__cforall_thread__)
+	#error kernel_private.hfa should only be included in libcfathread source
+#endif
+
 #include "kernel.hfa"
 #include "thread.hfa"
@@ -22,8 +26,19 @@
 #include "stats.hfa"
 
+extern "C" {
+#if   defined(CFA_HAVE_LINUX_LIBRSEQ)
+	#include <rseq/rseq.h>
+#elif defined(CFA_HAVE_LINUX_RSEQ_H)
+	#include <linux/rseq.h>
+#else
+	#ifndef _GNU_SOURCE
+	#error kernel_private requires gnu_source
+	#endif
+	#include <sched.h>
+#endif
+}
+
 //-----------------------------------------------------------------------------
 // Scheduler
-
-
 extern "C" {
 	void disable_interrupts() OPTIONAL_THREAD;
@@ -39,4 +54,30 @@
 
 //-----------------------------------------------------------------------------
+// Hardware
+
+#if   defined(CFA_HAVE_LINUX_LIBRSEQ)
+	// No data needed
+#elif defined(CFA_HAVE_LINUX_RSEQ_H)
+	extern "Cforall" {
+		extern __attribute__((aligned(128))) thread_local volatile struct rseq __cfaabi_rseq;
+	}
+#else
+	// No data needed
+#endif
+
+static inline int __kernel_getcpu() {
+	/* paranoid */ verify( ! __preemption_enabled() );
+#if   defined(CFA_HAVE_LINUX_LIBRSEQ)
+	return rseq_current_cpu();
+#elif defined(CFA_HAVE_LINUX_RSEQ_H)
+	int r = __cfaabi_rseq.cpu_id;
+	/* paranoid */ verify( r >= 0 );
+	return r;
+#else
+	return sched_getcpu();
+#endif
+}
+
+//-----------------------------------------------------------------------------
 // Processor
 void main(processorCtx_t *);
@@ -44,6 +85,4 @@
 void * __create_pthread( pthread_t *, void * (*)(void *), void * );
 void __destroy_pthread( pthread_t pthread, void * stack, void ** retval );
-
-
 
 extern cluster * mainCluster;
Index: libcfa/src/concurrency/locks.cfa
===================================================================
--- libcfa/src/concurrency/locks.cfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/locks.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -16,4 +16,5 @@
 
 #define __cforall_thread__
+#define _GNU_SOURCE
 
 #include "locks.hfa"
Index: libcfa/src/concurrency/locks.hfa
===================================================================
--- libcfa/src/concurrency/locks.hfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/locks.hfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -24,4 +24,5 @@
 #include "containers/list.hfa"
 
+#include "limits.hfa"
 #include "thread.hfa"
 
@@ -87,4 +88,5 @@
 	bool tryP(BinaryBenaphore & this) {
 		ssize_t c = this.counter;
+		/* paranoid */ verify( c > MIN );
 		return (c >= 1) && __atomic_compare_exchange_n(&this.counter, &c, c-1, false, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
 	}
@@ -94,4 +96,5 @@
 		ssize_t c = 0;
 		for () {
+			/* paranoid */ verify( this.counter < MAX );
 			if (__atomic_compare_exchange_n(&this.counter, &c, c+1, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
 				if (c == 0) return true;
@@ -173,4 +176,6 @@
 	ThreadBenaphore sem;
 };
+
+static inline void ?{}(fast_lock & this) { this.owner = 0p; }
 
 static inline bool $try_lock(fast_lock & this, $thread * thrd) {
Index: libcfa/src/concurrency/monitor.cfa
===================================================================
--- libcfa/src/concurrency/monitor.cfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/monitor.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -15,4 +15,5 @@
 
 #define __cforall_thread__
+#define _GNU_SOURCE
 
 #include "monitor.hfa"
Index: libcfa/src/concurrency/mutex.cfa
===================================================================
--- libcfa/src/concurrency/mutex.cfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/mutex.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -17,4 +17,5 @@
 
 #define __cforall_thread__
+#define _GNU_SOURCE
 
 #include "mutex.hfa"
Index: libcfa/src/concurrency/preemption.cfa
===================================================================
--- libcfa/src/concurrency/preemption.cfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/preemption.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -15,4 +15,6 @@
 
 #define __cforall_thread__
+#define _GNU_SOURCE
+
 // #define __CFA_DEBUG_PRINT_PREEMPTION__
 
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -15,4 +15,6 @@
 
 #define __cforall_thread__
+#define _GNU_SOURCE
+
 // #define __CFA_DEBUG_PRINT_READY_QUEUE__
 
@@ -20,13 +22,19 @@
 #define USE_RELAXED_FIFO
 // #define USE_WORK_STEALING
+// #define USE_CPU_WORK_STEALING
 
 #include "bits/defs.hfa"
+#include "device/cpu.hfa"
 #include "kernel_private.hfa"
 
-#define _GNU_SOURCE
 #include "stdlib.hfa"
 #include "math.hfa"
 
+#include <errno.h>
 #include <unistd.h>
+
+extern "C" {
+	#include <sys/syscall.h>  // __NR_xxx
+}
 
 #include "ready_subqueue.hfa"
@@ -46,5 +54,7 @@
 #endif
 
-#if   defined(USE_RELAXED_FIFO)
+#if   defined(USE_CPU_WORK_STEALING)
+	#define READYQ_SHARD_FACTOR 2
+#elif defined(USE_RELAXED_FIFO)
 	#define BIAS 4
 	#define READYQ_SHARD_FACTOR 4
@@ -85,4 +95,23 @@
 }
 
+#if   defined(CFA_HAVE_LINUX_LIBRSEQ)
+	// No forward declaration needed
+	#define __kernel_rseq_register rseq_register_current_thread
+	#define __kernel_rseq_unregister rseq_unregister_current_thread
+#elif defined(CFA_HAVE_LINUX_RSEQ_H)
+	void __kernel_raw_rseq_register  (void);
+	void __kernel_raw_rseq_unregister(void);
+
+	#define __kernel_rseq_register __kernel_raw_rseq_register
+	#define __kernel_rseq_unregister __kernel_raw_rseq_unregister
+#else
+	// No forward declaration needed
+	// No initialization needed
+	static inline void noop(void) {}
+
+	#define __kernel_rseq_register noop
+	#define __kernel_rseq_unregister noop
+#endif
+
 //=======================================================================
 // Cluster wide reader-writer lock
@@ -107,4 +136,6 @@
 // Lock-Free registering/unregistering of threads
 unsigned register_proc_id( void ) with(*__scheduler_lock) {
+	__kernel_rseq_register();
+
 	__cfadbg_print_safe(ready_queue, "Kernel : Registering proc %p for RW-Lock\n", proc);
 	bool * handle = (bool *)&kernelTLS().sched_lock;
@@ -161,4 +192,6 @@
 
 	__cfadbg_print_safe(ready_queue, "Kernel : Unregister proc %p\n", proc);
+
+	__kernel_rseq_unregister();
 }
 
@@ -214,11 +247,25 @@
 //=======================================================================
 void ?{}(__ready_queue_t & this) with (this) {
-	lanes.data  = 0p;
-	lanes.tscs  = 0p;
-	lanes.count = 0;
+	#if defined(USE_CPU_WORK_STEALING)
+		lanes.count = cpu_info.hthrd_count * READYQ_SHARD_FACTOR;
+		lanes.data = alloc( lanes.count );
+		lanes.tscs = alloc( lanes.count );
+
+		for( idx; (size_t)lanes.count ) {
+			(lanes.data[idx]){};
+			lanes.tscs[idx].tv = rdtscl();
+		}
+	#else
+		lanes.data  = 0p;
+		lanes.tscs  = 0p;
+		lanes.count = 0;
+	#endif
 }
 
 void ^?{}(__ready_queue_t & this) with (this) {
-	verify( SEQUENTIAL_SHARD == lanes.count );
+	#if !defined(USE_CPU_WORK_STEALING)
+		verify( SEQUENTIAL_SHARD == lanes.count );
+	#endif
+
 	free(lanes.data);
 	free(lanes.tscs);
@@ -226,4 +273,143 @@
 
 //-----------------------------------------------------------------------
+#if defined(USE_CPU_WORK_STEALING)
+	__attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd, bool push_local) with (cltr->ready_queue) {
+		__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
+
+		processor * const proc = kernelTLS().this_processor;
+		const bool external = !push_local || (!proc) || (cltr != proc->cltr);
+
+		const int cpu = __kernel_getcpu();
+		/* paranoid */ verify(cpu >= 0);
+		/* paranoid */ verify(cpu < cpu_info.hthrd_count);
+		/* paranoid */ verify(cpu * READYQ_SHARD_FACTOR < lanes.count);
+
+		const cpu_map_entry_t & map = cpu_info.llc_map[cpu];
+		/* paranoid */ verify(map.start * READYQ_SHARD_FACTOR < lanes.count);
+		/* paranoid */ verify(map.self * READYQ_SHARD_FACTOR < lanes.count);
+		/* paranoid */ verifyf((map.start + map.count) * READYQ_SHARD_FACTOR <= lanes.count, "have %zu lanes but map can go up to %u", lanes.count, (map.start + map.count) * READYQ_SHARD_FACTOR);
+
+		const int start = map.self * READYQ_SHARD_FACTOR;
+		unsigned i;
+		do {
+			unsigned r;
+			if(unlikely(external)) { r = __tls_rand(); }
+			else { r = proc->rdq.its++; }
+			i = start + (r % READYQ_SHARD_FACTOR);
+			// If we can't lock it retry
+		} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+
+		// Actually push it
+		push(lanes.data[i], thrd);
+
+		// Unlock and return
+		__atomic_unlock( &lanes.data[i].lock );
+
+		#if !defined(__CFA_NO_STATISTICS__)
+			if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED);
+			else __tls_stats()->ready.push.local.success++;
+		#endif
+
+		__cfadbg_print_safe(ready_queue, "Kernel : Pushed %p on cluster %p (idx: %u, mask %llu, first %d)\n", thrd, cltr, i, used.mask[0], lane_first);
+
+	}
+
+	// Pop from the ready queue from a given cluster
+	__attribute__((hot)) $thread * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
+		/* paranoid */ verify( lanes.count > 0 );
+		/* paranoid */ verify( kernelTLS().this_processor );
+
+		const int cpu = __kernel_getcpu();
+		/* paranoid */ verify(cpu >= 0);
+		/* paranoid */ verify(cpu < cpu_info.hthrd_count);
+		/* paranoid */ verify(cpu * READYQ_SHARD_FACTOR < lanes.count);
+
+		const cpu_map_entry_t & map = cpu_info.llc_map[cpu];
+		/* paranoid */ verify(map.start * READYQ_SHARD_FACTOR < lanes.count);
+		/* paranoid */ verify(map.self * READYQ_SHARD_FACTOR < lanes.count);
+		/* paranoid */ verifyf((map.start + map.count) * READYQ_SHARD_FACTOR <= lanes.count, "have %zu lanes but map can go up to %u", lanes.count, (map.start + map.count) * READYQ_SHARD_FACTOR);
+
+		processor * const proc = kernelTLS().this_processor;
+		const int start = map.self * READYQ_SHARD_FACTOR;
+
+		// Did we already have a help target
+		if(proc->rdq.target == -1u) {
+			// If we don't have a help target yet, pick one
+			unsigned long long min = ts(lanes.data[start]);
+			for(i; READYQ_SHARD_FACTOR) {
+				unsigned long long tsc = ts(lanes.data[start + i]);
+				if(tsc < min) min = tsc;
+			}
+			proc->rdq.cutoff = min;
+
+			/* paranoid */ verify(lanes.count < 65536); // The following code assumes max 65536 cores.
+			/* paranoid */ verify(map.count < 65536); // The following code assumes max 65536 cores.
+			uint64_t chaos = __tls_rand();
+			uint64_t high_chaos = (chaos >> 32);
+			uint64_t  mid_chaos = (chaos >> 16) & 0xffff;
+			uint64_t  low_chaos = chaos & 0xffff;
+
+			unsigned me = map.self;
+			unsigned cpu_chaos = map.start + (mid_chaos % map.count);
+			bool global = cpu_chaos == me;
+
+			if(global) {
+				proc->rdq.target = high_chaos % lanes.count;
+			} else {
+				proc->rdq.target = (cpu_chaos * READYQ_SHARD_FACTOR) + (low_chaos % READYQ_SHARD_FACTOR);
+				/* paranoid */ verify(proc->rdq.target >= (map.start * READYQ_SHARD_FACTOR));
+				/* paranoid */ verify(proc->rdq.target <  ((map.start + map.count) * READYQ_SHARD_FACTOR));
+			}
+
+			/* paranoid */ verify(proc->rdq.target != -1u);
+		}
+		else {
+			const unsigned long long bias = 0; //2_500_000_000;
+			const unsigned long long cutoff = proc->rdq.cutoff > bias ? proc->rdq.cutoff - bias : proc->rdq.cutoff;
+			{
+				unsigned target = proc->rdq.target;
+				proc->rdq.target = -1u;
+				if(lanes.tscs[target].tv < cutoff && ts(lanes.data[target]) < cutoff) {
+					$thread * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help));
+					proc->rdq.last = target;
+					if(t) return t;
+				}
+			}
+
+			unsigned last = proc->rdq.last;
+			if(last != -1u && lanes.tscs[last].tv < cutoff && ts(lanes.data[last]) < cutoff) {
+				$thread * t = try_pop(cltr, last __STATS(, __tls_stats()->ready.pop.help));
+				if(t) return t;
+			}
+			else {
+				proc->rdq.last = -1u;
+			}
+		}
+
+		for(READYQ_SHARD_FACTOR) {
+			unsigned i = start + (proc->rdq.itr++ % READYQ_SHARD_FACTOR);
+			if($thread * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t;
+		}
+
+		// All lanes where empty return 0p
+		return 0p;
+	}
+
+	__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
+		processor * const proc = kernelTLS().this_processor;
+		unsigned last = proc->rdq.last;
+		if(last != -1u) {
+			struct $thread * t = try_pop(cltr, last __STATS(, __tls_stats()->ready.pop.steal));
+			if(t) return t;
+			proc->rdq.last = -1u;
+		}
+
+		unsigned i = __tls_rand() % lanes.count;
+		return try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.steal));
+	}
+	__attribute__((hot)) struct $thread * pop_search(struct cluster * cltr) {
+		return search(cltr);
+	}
+#endif
 #if defined(USE_RELAXED_FIFO)
 	//-----------------------------------------------------------------------
@@ -519,9 +705,9 @@
 					if(is_empty(sl)) {
 						assert( sl.anchor.next == 0p );
-						assert( sl.anchor.ts   == 0  );
+						assert( sl.anchor.ts   == -1llu );
 						assert( mock_head(sl)  == sl.prev );
 					} else {
 						assert( sl.anchor.next != 0p );
-						assert( sl.anchor.ts   != 0  );
+						assert( sl.anchor.ts   != -1llu );
 						assert( mock_head(sl)  != sl.prev );
 					}
@@ -573,134 +759,141 @@
 		lanes.tscs = alloc(lanes.count, lanes.tscs`realloc);
 		for(i; lanes.count) {
-			unsigned long long tsc = ts(lanes.data[i]);
-			lanes.tscs[i].tv = tsc != 0 ? tsc : rdtscl();
+			unsigned long long tsc1 = ts(lanes.data[i]);
+			unsigned long long tsc2 = rdtscl();
+			lanes.tscs[i].tv = min(tsc1, tsc2);
 		}
 	#endif
 }
 
-// Grow the ready queue
-void ready_queue_grow(struct cluster * cltr) {
-	size_t ncount;
-	int target = cltr->procs.total;
-
-	/* paranoid */ verify( ready_mutate_islocked() );
-	__cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue\n");
-
-	// Make sure that everything is consistent
-	/* paranoid */ check( cltr->ready_queue );
-
-	// grow the ready queue
-	with( cltr->ready_queue ) {
-		// Find new count
-		// Make sure we always have atleast 1 list
-		if(target >= 2) {
-			ncount = target * READYQ_SHARD_FACTOR;
-		} else {
-			ncount = SEQUENTIAL_SHARD;
-		}
-
-		// Allocate new array (uses realloc and memcpies the data)
-		lanes.data = alloc( ncount, lanes.data`realloc );
-
-		// Fix the moved data
-		for( idx; (size_t)lanes.count ) {
-			fix(lanes.data[idx]);
-		}
-
-		// Construct new data
-		for( idx; (size_t)lanes.count ~ ncount) {
-			(lanes.data[idx]){};
-		}
-
-		// Update original
-		lanes.count = ncount;
-	}
-
-	fix_times(cltr);
-
-	reassign_cltr_id(cltr);
-
-	// Make sure that everything is consistent
-	/* paranoid */ check( cltr->ready_queue );
-
-	__cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue done\n");
-
-	/* paranoid */ verify( ready_mutate_islocked() );
-}
-
-// Shrink the ready queue
-void ready_queue_shrink(struct cluster * cltr) {
-	/* paranoid */ verify( ready_mutate_islocked() );
-	__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue\n");
-
-	// Make sure that everything is consistent
-	/* paranoid */ check( cltr->ready_queue );
-
-	int target = cltr->procs.total;
-
-	with( cltr->ready_queue ) {
-		// Remember old count
-		size_t ocount = lanes.count;
-
-		// Find new count
-		// Make sure we always have atleast 1 list
-		lanes.count = target >= 2 ? target * READYQ_SHARD_FACTOR: SEQUENTIAL_SHARD;
-		/* paranoid */ verify( ocount >= lanes.count );
-		/* paranoid */ verify( lanes.count == target * READYQ_SHARD_FACTOR || target < 2 );
-
-		// for printing count the number of displaced threads
-		#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
-			__attribute__((unused)) size_t displaced = 0;
-		#endif
-
-		// redistribute old data
-		for( idx; (size_t)lanes.count ~ ocount) {
-			// Lock is not strictly needed but makes checking invariants much easier
-			__attribute__((unused)) bool locked = __atomic_try_acquire(&lanes.data[idx].lock);
-			verify(locked);
-
-			// As long as we can pop from this lane to push the threads somewhere else in the queue
-			while(!is_empty(lanes.data[idx])) {
-				struct $thread * thrd;
-				unsigned long long _;
-				[thrd, _] = pop(lanes.data[idx]);
-
-				push(cltr, thrd, true);
-
-				// for printing count the number of displaced threads
-				#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
-					displaced++;
-				#endif
-			}
-
-			// Unlock the lane
-			__atomic_unlock(&lanes.data[idx].lock);
-
-			// TODO print the queue statistics here
-
-			^(lanes.data[idx]){};
-		}
-
-		__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue displaced %zu threads\n", displaced);
-
-		// Allocate new array (uses realloc and memcpies the data)
-		lanes.data = alloc( lanes.count, lanes.data`realloc );
-
-		// Fix the moved data
-		for( idx; (size_t)lanes.count ) {
-			fix(lanes.data[idx]);
-		}
-	}
-
-	fix_times(cltr);
-
-	reassign_cltr_id(cltr);
-
-	// Make sure that everything is consistent
-	/* paranoid */ check( cltr->ready_queue );
-
-	__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue done\n");
-	/* paranoid */ verify( ready_mutate_islocked() );
-}
+#if defined(USE_CPU_WORK_STEALING)
+	// ready_queue size is fixed in this case
+	void ready_queue_grow(struct cluster * cltr) {}
+	void ready_queue_shrink(struct cluster * cltr) {}
+#else
+	// Grow the ready queue
+	void ready_queue_grow(struct cluster * cltr) {
+		size_t ncount;
+		int target = cltr->procs.total;
+
+		/* paranoid */ verify( ready_mutate_islocked() );
+		__cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue\n");
+
+		// Make sure that everything is consistent
+		/* paranoid */ check( cltr->ready_queue );
+
+		// grow the ready queue
+		with( cltr->ready_queue ) {
+			// Find new count
+			// Make sure we always have atleast 1 list
+			if(target >= 2) {
+				ncount = target * READYQ_SHARD_FACTOR;
+			} else {
+				ncount = SEQUENTIAL_SHARD;
+			}
+
+			// Allocate new array (uses realloc and memcpies the data)
+			lanes.data = alloc( ncount, lanes.data`realloc );
+
+			// Fix the moved data
+			for( idx; (size_t)lanes.count ) {
+				fix(lanes.data[idx]);
+			}
+
+			// Construct new data
+			for( idx; (size_t)lanes.count ~ ncount) {
+				(lanes.data[idx]){};
+			}
+
+			// Update original
+			lanes.count = ncount;
+		}
+
+		fix_times(cltr);
+
+		reassign_cltr_id(cltr);
+
+		// Make sure that everything is consistent
+		/* paranoid */ check( cltr->ready_queue );
+
+		__cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue done\n");
+
+		/* paranoid */ verify( ready_mutate_islocked() );
+	}
+
+	// Shrink the ready queue
+	void ready_queue_shrink(struct cluster * cltr) {
+		/* paranoid */ verify( ready_mutate_islocked() );
+		__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue\n");
+
+		// Make sure that everything is consistent
+		/* paranoid */ check( cltr->ready_queue );
+
+		int target = cltr->procs.total;
+
+		with( cltr->ready_queue ) {
+			// Remember old count
+			size_t ocount = lanes.count;
+
+			// Find new count
+			// Make sure we always have atleast 1 list
+			lanes.count = target >= 2 ? target * READYQ_SHARD_FACTOR: SEQUENTIAL_SHARD;
+			/* paranoid */ verify( ocount >= lanes.count );
+			/* paranoid */ verify( lanes.count == target * READYQ_SHARD_FACTOR || target < 2 );
+
+			// for printing count the number of displaced threads
+			#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
+				__attribute__((unused)) size_t displaced = 0;
+			#endif
+
+			// redistribute old data
+			for( idx; (size_t)lanes.count ~ ocount) {
+				// Lock is not strictly needed but makes checking invariants much easier
+				__attribute__((unused)) bool locked = __atomic_try_acquire(&lanes.data[idx].lock);
+				verify(locked);
+
+				// As long as we can pop from this lane to push the threads somewhere else in the queue
+				while(!is_empty(lanes.data[idx])) {
+					struct $thread * thrd;
+					unsigned long long _;
+					[thrd, _] = pop(lanes.data[idx]);
+
+					push(cltr, thrd, true);
+
+					// for printing count the number of displaced threads
+					#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
+						displaced++;
+					#endif
+				}
+
+				// Unlock the lane
+				__atomic_unlock(&lanes.data[idx].lock);
+
+				// TODO print the queue statistics here
+
+				^(lanes.data[idx]){};
+			}
+
+			__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue displaced %zu threads\n", displaced);
+
+			// Allocate new array (uses realloc and memcpies the data)
+			lanes.data = alloc( lanes.count, lanes.data`realloc );
+
+			// Fix the moved data
+			for( idx; (size_t)lanes.count ) {
+				fix(lanes.data[idx]);
+			}
+		}
+
+		fix_times(cltr);
+
+		reassign_cltr_id(cltr);
+
+		// Make sure that everything is consistent
+		/* paranoid */ check( cltr->ready_queue );
+
+		__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue done\n");
+		/* paranoid */ verify( ready_mutate_islocked() );
+	}
+#endif
 
 #if !defined(__CFA_NO_STATISTICS__)
@@ -710,2 +903,59 @@
 	}
 #endif
+
+
+#if   defined(CFA_HAVE_LINUX_LIBRSEQ)
+	// No definition needed
+#elif defined(CFA_HAVE_LINUX_RSEQ_H)
+
+	#if defined( __x86_64 ) || defined( __i386 )
+		#define RSEQ_SIG	0x53053053
+	#elif defined( __ARM_ARCH )
+		#ifdef __ARMEB__
+		#define RSEQ_SIG    0xf3def5e7      /* udf    #24035    ; 0x5de3 (ARMv6+) */
+		#else
+		#define RSEQ_SIG    0xe7f5def3      /* udf    #24035    ; 0x5de3 */
+		#endif
+	#endif
+
+	extern void __disable_interrupts_hard();
+	extern void __enable_interrupts_hard();
+
+	void __kernel_raw_rseq_register  (void) {
+		/* paranoid */ verify( __cfaabi_rseq.cpu_id == RSEQ_CPU_ID_UNINITIALIZED );
+
+		// int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), 0, (sigset_t *)0p, _NSIG / 8);
+		int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), 0, RSEQ_SIG);
+		if(ret != 0) {
+			int e = errno;
+			switch(e) {
+			case EINVAL: abort("KERNEL ERROR: rseq register invalid argument");
+			case ENOSYS: abort("KERNEL ERROR: rseq register not supported");
+			case EFAULT: abort("KERNEL ERROR: rseq register with invalid argument");
+			case EBUSY : abort("KERNEL ERROR: rseq register already registered");
+			case EPERM : abort("KERNEL ERROR: rseq register sig argument on unregistration does not match the signature received on registration");
+			default: abort("KERNEL ERROR: rseq register unexpected return %d", e);
+			}
+		}
+	}
+
+	void __kernel_raw_rseq_unregister(void) {
+		/* paranoid */ verify( __cfaabi_rseq.cpu_id >= 0 );
+
+		// int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, (sigset_t *)0p, _NSIG / 8);
+		int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
+		if(ret != 0) {
+			int e = errno;
+			switch(e) {
+			case EINVAL: abort("KERNEL ERROR: rseq unregister invalid argument");
+			case ENOSYS: abort("KERNEL ERROR: rseq unregister not supported");
+			case EFAULT: abort("KERNEL ERROR: rseq unregister with invalid argument");
+			case EBUSY : abort("KERNEL ERROR: rseq unregister already registered");
+			case EPERM : abort("KERNEL ERROR: rseq unregister sig argument on unregistration does not match the signature received on registration");
+			default: abort("KERNEL ERROR: rseq unregister unexpected return %d", e);
+			}
+		}
+	}
+#else
+	// No definition needed
+#endif
Index: libcfa/src/concurrency/ready_subqueue.hfa
===================================================================
--- libcfa/src/concurrency/ready_subqueue.hfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/ready_subqueue.hfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -32,5 +32,5 @@
 	this.prev = mock_head(this);
 	this.anchor.next = 0p;
-	this.anchor.ts   = 0;
+	this.anchor.ts   = -1llu;
 	#if !defined(__CFA_NO_STATISTICS__)
 		this.cnt  = 0;
@@ -44,5 +44,5 @@
 	/* paranoid */ verify( &mock_head(this)->link.ts   == &this.anchor.ts   );
 	/* paranoid */ verify( mock_head(this)->link.next == 0p );
-	/* paranoid */ verify( mock_head(this)->link.ts   == 0  );
+	/* paranoid */ verify( mock_head(this)->link.ts   == -1llu  );
 	/* paranoid */ verify( mock_head(this) == this.prev );
 	/* paranoid */ verify( __alignof__(__intrusive_lane_t) == 128 );
@@ -55,5 +55,5 @@
 	// Make sure the list is empty
 	/* paranoid */ verify( this.anchor.next == 0p );
-	/* paranoid */ verify( this.anchor.ts   == 0  );
+	/* paranoid */ verify( this.anchor.ts   == -1llu );
 	/* paranoid */ verify( mock_head(this)  == this.prev );
 }
@@ -64,13 +64,15 @@
 	/* paranoid */ verify( this.lock );
 	/* paranoid */ verify( node->link.next == 0p );
-	/* paranoid */ verify( node->link.ts   == 0  );
+	/* paranoid */ verify( node->link.ts   == -1llu  );
 	/* paranoid */ verify( this.prev->link.next == 0p );
-	/* paranoid */ verify( this.prev->link.ts   == 0  );
+	/* paranoid */ verify( this.prev->link.ts   == -1llu  );
 	if( this.anchor.next == 0p ) {
 		/* paranoid */ verify( this.anchor.next == 0p );
-		/* paranoid */ verify( this.anchor.ts   == 0  );
+		/* paranoid */ verify( this.anchor.ts   == -1llu );
+		/* paranoid */ verify( this.anchor.ts   != 0  );
 		/* paranoid */ verify( this.prev == mock_head( this ) );
 	} else {
 		/* paranoid */ verify( this.anchor.next != 0p );
+		/* paranoid */ verify( this.anchor.ts   != -1llu );
 		/* paranoid */ verify( this.anchor.ts   != 0  );
 		/* paranoid */ verify( this.prev != mock_head( this ) );
@@ -92,4 +94,5 @@
 	/* paranoid */ verify( this.lock );
 	/* paranoid */ verify( this.anchor.next != 0p );
+	/* paranoid */ verify( this.anchor.ts   != -1llu );
 	/* paranoid */ verify( this.anchor.ts   != 0  );
 
@@ -99,7 +102,7 @@
 	this.anchor.next = node->link.next;
 	this.anchor.ts   = node->link.ts;
-	bool is_empty = this.anchor.ts == 0;
+	bool is_empty = this.anchor.next == 0p;
 	node->link.next = 0p;
-	node->link.ts   = 0;
+	node->link.ts   = -1llu;
 	#if !defined(__CFA_NO_STATISTICS__)
 		this.cnt--;
@@ -110,5 +113,7 @@
 
 	/* paranoid */ verify( node->link.next == 0p );
-	/* paranoid */ verify( node->link.ts   == 0  );
+	/* paranoid */ verify( node->link.ts   == -1llu  );
+	/* paranoid */ verify( node->link.ts   != 0  );
+	/* paranoid */ verify( this.anchor.ts  != 0  );
 	return [node, ts];
 }
@@ -116,5 +121,5 @@
 // Check whether or not list is empty
 static inline bool is_empty(__intrusive_lane_t & this) {
-	return this.anchor.ts == 0;
+	return this.anchor.next == 0p;
 }
 
@@ -122,4 +127,5 @@
 static inline unsigned long long ts(__intrusive_lane_t & this) {
 	// Cannot verify here since it may not be locked
+	/* paranoid */ verify(this.anchor.ts != 0);
 	return this.anchor.ts;
 }
Index: libcfa/src/concurrency/thread.cfa
===================================================================
--- libcfa/src/concurrency/thread.cfa	(revision 5a46e09dae381c4d10eda5d14c8e5293ab0dcbb9)
+++ libcfa/src/concurrency/thread.cfa	(revision 660665fdcc1db2b22595e18697e5ba335dda9b19)
@@ -15,4 +15,5 @@
 
 #define __cforall_thread__
+#define _GNU_SOURCE
 
 #include "thread.hfa"
@@ -39,5 +40,5 @@
 	curr_cluster = &cl;
 	link.next = 0p;
-	link.ts   = 0;
+	link.ts   = -1llu;
 	preferred = -1u;
 	last_proc = 0p;
