Index: benchmark/readyQ/cycle.cfa
===================================================================
--- benchmark/readyQ/cycle.cfa	(revision 16ba4a6fa0eb60a0cd27fb96882e37dbf94834dd)
+++ benchmark/readyQ/cycle.cfa	(revision 18f0b707a42dceef0ea6f927ff708cc6bd645627)
@@ -84,5 +84,5 @@
 		}
 
-		printf("Duration (ms)        : %'ld\n", (end - start)`ms);
+		printf("Duration (ms)        : %'ld\n", (end - start)`dms);
 		printf("Number of processors : %'d\n", nprocs);
 		printf("Number of threads    : %'d\n", tthreads);
@@ -90,10 +90,10 @@
 		printf("Total Operations(ops): %'15llu\n", global_counter);
 		printf("Total blocks         : %'15llu\n", global_blocks);
-		printf("Ops per second       : %'18.2lf\n", ((double)global_counter) / (end - start)`s);
-		printf("ns per ops           : %'18.2lf\n", ((double)(end - start)`ns) / global_counter);
+		printf("Ops per second       : %'18.2lf\n", ((double)global_counter) / (end - start)`ds);
+		printf("ns per ops           : %'18.2lf\n", (end - start)`dns / global_counter);
 		printf("Ops per threads      : %'15llu\n", global_counter / tthreads);
 		printf("Ops per procs        : %'15llu\n", global_counter / nprocs);
-		printf("Ops/sec/procs        : %'18.2lf\n", (((double)global_counter) / nprocs) / (end - start)`s);
-		printf("ns per ops/procs     : %'18.2lf\n", ((double)(end - start)`ns) / (global_counter / nprocs));
+		printf("Ops/sec/procs        : %'18.2lf\n", (((double)global_counter) / nprocs) / (end - start)`ds);
+		printf("ns per ops/procs     : %'18.2lf\n", (end - start)`dns / (global_counter / nprocs));
 		fflush(stdout);
 	}
Index: benchmark/readyQ/cycle.go
===================================================================
--- benchmark/readyQ/cycle.go	(revision 16ba4a6fa0eb60a0cd27fb96882e37dbf94834dd)
+++ benchmark/readyQ/cycle.go	(revision 18f0b707a42dceef0ea6f927ff708cc6bd645627)
@@ -72,5 +72,5 @@
 	p.Printf("Cycle size (# thrds) : %d\n", ring_size);
 	p.Printf("Total Operations(ops): %15d\n", global_counter)
-	p.Printf("Yields per second    : %18.2f\n", float64(global_counter) / delta.Seconds())
+	p.Printf("Ops per second       : %18.2f\n", float64(global_counter) / delta.Seconds())
 	p.Printf("ns per ops           : %18.2f\n", float64(delta.Nanoseconds()) / float64(global_counter))
 	p.Printf("Ops per threads      : %15d\n", global_counter / uint64(tthreads))
Index: benchmark/readyQ/rq_bench.hfa
===================================================================
--- benchmark/readyQ/rq_bench.hfa	(revision 16ba4a6fa0eb60a0cd27fb96882e37dbf94834dd)
+++ benchmark/readyQ/rq_bench.hfa	(revision 18f0b707a42dceef0ea6f927ff708cc6bd645627)
@@ -88,5 +88,5 @@
 }
 
-struct bench_sem {
+struct __attribute__((aligned(128))) bench_sem {
 	struct $thread * volatile ptr;
 };
Index: benchmark/readyQ/rq_bench.hpp
===================================================================
--- benchmark/readyQ/rq_bench.hpp	(revision 16ba4a6fa0eb60a0cd27fb96882e37dbf94834dd)
+++ benchmark/readyQ/rq_bench.hpp	(revision 18f0b707a42dceef0ea6f927ff708cc6bd645627)
@@ -75,5 +75,5 @@
 }
 
-class bench_sem {
+class __attribute__((aligned(128))) bench_sem {
 	Fibre * volatile ptr = nullptr;
 public:
Index: benchmark/rmit.py
===================================================================
--- benchmark/rmit.py	(revision 16ba4a6fa0eb60a0cd27fb96882e37dbf94834dd)
+++ benchmark/rmit.py	(revision 18f0b707a42dceef0ea6f927ff708cc6bd645627)
@@ -173,5 +173,4 @@
 	# ================================================================================
 	# Prepare to run
-	print(actions)
 
 	# find expected time
@@ -226,5 +225,8 @@
 			d = [r[0], r[1]]
 			for k in headers[2:]:
-				d.append(r[2][k])
+				try:
+					d.append(r[2][k])
+				except:
+					d.append(0.0)
 
 			data.append(d)
Index: libcfa/src/concurrency/coroutine.cfa
===================================================================
--- libcfa/src/concurrency/coroutine.cfa	(revision 16ba4a6fa0eb60a0cd27fb96882e37dbf94834dd)
+++ libcfa/src/concurrency/coroutine.cfa	(revision 18f0b707a42dceef0ea6f927ff708cc6bd645627)
@@ -134,5 +134,5 @@
 void ^?{}($coroutine& this) {
 	if(this.state != Halted && this.state != Start && this.state != Primed) {
-		$coroutine * src = TL_GET( this_thread )->curr_cor;
+		$coroutine * src = active_coroutine();
 		$coroutine * dst = &this;
 
@@ -240,5 +240,5 @@
 
 	struct $coroutine * __cfactx_cor_finish(void) {
-		struct $coroutine * cor = kernelTLS.this_thread->curr_cor;
+		struct $coroutine * cor = active_coroutine();
 
 		if(cor->state == Primed) {
Index: libcfa/src/concurrency/coroutine.hfa
===================================================================
--- libcfa/src/concurrency/coroutine.hfa	(revision 16ba4a6fa0eb60a0cd27fb96882e37dbf94834dd)
+++ libcfa/src/concurrency/coroutine.hfa	(revision 18f0b707a42dceef0ea6f927ff708cc6bd645627)
@@ -63,5 +63,5 @@
 void prime(T & cor);
 
-static inline struct $coroutine * active_coroutine() { return TL_GET( this_thread )->curr_cor; }
+static inline struct $coroutine * active_coroutine() { return active_thread()->curr_cor; }
 
 //-----------------------------------------------------------------------------
@@ -87,5 +87,5 @@
 
 	// set new coroutine that task is executing
-	TL_GET( this_thread )->curr_cor = dst;
+	active_thread()->curr_cor = dst;
 
 	// context switch to specified coroutine
@@ -112,5 +112,5 @@
 		// will also migrate which means this value will
 		// stay in syn with the TLS
-		$coroutine * src = TL_GET( this_thread )->curr_cor;
+		$coroutine * src = active_coroutine();
 
 		assertf( src->last != 0,
@@ -138,12 +138,12 @@
 	// will also migrate which means this value will
 	// stay in syn with the TLS
-	$coroutine * src = TL_GET( this_thread )->curr_cor;
+	$coroutine * src = active_coroutine();
 	$coroutine * dst = get_coroutine(cor);
 
 	if( unlikely(dst->context.SP == 0p) ) {
-		TL_GET( this_thread )->curr_cor = dst;
+		active_thread()->curr_cor = dst;
 		__stack_prepare(&dst->stack, 65000);
 		__cfactx_start(main, dst, cor, __cfactx_invoke_coroutine);
-		TL_GET( this_thread )->curr_cor = src;
+		active_thread()->curr_cor = src;
 	}
 
@@ -175,5 +175,5 @@
 	// will also migrate which means this value will
 	// stay in syn with the TLS
-	$coroutine * src = TL_GET( this_thread )->curr_cor;
+	$coroutine * src = active_coroutine();
 
 	// not resuming self ?
Index: libcfa/src/concurrency/exception.cfa
===================================================================
--- libcfa/src/concurrency/exception.cfa	(revision 16ba4a6fa0eb60a0cd27fb96882e37dbf94834dd)
+++ libcfa/src/concurrency/exception.cfa	(revision 18f0b707a42dceef0ea6f927ff708cc6bd645627)
@@ -72,5 +72,5 @@
 	void * stop_param;
 
-	struct $thread * this_thread = TL_GET( this_thread );
+	struct $thread * this_thread = active_thread();
 	if ( &this_thread->self_cor != this_thread->curr_cor ) {
 		struct $coroutine * cor = this_thread->curr_cor;
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision 16ba4a6fa0eb60a0cd27fb96882e37dbf94834dd)
+++ libcfa/src/concurrency/io.cfa	(revision 18f0b707a42dceef0ea6f927ff708cc6bd645627)
@@ -76,5 +76,5 @@
 
 	static inline bool next( __leaderlock_t & this ) {
-		/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+		/* paranoid */ verify( ! __preemption_enabled() );
 		struct $thread * nextt;
 		for() {
@@ -168,5 +168,5 @@
 	// This is NOT thread-safe
 	static [int, bool] __drain_io( & struct __io_data ring ) {
-		/* paranoid */ verify( !kernelTLS.preemption_state.enabled );
+		/* paranoid */ verify( ! __preemption_enabled() );
 
 		unsigned to_submit = 0;
@@ -404,5 +404,5 @@
 					return;
 				}
-				/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+				/* paranoid */ verify( ! __preemption_enabled() );
 				__STATS__( true,
 					io.submit_q.leader += 1;
@@ -442,5 +442,5 @@
 
 			#if defined(LEADER_LOCK)
-				/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+				/* paranoid */ verify( ! __preemption_enabled() );
 				next(ring.submit_q.submit_lock);
 			#else
Index: libcfa/src/concurrency/io/setup.cfa
===================================================================
--- libcfa/src/concurrency/io/setup.cfa	(revision 16ba4a6fa0eb60a0cd27fb96882e37dbf94834dd)
+++ libcfa/src/concurrency/io/setup.cfa	(revision 18f0b707a42dceef0ea6f927ff708cc6bd645627)
@@ -149,5 +149,5 @@
 		id.full_proc = false;
 		id.id = doregister(&id);
-		kernelTLS.this_proc_id = &id;
+		__cfaabi_tls.this_proc_id = &id;
 		__cfaabi_dbg_print_safe( "Kernel : IO poller thread starting\n" );
 
@@ -179,5 +179,5 @@
 				__cfadbg_print_safe(io_core, "Kernel I/O : Unparking io poller %p\n", io_ctx);
 				#if !defined( __CFA_NO_STATISTICS__ )
-					kernelTLS.this_stats = io_ctx->self.curr_cluster->stats;
+					__cfaabi_tls.this_stats = io_ctx->self.curr_cluster->stats;
 				#endif
 				post( io_ctx->sem );
Index: libcfa/src/concurrency/iocall.cfa
===================================================================
--- libcfa/src/concurrency/iocall.cfa	(revision 16ba4a6fa0eb60a0cd27fb96882e37dbf94834dd)
+++ 	(revision )
@@ -1,621 +1,0 @@
-//
-// Cforall Version 1.0.0 Copyright (C) 2020 University of Waterloo
-//
-// The contents of this file are covered under the licence agreement in the
-// file "LICENCE" distributed with Cforall.
-//
-// iocall.cfa --
-//
-// Author           : Thierry Delisle
-// Created On       : Wed Jul  1 14:51:00 2020
-// Last Modified By :
-// Last Modified On :
-// Update Count     :
-//
-
-#define __cforall_thread__
-
-#include "bits/defs.hfa"
-#include "kernel.hfa"
-
-//=============================================================================================
-// I/O uring backend
-//=============================================================================================
-
-#if defined(CFA_HAVE_LINUX_IO_URING_H)
-	#include <assert.h>
-	#include <stdint.h>
-	#include <errno.h>
-	#include <linux/io_uring.h>
-
-	#include "kernel/fwd.hfa"
-	#include "io/types.hfa"
-
-	extern [* struct io_uring_sqe, __u32] __submit_alloc( struct __io_data & ring, __u64 data );
-	extern void __submit( struct io_context * ctx, __u32 idx ) __attribute__((nonnull (1)));
-
-	static inline void ?{}(struct io_uring_sqe & this, __u8 opcode, int fd) {
-		this.opcode = opcode;
-		#if !defined(IOSQE_ASYNC)
-			this.flags = 0;
-		#else
-			this.flags = IOSQE_ASYNC;
-		#endif
-		this.ioprio = 0;
-		this.fd = fd;
-		this.off = 0;
-		this.addr = 0;
-		this.len = 0;
-		this.rw_flags = 0;
-		this.__pad2[0] = this.__pad2[1] = this.__pad2[2] = 0;
-	}
-
-	static inline void ?{}(struct io_uring_sqe & this, __u8 opcode, int fd, void * addr, __u32 len, __u64 off ) {
-		(this){ opcode, fd };
-		this.off = off;
-		this.addr = (__u64)(uintptr_t)addr;
-		this.len = len;
-	}
-
-	static inline io_context * __get_io_context( void ) {
-		cluster * cltr = active_cluster();
-		/* paranoid */ verifyf( cltr, "No active cluster for io operation\n");
-		assertf( cltr->io.cnt > 0, "Cluster %p has no default io contexts and no context was specified\n", cltr );
-		/* paranoid */ verifyf( cltr->io.ctxs, "default io contexts for cluster %p are missing\n", cltr);
-		return &cltr->io.ctxs[ __tls_rand() % cltr->io.cnt ];
-	}
-
-
-	#if defined(CFA_HAVE_IOSQE_FIXED_FILE) && defined(CFA_HAVE_IOSQE_IO_DRAIN) && defined(CFA_HAVE_IOSQE_ASYNC)
-		#define REGULAR_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_DRAIN | IOSQE_ASYNC)
-	#elif defined(CFA_HAVE_IOSQE_FIXED_FILE) && defined(CFA_HAVE_IOSQE_ASYNC)
-		#define REGULAR_FLAGS (IOSQE_FIXED_FILE | IOSQE_ASYNC)
-	#elif defined(CFA_HAVE_IOSQE_FIXED_FILE) && defined(CFA_HAVE_IOSQE_IO_DRAIN)
-		#define REGULAR_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_DRAIN)
-	#elif defined(CFA_HAVE_IOSQE_IO_DRAIN) && defined(CFA_HAVE_IOSQE_ASYNC)
-		#define REGULAR_FLAGS (IOSQE_IO_DRAIN | IOSQE_ASYNC)
-	#elif defined(CFA_HAVE_IOSQE_FIXED_FILE)
-		#define REGULAR_FLAGS (IOSQE_FIXED_FILE)
-	#elif defined(CFA_HAVE_IOSQE_IO_DRAIN)
-		#define REGULAR_FLAGS (IOSQE_IO_DRAIN)
-	#elif defined(CFA_HAVE_IOSQE_ASYNC)
-		#define REGULAR_FLAGS (IOSQE_ASYNC)
-	#else
-		#define REGULAR_FLAGS (0)
-	#endif
-
-	#if defined(CFA_HAVE_IOSQE_IO_LINK) && defined(CFA_HAVE_IOSQE_IO_HARDLINK)
-		#define LINK_FLAGS (IOSQE_IO_LINK | IOSQE_IO_HARDLINK)
-	#elif defined(CFA_HAVE_IOSQE_IO_LINK)
-		#define LINK_FLAGS (IOSQE_IO_LINK)
-	#elif defined(CFA_HAVE_IOSQE_IO_HARDLINK)
-		#define LINK_FLAGS (IOSQE_IO_HARDLINK)
-	#else
-		#define LINK_FLAGS (0)
-	#endif
-
-	#if defined(CFA_HAVE_SPLICE_F_FD_IN_FIXED)
-		#define SPLICE_FLAGS (SPLICE_F_FD_IN_FIXED)
-	#else
-		#define SPLICE_FLAGS (0)
-	#endif
-
-	#define __submit_prelude \
-		if( 0 != (submit_flags & LINK_FLAGS) ) { errno = ENOTSUP; return -1; } \
-		(void)timeout; (void)cancellation; \
-		if( !context ) context = __get_io_context(); \
-		__io_user_data_t data = { 0 }; \
-		struct __io_data & ring = *context->thrd.ring; \
-		struct io_uring_sqe * sqe; \
-		__u32 idx; \
-		__u8 sflags = REGULAR_FLAGS & submit_flags; \
-		[sqe, idx] = __submit_alloc( ring, (__u64)(uintptr_t)&data ); \
-		sqe->flags = sflags;
-
-	#define __submit_wait \
-		/*__cfaabi_bits_print_safe( STDERR_FILENO, "Preparing user data %p for %p\n", &data, data.thrd );*/ \
-		verify( sqe->user_data == (__u64)(uintptr_t)&data ); \
-		__submit( context, idx ); \
-		wait( data.sem ); \
-		if( data.result < 0 ) { \
-			errno = -data.result; \
-			return -1; \
-		} \
-		return data.result;
-#endif
-
-//=============================================================================================
-// I/O Forwards
-//=============================================================================================
-#include <time.hfa>
-
-// Some forward declarations
-#include <errno.h>
-#include <unistd.h>
-
-extern "C" {
-	#include <sys/types.h>
-	#include <sys/socket.h>
-	#include <sys/syscall.h>
-
-#if defined(HAVE_PREADV2)
-	struct iovec;
-	extern ssize_t preadv2 (int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
-#endif
-#if defined(HAVE_PWRITEV2)
-	struct iovec;
-	extern ssize_t pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
-#endif
-
-	extern int fsync(int fd);
-
-	#if __OFF_T_MATCHES_OFF64_T
-		typedef __off64_t off_t;
-	#else
-		typedef __off_t off_t;
-	#endif
-	typedef __off64_t off64_t;
-	extern int sync_file_range(int fd, off64_t offset, off64_t nbytes, unsigned int flags);
-
-	struct msghdr;
-	struct sockaddr;
-	extern ssize_t sendmsg(int sockfd, const struct msghdr *msg, int flags);
-	extern ssize_t recvmsg(int sockfd, struct msghdr *msg, int flags);
-	extern ssize_t send(int sockfd, const void *buf, size_t len, int flags);
-	extern ssize_t recv(int sockfd, void *buf, size_t len, int flags);
-	extern int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
-	extern int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
-
-	extern int fallocate(int fd, int mode, off_t offset, off_t len);
-	extern int posix_fadvise(int fd, off_t offset, off_t len, int advice);
-	extern int madvise(void *addr, size_t length, int advice);
-
-	extern int openat(int dirfd, const char *pathname, int flags, mode_t mode);
-	extern int close(int fd);
-
-	extern ssize_t read (int fd, void *buf, size_t count);
-
-	extern ssize_t splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags);
-	extern ssize_t tee(int fd_in, int fd_out, size_t len, unsigned int flags);
-}
-
-//=============================================================================================
-// I/O Interface
-//=============================================================================================
-
-//-----------------------------------------------------------------------------
-// Asynchronous operations
-#if defined(HAVE_PREADV2)
-	ssize_t cfa_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-		#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_READV)
-			return preadv2(fd, iov, iovcnt, offset, flags);
-		#else
-			__submit_prelude
-
-			sqe->opcode = IORING_OP_READV;
-			sqe->ioprio = 0;
-			sqe->fd = fd;
-			sqe->off = offset;
-			sqe->addr = (__u64)iov;
-			sqe->len = iovcnt;
-			sqe->rw_flags = 0;
-			sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
-
-			__submit_wait
-		#endif
-	}
-#endif
-
-#if defined(HAVE_PWRITEV2)
-	ssize_t cfa_pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-		#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_WRITEV)
-			return pwritev2(fd, iov, iovcnt, offset, flags);
-		#else
-			__submit_prelude
-
-			sqe->opcode = IORING_OP_WRITEV;
-			sqe->ioprio = 0;
-			sqe->fd = fd;
-			sqe->off = offset;
-			sqe->addr = (__u64)iov;
-			sqe->len = iovcnt;
-			sqe->rw_flags = 0;
-			sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
-
-			__submit_wait
-		#endif
-	}
-#endif
-
-int cfa_fsync(int fd, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_FSYNC)
-		return fsync(fd);
-	#else
-		__submit_prelude
-
-		sqe->opcode = IORING_OP_FSYNC;
-		sqe->ioprio = 0;
-		sqe->fd = fd;
-		sqe->off = 0;
-		sqe->addr = 0;
-		sqe->len = 0;
-		sqe->rw_flags = 0;
-		sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
-
-		__submit_wait
-	#endif
-}
-
-int cfa_sync_file_range(int fd, off64_t offset, off64_t nbytes, unsigned int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_SYNC_FILE_RANGE)
-		return sync_file_range(fd, offset, nbytes, flags);
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_SYNC_FILE_RANGE, fd };
-		sqe->off = offset;
-		sqe->len = nbytes;
-		sqe->sync_range_flags = flags;
-
-		__submit_wait
-	#endif
-}
-
-
-ssize_t cfa_sendmsg(int sockfd, const struct msghdr *msg, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_SENDMSG)
-		return sendmsg(sockfd, msg, flags);
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_SENDMSG, sockfd, msg, 1, 0 };
-		sqe->msg_flags = flags;
-
-		__submit_wait
-	#endif
-}
-
-ssize_t cfa_recvmsg(int sockfd, struct msghdr *msg, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_RECVMSG)
-		return recvmsg(sockfd, msg, flags);
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_RECVMSG, sockfd, msg, 1, 0 };
-		sqe->msg_flags = flags;
-
-		__submit_wait
-	#endif
-}
-
-ssize_t cfa_send(int sockfd, const void *buf, size_t len, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_SEND)
-		return send( sockfd, buf, len, flags );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_SEND, sockfd };
-		sqe->addr = (__u64)buf;
-		sqe->len = len;
-		sqe->msg_flags = flags;
-
-		__submit_wait
-	#endif
-}
-
-ssize_t cfa_recv(int sockfd, void *buf, size_t len, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_RECV)
-		return recv( sockfd, buf, len, flags );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_RECV, sockfd };
-		sqe->addr = (__u64)buf;
-		sqe->len = len;
-		sqe->msg_flags = flags;
-
-		__submit_wait
-	#endif
-}
-
-int cfa_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_ACCEPT)
-		return accept4( sockfd, addr, addrlen, flags );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_ACCEPT, sockfd };
-		sqe->addr  = (__u64)addr;
-		sqe->addr2 = (__u64)addrlen;
-		sqe->accept_flags = flags;
-
-		__submit_wait
-	#endif
-}
-
-int cfa_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_CONNECT)
-		return connect( sockfd, addr, addrlen );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_CONNECT, sockfd };
-		sqe->addr = (__u64)addr;
-		sqe->off  = (__u64)addrlen;
-
-		__submit_wait
-	#endif
-}
-
-int cfa_fallocate(int fd, int mode, off_t offset, off_t len, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_FALLOCATE)
-		return fallocate( fd, mode, offset, len );
-	#else
-		__submit_prelude
-
-		#warning FALLOCATE documentation for linux 5.7 is incorrect, and does not handle mode
-
-		(*sqe){ IORING_OP_FALLOCATE, fd };
-		sqe->off = offset;
-		sqe->len = mode;
-		sqe->addr = len;
-
-		__submit_wait
-	#endif
-}
-
-int cfa_fadvise(int fd, off_t offset, off_t len, int advice, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_FADVISE)
-		return posix_fadvise( fd, offset, len, advice );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_FADVISE, fd };
-		sqe->off = (__u64)offset;
-		sqe->len = len;
-		sqe->fadvise_advice = advice;
-
-		__submit_wait
-	#endif
-}
-
-int cfa_madvise(void *addr, size_t length, int advice, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_MADVISE)
-		return madvise( addr, length, advice );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_MADVISE, 0 };
-		sqe->addr = (__u64)addr;
-		sqe->len = length;
-		sqe->fadvise_advice = advice;
-
-		__submit_wait
-	#endif
-}
-
-int cfa_openat(int dirfd, const char *pathname, int flags, mode_t mode, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_OPENAT)
-		return openat( dirfd, pathname, flags, mode );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_OPENAT, dirfd };
-		sqe->addr = (__u64)pathname;
-		sqe->open_flags = flags;
-		sqe->len = mode;
-
-		__submit_wait
-	#endif
-}
-
-int cfa_close(int fd, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_CLOSE)
-		return close( fd );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_CLOSE, fd };
-
-		__submit_wait
-	#endif
-}
-
-// Forward declare in case it is not supported
-struct statx;
-int cfa_statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_STATX)
-		#if defined(__NR_statx)
-			return syscall( __NR_statx, dirfd, pathname, flags, mask, statxbuf );
-		#else
-			errno = ENOTSUP;
-			return -1;
-		#endif
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_STATX, dirfd, pathname, mask, (__u64)statxbuf };
-		sqe->statx_flags = flags;
-
-		__submit_wait
-	#endif
-}
-
-ssize_t cfa_read(int fd, void *buf, size_t count, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_READ)
-		return read( fd, buf, count );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_READ, fd, buf, count, 0 };
-
-		__submit_wait
-	#endif
-}
-
-ssize_t cfa_write(int fd, void *buf, size_t count, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_WRITE)
-		return read( fd, buf, count );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_WRITE, fd, buf, count, 0 };
-
-		__submit_wait
-	#endif
-}
-
-ssize_t cfa_splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_SPLICE)
-		return splice( fd_in, off_in, fd_out, off_out, len, flags );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_SPLICE, fd_out };
-		if( off_out ) {
-			sqe->off = *off_out;
-		}
-		else {
-			sqe->off = (__u64)-1;
-		}
-		sqe->len = len;
-		sqe->splice_fd_in  = fd_in;
-		if( off_in ) {
-			sqe->splice_off_in = *off_in;
-		}
-		else {
-			sqe->splice_off_in = (__u64)-1;
-		}
-		sqe->splice_flags  = flags | (SPLICE_FLAGS & submit_flags);
-
-		__submit_wait
-	#endif
-}
-
-ssize_t cfa_tee(int fd_in, int fd_out, size_t len, unsigned int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_TEE)
-		return tee( fd_in, fd_out, len, flags );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_TEE, fd_out, 0p, len, 0 };
-		sqe->splice_fd_in = fd_in;
-		sqe->splice_flags  = flags | (SPLICE_FLAGS & submit_flags);
-
-		__submit_wait
-	#endif
-}
-
-//-----------------------------------------------------------------------------
-// Check if a function is asynchronous
-
-// Macro magic to reduce the size of the following switch case
-#define IS_DEFINED_APPLY(f, ...) f(__VA_ARGS__)
-#define IS_DEFINED_SECOND(first, second, ...) second
-#define IS_DEFINED_TEST(expansion) _CFA_IO_FEATURE_##expansion
-#define IS_DEFINED(macro) IS_DEFINED_APPLY( IS_DEFINED_SECOND,IS_DEFINED_TEST(macro) false, true)
-
-bool has_user_level_blocking( fptr_t func ) {
-	#if defined(CFA_HAVE_LINUX_IO_URING_H)
-		#if defined(HAVE_PREADV2)
-			if( /*func == (fptr_t)preadv2 || */
-				func == (fptr_t)cfa_preadv2 )
-				#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_READV ,
-				return IS_DEFINED(CFA_HAVE_IORING_OP_READV);
-		#endif
-
-		#if defined(HAVE_PWRITEV2)
-			if( /*func == (fptr_t)pwritev2 || */
-				func == (fptr_t)cfa_pwritev2 )
-				#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_WRITEV ,
-				return IS_DEFINED(CFA_HAVE_IORING_OP_WRITEV);
-		#endif
-
-		if( /*func == (fptr_t)fsync || */
-			func == (fptr_t)cfa_fsync )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_FSYNC ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_FSYNC);
-
-		if( /*func == (fptr_t)ync_file_range || */
-			func == (fptr_t)cfa_sync_file_range )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_SYNC_FILE_RANGE ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_SYNC_FILE_RANGE);
-
-		if( /*func == (fptr_t)sendmsg || */
-			func == (fptr_t)cfa_sendmsg )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_SENDMSG ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_SENDMSG);
-
-		if( /*func == (fptr_t)recvmsg || */
-			func == (fptr_t)cfa_recvmsg )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_RECVMSG ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_RECVMSG);
-
-		if( /*func == (fptr_t)send || */
-			func == (fptr_t)cfa_send )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_SEND ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_SEND);
-
-		if( /*func == (fptr_t)recv || */
-			func == (fptr_t)cfa_recv )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_RECV ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_RECV);
-
-		if( /*func == (fptr_t)accept4 || */
-			func == (fptr_t)cfa_accept4 )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_ACCEPT ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_ACCEPT);
-
-		if( /*func == (fptr_t)connect || */
-			func == (fptr_t)cfa_connect )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_CONNECT ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_CONNECT);
-
-		if( /*func == (fptr_t)fallocate || */
-			func == (fptr_t)cfa_fallocate )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_FALLOCATE ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_FALLOCATE);
-
-		if( /*func == (fptr_t)posix_fadvise || */
-			func == (fptr_t)cfa_fadvise )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_FADVISE ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_FADVISE);
-
-		if( /*func == (fptr_t)madvise || */
-			func == (fptr_t)cfa_madvise )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_MADVISE ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_MADVISE);
-
-		if( /*func == (fptr_t)openat || */
-			func == (fptr_t)cfa_openat )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_OPENAT ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_OPENAT);
-
-		if( /*func == (fptr_t)close || */
-			func == (fptr_t)cfa_close )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_CLOSE ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_CLOSE);
-
-		if( /*func == (fptr_t)read || */
-			func == (fptr_t)cfa_read )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_READ ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_READ);
-
-		if( /*func == (fptr_t)write || */
-			func == (fptr_t)cfa_write )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_WRITE ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_WRITE);
-
-		if( /*func == (fptr_t)splice || */
-			func == (fptr_t)cfa_splice )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_SPLICE ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_SPLICE);
-
-		if( /*func == (fptr_t)tee || */
-			func == (fptr_t)cfa_tee )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_TEE ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_TEE);
-	#endif
-
-	return false;
-}
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 16ba4a6fa0eb60a0cd27fb96882e37dbf94834dd)
+++ libcfa/src/concurrency/kernel.cfa	(revision 18f0b707a42dceef0ea6f927ff708cc6bd645627)
@@ -122,6 +122,6 @@
 	// Because of a bug, we couldn't initialized the seed on construction
 	// Do it here
-	kernelTLS.rand_seed ^= rdtscl();
-	kernelTLS.ready_rng.fwd_seed = 25214903917_l64u * (rdtscl() ^ (uintptr_t)&runner);
+	__cfaabi_tls.rand_seed ^= rdtscl();
+	__cfaabi_tls.ready_rng.fwd_seed = 25214903917_l64u * (rdtscl() ^ (uintptr_t)&runner);
 	__tls_rand_advance_bck();
 
@@ -217,5 +217,5 @@
 		// and it make sense for it to be set in all other cases except here
 		// fake it
-		kernelTLS.this_thread = mainThread;
+		__cfaabi_tls.this_thread = mainThread;
 	}
 
@@ -230,5 +230,5 @@
 // from the processor coroutine to the target thread
 static void __run_thread(processor * this, $thread * thrd_dst) {
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	/* paranoid */ verifyf( thrd_dst->state == Ready || thrd_dst->preempted != __NO_PREEMPTION, "state : %d, preempted %d\n", thrd_dst->state, thrd_dst->preempted);
 	/* paranoid */ verifyf( thrd_dst->link.next == 0p, "Expected null got %p", thrd_dst->link.next );
@@ -247,8 +247,8 @@
 
 		// Update global state
-		kernelTLS.this_thread = thrd_dst;
-
-		/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-		/* paranoid */ verify( kernelTLS.this_thread == thrd_dst );
+		kernelTLS().this_thread = thrd_dst;
+
+		/* paranoid */ verify( ! __preemption_enabled() );
+		/* paranoid */ verify( kernelTLS().this_thread == thrd_dst );
 		/* paranoid */ verify( thrd_dst->context.SP );
 		/* paranoid */ verify( thrd_dst->state != Halted );
@@ -267,9 +267,9 @@
 		/* paranoid */ verifyf( ((uintptr_t)thrd_dst->context.SP) < ((uintptr_t)__get_stack(thrd_dst->curr_cor)->base ), "ERROR : Destination $thread %p has been corrupted.\n StackPointer too small.\n", thrd_dst );
 		/* paranoid */ verify( thrd_dst->context.SP );
-		/* paranoid */ verify( kernelTLS.this_thread == thrd_dst );
-		/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+		/* paranoid */ verify( kernelTLS().this_thread == thrd_dst );
+		/* paranoid */ verify( ! __preemption_enabled() );
 
 		// Reset global state
-		kernelTLS.this_thread = 0p;
+		kernelTLS().this_thread = 0p;
 
 		// We just finished running a thread, there are a few things that could have happened.
@@ -315,15 +315,15 @@
 	proc_cor->state = Active;
 
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 }
 
 // KERNEL_ONLY
 void returnToKernel() {
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-	$coroutine * proc_cor = get_coroutine(kernelTLS.this_processor->runner);
-	$thread * thrd_src = kernelTLS.this_thread;
+	/* paranoid */ verify( ! __preemption_enabled() );
+	$coroutine * proc_cor = get_coroutine(kernelTLS().this_processor->runner);
+	$thread * thrd_src = kernelTLS().this_thread;
 
 	#if !defined(__CFA_NO_STATISTICS__)
-		struct processor * last_proc = kernelTLS.this_processor;
+		struct processor * last_proc = kernelTLS().this_processor;
 	#endif
 
@@ -345,10 +345,10 @@
 
 	#if !defined(__CFA_NO_STATISTICS__)
-		if(last_proc != kernelTLS.this_processor) {
+		if(last_proc != kernelTLS().this_processor) {
 			__tls_stats()->ready.threads.migration++;
 		}
 	#endif
 
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	/* paranoid */ verifyf( ((uintptr_t)thrd_src->context.SP) < ((uintptr_t)__get_stack(thrd_src->curr_cor)->base ), "ERROR : Returning $thread %p has been corrupted.\n StackPointer too small.\n", thrd_src );
 	/* paranoid */ verifyf( ((uintptr_t)thrd_src->context.SP) > ((uintptr_t)__get_stack(thrd_src->curr_cor)->limit), "ERROR : Returning $thread %p has been corrupted.\n StackPointer too large.\n", thrd_src );
@@ -359,8 +359,8 @@
 // KERNEL ONLY
 void __schedule_thread( $thread * thrd ) {
+	/* paranoid */ verify( ! __preemption_enabled() );
 	/* paranoid */ verify( thrd );
 	/* paranoid */ verify( thrd->state != Halted );
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-	/* paranoid */ verify( kernelTLS.this_proc_id );
+	/* paranoid */ verify( kernelTLS().this_proc_id );
 	/* paranoid */ #if defined( __CFA_WITH_VERIFY__ )
 	/* paranoid */ 	if( thrd->state == Blocked || thrd->state == Start ) assertf( thrd->preempted == __NO_PREEMPTION,
@@ -380,11 +380,11 @@
 	ready_schedule_unlock();
 
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 }
 
 // KERNEL ONLY
 static inline $thread * __next_thread(cluster * this) with( *this ) {
-	/* paranoid */ verify( kernelTLS.this_proc_id );
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
+	/* paranoid */ verify( kernelTLS().this_proc_id );
 
 	ready_schedule_lock();
@@ -392,6 +392,6 @@
 	ready_schedule_unlock();
 
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-	/* paranoid */ verify( kernelTLS.this_proc_id );
+	/* paranoid */ verify( kernelTLS().this_proc_id );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	return thrd;
 }
@@ -399,6 +399,6 @@
 // KERNEL ONLY
 static inline $thread * __next_thread_slow(cluster * this) with( *this ) {
-	/* paranoid */ verify( kernelTLS.this_proc_id );
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
+	/* paranoid */ verify( kernelTLS().this_proc_id );
 
 	ready_schedule_lock();
@@ -406,6 +406,6 @@
 	ready_schedule_unlock();
 
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-	/* paranoid */ verify( kernelTLS.this_proc_id );
+	/* paranoid */ verify( kernelTLS().this_proc_id );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	return thrd;
 }
@@ -414,9 +414,4 @@
 	if( !thrd ) return;
 
-	/* paranoid */ verify( kernelTLS.this_proc_id );
-	bool full = kernelTLS.this_proc_id->full_proc;
-	if(full) disable_interrupts();
-
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
 	int old_ticket = __atomic_fetch_add(&thrd->ticket, 1, __ATOMIC_SEQ_CST);
 	switch(old_ticket) {
@@ -428,6 +423,20 @@
 			/* paranoid */ verify( thrd->state == Blocked );
 
-			// Wake lost the race,
-			__schedule_thread( thrd );
+			{
+				/* paranoid */ verify( publicTLS_get(this_proc_id) );
+				bool full = publicTLS_get(this_proc_id)->full_proc;
+				if(full) disable_interrupts();
+
+				/* paranoid */ verify( ! __preemption_enabled() );
+
+				// Wake lost the race,
+				__schedule_thread( thrd );
+
+				/* paranoid */ verify( ! __preemption_enabled() );
+
+				if(full) enable_interrupts( __cfaabi_dbg_ctx );
+				/* paranoid */ verify( publicTLS_get(this_proc_id) );
+			}
+
 			break;
 		default:
@@ -435,21 +444,17 @@
 			abort("Thread %p (%s) has mismatch park/unpark\n", thrd, thrd->self_cor.name);
 	}
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-
-	if(full) enable_interrupts( __cfaabi_dbg_ctx );
-	/* paranoid */ verify( kernelTLS.this_proc_id );
 }
 
 void park( void ) {
-	/* paranoid */ verify( kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( __preemption_enabled() );
 	disable_interrupts();
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-	/* paranoid */ verify( kernelTLS.this_thread->preempted == __NO_PREEMPTION );
+	/* paranoid */ verify( ! __preemption_enabled() );
+	/* paranoid */ verify( kernelTLS().this_thread->preempted == __NO_PREEMPTION );
 
 	returnToKernel();
 
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	enable_interrupts( __cfaabi_dbg_ctx );
-	/* paranoid */ verify( kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( __preemption_enabled() );
 
 }
@@ -460,5 +465,5 @@
 	// Should never return
 	void __cfactx_thrd_leave() {
-		$thread * thrd = TL_GET( this_thread );
+		$thread * thrd = active_thread();
 		$monitor * this = &thrd->self_mon;
 
@@ -473,5 +478,5 @@
 
 		// Leave the thread
-		/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+		/* paranoid */ verify( ! __preemption_enabled() );
 		returnToKernel();
 		abort();
@@ -483,9 +488,9 @@
 // KERNEL ONLY
 bool force_yield( __Preemption_Reason reason ) {
-	/* paranoid */ verify( kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( __preemption_enabled() );
 	disable_interrupts();
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-
-	$thread * thrd = kernelTLS.this_thread;
+	/* paranoid */ verify( ! __preemption_enabled() );
+
+	$thread * thrd = kernelTLS().this_thread;
 	/* paranoid */ verify(thrd->state == Active);
 
@@ -501,7 +506,7 @@
 	}
 
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	enable_interrupts_noPoll();
-	/* paranoid */ verify( kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( __preemption_enabled() );
 
 	return preempted;
@@ -513,5 +518,5 @@
 // Wake a thread from the front if there are any
 static void __wake_one(cluster * this) {
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	/* paranoid */ verify( ready_schedule_islocked() );
 
@@ -533,5 +538,5 @@
 
 	/* paranoid */ verify( ready_schedule_islocked() );
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 
 	return;
@@ -543,5 +548,5 @@
 
 	disable_interrupts();
-		/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+		/* paranoid */ verify( ! __preemption_enabled() );
 		post( this->idle );
 	enable_interrupts( __cfaabi_dbg_ctx );
@@ -549,5 +554,5 @@
 
 static void push  (__cluster_idles & this, processor & proc) {
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	lock( this );
 		this.idle++;
@@ -556,9 +561,9 @@
 		insert_first(this.list, proc);
 	unlock( this );
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 }
 
 static void remove(__cluster_idles & this, processor & proc) {
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	lock( this );
 		this.idle--;
@@ -567,5 +572,5 @@
 		remove(proc);
 	unlock( this );
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 }
 
@@ -611,5 +616,5 @@
 	}
 
-	return kernelTLS.this_thread;
+	return __cfaabi_tls.this_thread;
 }
 
@@ -636,5 +641,5 @@
 
 int kernel_abort_lastframe( void ) __attribute__ ((__nothrow__)) {
-	return get_coroutine(kernelTLS.this_thread) == get_coroutine(mainThread) ? 4 : 2;
+	return get_coroutine(kernelTLS().this_thread) == get_coroutine(mainThread) ? 4 : 2;
 }
 
@@ -668,5 +673,5 @@
 	if ( count < 0 ) {
 		// queue current task
-		append( waiting, kernelTLS.this_thread );
+		append( waiting, active_thread() );
 
 		// atomically release spin lock and block
@@ -718,5 +723,5 @@
 		void __cfaabi_dbg_record_lock(__spinlock_t & this, const char prev_name[]) {
 			this.prev_name = prev_name;
-			this.prev_thrd = kernelTLS.this_thread;
+			this.prev_thrd = kernelTLS().this_thread;
 		}
 	}
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 16ba4a6fa0eb60a0cd27fb96882e37dbf94834dd)
+++ libcfa/src/concurrency/kernel.hfa	(revision 18f0b707a42dceef0ea6f927ff708cc6bd645627)
@@ -275,8 +275,10 @@
 static inline [cluster *&, cluster *& ] __get( cluster & this ) __attribute__((const)) { return this.node.[next, prev]; }
 
-static inline struct processor * active_processor() { return TL_GET( this_processor ); } // UNSAFE
-static inline struct cluster   * active_cluster  () { return TL_GET( this_processor )->cltr; }
+static inline struct processor * active_processor() { return publicTLS_get( this_processor ); } // UNSAFE
+static inline struct cluster   * active_cluster  () { return publicTLS_get( this_processor )->cltr; }
 
 #if !defined(__CFA_NO_STATISTICS__)
+	void print_stats_now( cluster & this, int flags );
+
 	static inline void print_stats_at_exit( cluster & this, int flags ) {
 		this.print_stats |= flags;
Index: libcfa/src/concurrency/kernel/fwd.hfa
===================================================================
--- libcfa/src/concurrency/kernel/fwd.hfa	(revision 16ba4a6fa0eb60a0cd27fb96882e37dbf94834dd)
+++ libcfa/src/concurrency/kernel/fwd.hfa	(revision 18f0b707a42dceef0ea6f927ff708cc6bd645627)
@@ -55,13 +55,24 @@
 				uint64_t bck_seed;
 			} ready_rng;
-		} kernelTLS __attribute__ ((tls_model ( "initial-exec" )));
+		} __cfaabi_tls __attribute__ ((tls_model ( "initial-exec" )));
 
+		extern bool __preemption_enabled();
 
+		static inline KernelThreadData & kernelTLS( void ) {
+			/* paranoid */ verify( ! __preemption_enabled() );
+			return __cfaabi_tls;
+		}
+
+		extern uintptr_t __cfatls_get( unsigned long int member );
+		// #define publicTLS_get( member ) ((typeof(__cfaabi_tls.member))__cfatls_get( __builtin_offsetof(KernelThreadData, member) ))
+		#define publicTLS_get( member ) (__cfaabi_tls.member)
+		// extern forall(otype T) T __cfatls_get( T * member, T value );
+		// #define publicTLS_set( member, value ) __cfatls_set( (typeof(member)*)__builtin_offsetof(KernelThreadData, member), value );
 
 		static inline uint64_t __tls_rand() {
 			#if defined(__SIZEOF_INT128__)
-				return __lehmer64( kernelTLS.rand_seed );
+				return __lehmer64( kernelTLS().rand_seed );
 			#else
-				return __xorshift64( kernelTLS.rand_seed );
+				return __xorshift64( kernelTLS().rand_seed );
 			#endif
 		}
@@ -75,11 +86,11 @@
 		static inline unsigned __tls_rand_fwd() {
 
-			kernelTLS.ready_rng.fwd_seed = (A * kernelTLS.ready_rng.fwd_seed + C) & (M - 1);
-			return kernelTLS.ready_rng.fwd_seed >> D;
+			kernelTLS().ready_rng.fwd_seed = (A * kernelTLS().ready_rng.fwd_seed + C) & (M - 1);
+			return kernelTLS().ready_rng.fwd_seed >> D;
 		}
 
 		static inline unsigned __tls_rand_bck() {
-			unsigned int r = kernelTLS.ready_rng.bck_seed >> D;
-			kernelTLS.ready_rng.bck_seed = AI * (kernelTLS.ready_rng.bck_seed - C) & (M - 1);
+			unsigned int r = kernelTLS().ready_rng.bck_seed >> D;
+			kernelTLS().ready_rng.bck_seed = AI * (kernelTLS().ready_rng.bck_seed - C) & (M - 1);
 			return r;
 		}
@@ -92,25 +103,9 @@
 
 		static inline void __tls_rand_advance_bck(void) {
-			kernelTLS.ready_rng.bck_seed = kernelTLS.ready_rng.fwd_seed;
+			kernelTLS().ready_rng.bck_seed = kernelTLS().ready_rng.fwd_seed;
 		}
 	}
 
-	#if 0 // def __ARM_ARCH
-		// function prototypes are only really used by these macros on ARM
-		void disable_global_interrupts();
-		void enable_global_interrupts();
 
-		#define TL_GET( member ) ( { __typeof__( kernelTLS.member ) target; \
-			disable_global_interrupts(); \
-			target = kernelTLS.member; \
-			enable_global_interrupts(); \
-			target; } )
-		#define TL_SET( member, value ) disable_global_interrupts(); \
-			kernelTLS.member = value; \
-			enable_global_interrupts();
-	#else
-		#define TL_GET( member ) kernelTLS.member
-		#define TL_SET( member, value ) kernelTLS.member = value;
-	#endif
 
 	extern void disable_interrupts();
@@ -121,5 +116,9 @@
 		extern void park( void );
 		extern void unpark( struct $thread * this );
-		static inline struct $thread * active_thread () { return TL_GET( this_thread ); }
+		static inline struct $thread * active_thread () {
+			struct $thread * t = publicTLS_get( this_thread );
+			/* paranoid */ verify( t );
+			return t;
+		}
 
 		extern bool force_yield( enum __Preemption_Reason );
@@ -140,7 +139,7 @@
 		#if !defined(__CFA_NO_STATISTICS__)
 			static inline struct __stats_t * __tls_stats() {
-				/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-				/* paranoid */ verify( kernelTLS.this_stats );
-				return kernelTLS.this_stats;
+				/* paranoid */ verify( ! __preemption_enabled() );
+				/* paranoid */ verify( kernelTLS().this_stats );
+				return kernelTLS().this_stats;
 			}
 
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision 16ba4a6fa0eb60a0cd27fb96882e37dbf94834dd)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision 18f0b707a42dceef0ea6f927ff708cc6bd645627)
@@ -118,5 +118,5 @@
 //-----------------------------------------------------------------------------
 // Global state
-thread_local struct KernelThreadData kernelTLS __attribute__ ((tls_model ( "initial-exec" ))) @= {
+thread_local struct KernelThreadData __cfaabi_tls __attribute__ ((tls_model ( "initial-exec" ))) @= {
 	NULL,												// cannot use 0p
 	NULL,
@@ -156,5 +156,5 @@
 // Kernel boot procedures
 static void __kernel_startup(void) {
-	verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	__cfadbg_print_safe(runtime_core, "Kernel : Starting\n");
 
@@ -212,11 +212,11 @@
 
 	//initialize the global state variables
-	kernelTLS.this_processor = mainProcessor;
-	kernelTLS.this_proc_id   = (__processor_id_t*)mainProcessor;
-	kernelTLS.this_thread    = mainThread;
+	__cfaabi_tls.this_processor = mainProcessor;
+	__cfaabi_tls.this_proc_id   = (__processor_id_t*)mainProcessor;
+	__cfaabi_tls.this_thread    = mainThread;
 
 	#if !defined( __CFA_NO_STATISTICS__ )
-		kernelTLS.this_stats = (__stats_t *)& storage_mainProcStats;
-		__init_stats( kernelTLS.this_stats );
+		__cfaabi_tls.this_stats = (__stats_t *)& storage_mainProcStats;
+		__init_stats( __cfaabi_tls.this_stats );
 	#endif
 
@@ -234,5 +234,5 @@
 	// context. Hence, the main thread does not begin through __cfactx_invoke_thread, like all other threads. The trick here is that
 	// mainThread is on the ready queue when this call is made.
-	__kernel_first_resume( kernelTLS.this_processor );
+	__kernel_first_resume( __cfaabi_tls.this_processor );
 
 
@@ -251,7 +251,8 @@
 	__cfadbg_print_safe(runtime_core, "Kernel : Started\n--------------------------------------------------\n\n");
 
-	verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	enable_interrupts( __cfaabi_dbg_ctx );
-	verify( TL_GET( preemption_state.enabled ) );
+	/* paranoid */ verify( __preemption_enabled() );
+
 }
 
@@ -262,7 +263,7 @@
 	mainCluster->io.ctxs = 0p;
 
-	/* paranoid */ verify( TL_GET( preemption_state.enabled ) );
+	/* paranoid */ verify( __preemption_enabled() );
 	disable_interrupts();
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 
 	__cfadbg_print_safe(runtime_core, "\n--------------------------------------------------\nKernel : Shutting down\n");
@@ -272,5 +273,5 @@
 	// which is currently here
 	__atomic_store_n(&mainProcessor->do_terminate, true, __ATOMIC_RELEASE);
-	__kernel_last_resume( kernelTLS.this_processor );
+	__kernel_last_resume( __cfaabi_tls.this_processor );
 	mainThread->self_cor.state = Halted;
 
@@ -321,12 +322,12 @@
 		__stats_t local_stats;
 		__init_stats( &local_stats );
-		kernelTLS.this_stats = &local_stats;
+		__cfaabi_tls.this_stats = &local_stats;
 	#endif
 
 	processor * proc = (processor *) arg;
-	kernelTLS.this_processor = proc;
-	kernelTLS.this_proc_id   = (__processor_id_t*)proc;
-	kernelTLS.this_thread    = 0p;
-	kernelTLS.preemption_state.[enabled, disable_count] = [false, 1];
+	__cfaabi_tls.this_processor = proc;
+	__cfaabi_tls.this_proc_id   = (__processor_id_t*)proc;
+	__cfaabi_tls.this_thread    = 0p;
+	__cfaabi_tls.preemption_state.[enabled, disable_count] = [false, 1];
 	// SKULLDUGGERY: We want to create a context for the processor coroutine
 	// which is needed for the 2-step context switch. However, there is no reason
@@ -340,5 +341,5 @@
 
 	//Set global state
-	kernelTLS.this_thread = 0p;
+	__cfaabi_tls.this_thread = 0p;
 
 	//We now have a proper context from which to schedule threads
@@ -370,11 +371,11 @@
 	$coroutine * dst = get_coroutine(this->runner);
 
-	verify( ! kernelTLS.preemption_state.enabled );
-
-	kernelTLS.this_thread->curr_cor = dst;
+	/* paranoid */ verify( ! __preemption_enabled() );
+
+	__cfaabi_tls.this_thread->curr_cor = dst;
 	__stack_prepare( &dst->stack, 65000 );
 	__cfactx_start(main, dst, this->runner, __cfactx_invoke_coroutine);
 
-	verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 
 	dst->last = &src->self_cor;
@@ -394,5 +395,5 @@
 	/* paranoid */ verify(src->state == Active);
 
-	verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 }
 
@@ -402,7 +403,7 @@
 	$coroutine * dst = get_coroutine(this->runner);
 
-	verify( ! kernelTLS.preemption_state.enabled );
-	verify( dst->starter == src );
-	verify( dst->context.SP );
+	/* paranoid */ verify( ! __preemption_enabled() );
+	/* paranoid */ verify( dst->starter == src );
+	/* paranoid */ verify( dst->context.SP );
 
 	// SKULLDUGGERY in debug the processors check that the
@@ -546,5 +547,5 @@
 
 		P( terminated );
-		verify( kernelTLS.this_processor != &this);
+		/* paranoid */ verify( active_processor() != &this);
 	}
 
@@ -696,5 +697,5 @@
 #if defined(__CFA_WITH_VERIFY__)
 static bool verify_fwd_bck_rng(void) {
-	kernelTLS.ready_rng.fwd_seed = 25214903917_l64u * (rdtscl() ^ (uintptr_t)&verify_fwd_bck_rng);
+	__cfaabi_tls.ready_rng.fwd_seed = 25214903917_l64u * (rdtscl() ^ (uintptr_t)&verify_fwd_bck_rng);
 
 	unsigned values[10];
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 16ba4a6fa0eb60a0cd27fb96882e37dbf94834dd)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision 18f0b707a42dceef0ea6f927ff708cc6bd645627)
@@ -38,4 +38,6 @@
 #endif
 ;
+
+extern bool __preemption_enabled();
 
 //release/wake-up the following resources
@@ -181,8 +183,9 @@
 //  creating/destroying queues
 static inline void ready_schedule_lock(void) with(*__scheduler_lock) {
-	/*paranoid*/ verify( kernelTLS.this_proc_id );
-
-	unsigned iproc = kernelTLS.this_proc_id->id;
-	/*paranoid*/ verify(data[iproc].handle == kernelTLS.this_proc_id);
+	/* paranoid */ verify( ! __preemption_enabled() );
+	/* paranoid */ verify( kernelTLS().this_proc_id );
+
+	unsigned iproc = kernelTLS().this_proc_id->id;
+	/*paranoid*/ verify(data[iproc].handle == kernelTLS().this_proc_id);
 	/*paranoid*/ verify(iproc < ready);
 
@@ -207,8 +210,9 @@
 
 static inline void ready_schedule_unlock(void) with(*__scheduler_lock) {
-	/*paranoid*/ verify( kernelTLS.this_proc_id );
-
-	unsigned iproc = kernelTLS.this_proc_id->id;
-	/*paranoid*/ verify(data[iproc].handle == kernelTLS.this_proc_id);
+	/* paranoid */ verify( ! __preemption_enabled() );
+	/* paranoid */ verify( kernelTLS().this_proc_id );
+
+	unsigned iproc = kernelTLS().this_proc_id->id;
+	/*paranoid*/ verify(data[iproc].handle == kernelTLS().this_proc_id);
 	/*paranoid*/ verify(iproc < ready);
 	/*paranoid*/ verify(data[iproc].lock);
@@ -223,6 +227,7 @@
 #ifdef __CFA_WITH_VERIFY__
 	static inline bool ready_schedule_islocked(void) {
-		/*paranoid*/ verify( kernelTLS.this_proc_id );
-		__processor_id_t * proc = kernelTLS.this_proc_id;
+		/* paranoid */ verify( ! __preemption_enabled() );
+		/*paranoid*/ verify( kernelTLS().this_proc_id );
+		__processor_id_t * proc = kernelTLS().this_proc_id;
 		return __scheduler_lock->data[proc->id].owned;
 	}
Index: libcfa/src/concurrency/locks.cfa
===================================================================
--- libcfa/src/concurrency/locks.cfa	(revision 16ba4a6fa0eb60a0cd27fb96882e37dbf94834dd)
+++ libcfa/src/concurrency/locks.cfa	(revision 18f0b707a42dceef0ea6f927ff708cc6bd645627)
@@ -76,18 +76,19 @@
 
 void lock( blocking_lock & this ) with( this ) {
+	$thread * thrd = active_thread();
 	lock( lock __cfaabi_dbg_ctx2 );
-	if ( owner == kernelTLS.this_thread && !multi_acquisition) {
+	if ( owner == thrd && !multi_acquisition) {
 		fprintf(stderr, "A single acquisition lock holder attempted to reacquire the lock resulting in a deadlock."); // Possibly throw instead
     	exit(EXIT_FAILURE);
-	} else if ( owner != 0p && owner != kernelTLS.this_thread ) {
-		append( blocked_threads, kernelTLS.this_thread );
+	} else if ( owner != 0p && owner != thrd ) {
+		append( blocked_threads, thrd );
 		wait_count++;
 		unlock( lock );
 		park( );
-	} else if ( owner == kernelTLS.this_thread && multi_acquisition ) {
+	} else if ( owner == thrd && multi_acquisition ) {
 		recursion_count++;
 		unlock( lock );
 	} else {
-		owner = kernelTLS.this_thread;
+		owner = thrd;
 		recursion_count = 1;
 		unlock( lock );
@@ -96,11 +97,12 @@
 
 bool try_lock( blocking_lock & this ) with( this ) {
+	$thread * thrd = active_thread();
 	bool ret = false;
 	lock( lock __cfaabi_dbg_ctx2 );
 	if ( owner == 0p ) {
-		owner = kernelTLS.this_thread;
+		owner = thrd;
 		if ( multi_acquisition ) recursion_count = 1;
 		ret = true;
-	} else if ( owner == kernelTLS.this_thread && multi_acquisition ) {
+	} else if ( owner == thrd && multi_acquisition ) {
 		recursion_count++;
 		ret = true;
@@ -113,8 +115,8 @@
 	lock( lock __cfaabi_dbg_ctx2 );
 	if ( owner == 0p ){ // no owner implies lock isn't held
-		fprintf( stderr, "There was an attempt to release a lock that isn't held" ); 
+		fprintf( stderr, "There was an attempt to release a lock that isn't held" );
 		return;
-	} else if ( strict_owner && owner != kernelTLS.this_thread ) {
-		fprintf( stderr, "A thread other than the owner attempted to release an owner lock" ); 
+	} else if ( strict_owner && owner != active_thread() ) { // strict-owner locks may only be released by their owner
+		fprintf( stderr, "A thread other than the owner attempted to release an owner lock" );
 		return;
 	}
@@ -163,7 +165,7 @@
     lock( lock __cfaabi_dbg_ctx2 );
 	if ( owner == 0p ){ // no owner implies lock isn't held
-		fprintf( stderr, "A lock that is not held was passed to a synchronization lock" ); 
-	} else if ( strict_owner && owner != kernelTLS.this_thread ) {
-		fprintf( stderr, "A thread other than the owner of a lock passed it to a synchronization lock" ); 
+		fprintf( stderr, "A lock that is not held was passed to a synchronization lock" );
+	} else if ( strict_owner && owner != active_thread() ) { // strict-owner locks may only be handed over by their owner
+		fprintf( stderr, "A thread other than the owner of a lock passed it to a synchronization lock" );
 	} else {
 		$thread * thrd = pop_head( blocked_threads );
@@ -246,4 +248,5 @@
 				unlock( cond->lock );
 				#if !defined( __CFA_NO_STATISTICS__ )
+					#warning unprotected access to tls TODO discuss this
 					kernelTLS.this_stats = copy->t->curr_cluster->stats;
 				#endif
@@ -338,5 +341,5 @@
 			remove_( *i.lock );
 		}
-		
+
 		unlock( lock );
 		park( ); // blocks here
@@ -374,35 +377,35 @@
 
 	void wait( condition_variable(L) & this ) with(this) {
-		info_thread( L ) i = { kernelTLS.this_thread };
+		info_thread( L ) i = { active_thread() };
 		queue_info_thread( this, i );
 	}
 
 	void wait( condition_variable(L) & this, uintptr_t info ) with(this) {
-		info_thread( L ) i = { kernelTLS.this_thread, info };
+		info_thread( L ) i = { active_thread(), info };
 		queue_info_thread( this, i );
 	}
-	
+
 	void wait( condition_variable(L) & this, Duration duration ) with(this) {
-		info_thread( L ) i = { kernelTLS.this_thread };
+		info_thread( L ) i = { active_thread() };
 		queue_info_thread_timeout(this, i, __kernel_get_time() + duration );
 	}
 
-	void wait( condition_variable(L) & this, uintptr_t info, Duration duration ) with(this) { 
-		info_thread( L ) i = { kernelTLS.this_thread, info };
+	void wait( condition_variable(L) & this, uintptr_t info, Duration duration ) with(this) {
+		info_thread( L ) i = { active_thread(), info };
 		queue_info_thread_timeout(this, i, __kernel_get_time() + duration );
 	}
 
 	void wait( condition_variable(L) & this, Time time ) with(this) {
-		info_thread( L ) i = { kernelTLS.this_thread };
+		info_thread( L ) i = { active_thread() };
 		queue_info_thread_timeout(this, i, time);
 	}
 
 	void wait( condition_variable(L) & this, uintptr_t info, Time time ) with(this) {
-		info_thread( L ) i = { kernelTLS.this_thread, info };
+		info_thread( L ) i = { active_thread(), info };
 		queue_info_thread_timeout(this, i, time);
 	}
 
 	void wait( condition_variable(L) & this, L & l ) with(this) {
-		info_thread(L) i = { kernelTLS.this_thread };
+		info_thread(L) i = { active_thread() };
 		i.lock = &l;
 		queue_info_thread( this, i );
@@ -410,29 +413,29 @@
 
 	void wait( condition_variable(L) & this, L & l, uintptr_t info ) with(this) {
-		info_thread(L) i = { kernelTLS.this_thread, info };
+		info_thread(L) i = { active_thread(), info };
 		i.lock = &l;
 		queue_info_thread( this, i );
 	}
-	
+
 	void wait( condition_variable(L) & this, L & l, Duration duration ) with(this) {
-		info_thread(L) i = { kernelTLS.this_thread };
+		info_thread(L) i = { active_thread() };
 		i.lock = &l;
 		queue_info_thread_timeout(this, i, __kernel_get_time() + duration );
 	}
-	
+
 	void wait( condition_variable(L) & this, L & l, uintptr_t info, Duration duration ) with(this) {
-		info_thread(L) i = { kernelTLS.this_thread, info };
+		info_thread(L) i = { active_thread(), info };
 		i.lock = &l;
 		queue_info_thread_timeout(this, i, __kernel_get_time() + duration );
 	}
-	
+
 	void wait( condition_variable(L) & this, L & l, Time time ) with(this) {
-		info_thread(L) i = { kernelTLS.this_thread };
+		info_thread(L) i = { active_thread() };
 		i.lock = &l;
 		queue_info_thread_timeout(this, i, time );
 	}
-	
+
 	void wait( condition_variable(L) & this, L & l, uintptr_t info, Time time ) with(this) {
-		info_thread(L) i = { kernelTLS.this_thread, info };
+		info_thread(L) i = { active_thread(), info };
 		i.lock = &l;
 		queue_info_thread_timeout(this, i, time );
Index: libcfa/src/concurrency/monitor.cfa
===================================================================
--- libcfa/src/concurrency/monitor.cfa	(revision 16ba4a6fa0eb60a0cd27fb96882e37dbf94834dd)
+++ libcfa/src/concurrency/monitor.cfa	(revision 18f0b707a42dceef0ea6f927ff708cc6bd645627)
@@ -82,8 +82,8 @@
 // Enter single monitor
 static void __enter( $monitor * this, const __monitor_group_t & group ) {
+	$thread * thrd = active_thread();
+
 	// Lock the monitor spinlock
 	lock( this->lock __cfaabi_dbg_ctx2 );
-	// Interrupts disable inside critical section
-	$thread * thrd = kernelTLS.this_thread;
 
 	__cfaabi_dbg_print_safe( "Kernel : %10p Entering mon %p (%p)\n", thrd, this, this->owner);
@@ -126,5 +126,5 @@
 		__cfaabi_dbg_print_safe( "Kernel : %10p Entered  mon %p\n", thrd, this);
 
-		/* paranoid */ verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+		/* paranoid */ verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
 		return;
 	}
@@ -132,5 +132,5 @@
 	__cfaabi_dbg_print_safe( "Kernel : %10p Entered  mon %p\n", thrd, this);
 
-	/* paranoid */ verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+	/* paranoid */ verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
 	/* paranoid */ verify( this->lock.lock );
 
@@ -141,8 +141,8 @@
 
 static void __dtor_enter( $monitor * this, fptr_t func, bool join ) {
+	$thread * thrd = active_thread();
+
 	// Lock the monitor spinlock
 	lock( this->lock __cfaabi_dbg_ctx2 );
-	// Interrupts disable inside critical section
-	$thread * thrd = kernelTLS.this_thread;
 
 	__cfaabi_dbg_print_safe( "Kernel : %10p Entering dtor for mon %p (%p)\n", thrd, this, this->owner);
@@ -155,5 +155,5 @@
 		__set_owner( this, thrd );
 
-		verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+		verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
 
 		unlock( this->lock );
@@ -174,5 +174,5 @@
 		this->owner = thrd;
 
-		verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+		verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
 
 		unlock( this->lock );
@@ -200,5 +200,5 @@
 
 		// Release the next thread
-		/* paranoid */ verifyf( urgent->owner->waiting_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+		/* paranoid */ verifyf( urgent->owner->waiting_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
 		unpark( urgent->owner->waiting_thread );
 
@@ -207,5 +207,5 @@
 
 		// Some one was waiting for us, enter
-		/* paranoid */ verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+		/* paranoid */ verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
 	}
 	else {
@@ -224,5 +224,5 @@
 		park();
 
-		/* paranoid */ verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+		/* paranoid */ verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
 		return;
 	}
@@ -237,7 +237,7 @@
 	lock( this->lock __cfaabi_dbg_ctx2 );
 
-	__cfaabi_dbg_print_safe( "Kernel : %10p Leaving mon %p (%p)\n", kernelTLS.this_thread, this, this->owner);
-
-	/* paranoid */ verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+	__cfaabi_dbg_print_safe( "Kernel : %10p Leaving mon %p (%p)\n", active_thread(), this, this->owner);
+
+	/* paranoid */ verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
 
 	// Leaving a recursion level, decrement the counter
@@ -270,6 +270,6 @@
 void __dtor_leave( $monitor * this, bool join ) {
 	__cfaabi_dbg_debug_do(
-		if( TL_GET( this_thread ) != this->owner ) {
-			abort( "Destroyed monitor %p has inconsistent owner, expected %p got %p.\n", this, TL_GET( this_thread ), this->owner);
+		if( active_thread() != this->owner ) {
+			abort( "Destroyed monitor %p has inconsistent owner, expected %p got %p.\n", this, active_thread(), this->owner);
 		}
 		if( this->recursion != 1  && !join ) {
@@ -287,5 +287,5 @@
 	/* paranoid */ verify( this->lock.lock );
 	/* paranoid */ verifyf( thrd == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", thrd, this->owner, this->recursion, this );
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	/* paranoid */ verify( thrd->state == Halted );
 	/* paranoid */ verify( this->recursion == 1 );
@@ -303,5 +303,5 @@
 	// Unpark the next owner if needed
 	/* paranoid */ verifyf( !new_owner || new_owner == this->owner, "Expected owner to be %p, got %p (m: %p)", new_owner, this->owner, this );
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	/* paranoid */ verify( thrd->state == Halted );
 	unpark( new_owner );
@@ -327,5 +327,5 @@
 // Sorts monitors before entering
 void ?{}( monitor_guard_t & this, $monitor * m [], __lock_size_t count, fptr_t func ) {
-	$thread * thrd = TL_GET( this_thread );
+	$thread * thrd = active_thread();
 
 	// Store current array
@@ -362,5 +362,5 @@
 
 	// Restore thread context
-	TL_GET( this_thread )->monitors = this.prev;
+	active_thread()->monitors = this.prev;
 }
 
@@ -369,5 +369,5 @@
 void ?{}( monitor_dtor_guard_t & this, $monitor * m [], fptr_t func, bool join ) {
 	// optimization
-	$thread * thrd = TL_GET( this_thread );
+	$thread * thrd = active_thread();
 
 	// Store current array
@@ -392,5 +392,5 @@
 
 	// Restore thread context
-	TL_GET( this_thread )->monitors = this.prev;
+	active_thread()->monitors = this.prev;
 }
 
@@ -432,5 +432,5 @@
 
 	// Create the node specific to this wait operation
-	wait_ctx( TL_GET( this_thread ), user_info );
+	wait_ctx( active_thread(), user_info );
 
 	// Append the current wait operation to the ones already queued on the condition
@@ -483,5 +483,5 @@
 	//Some more checking in debug
 	__cfaabi_dbg_debug_do(
-		$thread * this_thrd = TL_GET( this_thread );
+		$thread * this_thrd = active_thread();
 		if ( this.monitor_count != this_thrd->monitors.size ) {
 			abort( "Signal on condition %p made with different number of monitor(s), expected %zi got %zi", &this, this.monitor_count, this_thrd->monitors.size );
@@ -531,5 +531,5 @@
 
 	// Create the node specific to this wait operation
-	wait_ctx_primed( kernelTLS.this_thread, 0 )
+	wait_ctx_primed( active_thread(), 0 )
 
 	//save contexts
@@ -630,5 +630,5 @@
 
 				// Create the node specific to this wait operation
-				wait_ctx_primed( kernelTLS.this_thread, 0 );
+				wait_ctx_primed( active_thread(), 0 );
 
 				// Save monitor states
@@ -682,5 +682,5 @@
 
 	// Create the node specific to this wait operation
-	wait_ctx_primed( kernelTLS.this_thread, 0 );
+	wait_ctx_primed( active_thread(), 0 );
 
 	monitor_save;
@@ -688,5 +688,5 @@
 
 	for( __lock_size_t i = 0; i < count; i++) {
-		verify( monitors[i]->owner == kernelTLS.this_thread );
+		verify( monitors[i]->owner == active_thread() );
 	}
 
@@ -724,10 +724,10 @@
 static inline void __set_owner( $monitor * monitors [], __lock_size_t count, $thread * owner ) {
 	/* paranoid */ verify ( monitors[0]->lock.lock );
-	/* paranoid */ verifyf( monitors[0]->owner == kernelTLS.this_thread, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, monitors[0]->owner, monitors[0]->recursion, monitors[0] );
+	/* paranoid */ verifyf( monitors[0]->owner == active_thread(), "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), monitors[0]->owner, monitors[0]->recursion, monitors[0] );
 	monitors[0]->owner        = owner;
 	monitors[0]->recursion    = 1;
 	for( __lock_size_t i = 1; i < count; i++ ) {
 		/* paranoid */ verify ( monitors[i]->lock.lock );
-		/* paranoid */ verifyf( monitors[i]->owner == kernelTLS.this_thread, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, monitors[i]->owner, monitors[i]->recursion, monitors[i] );
+		/* paranoid */ verifyf( monitors[i]->owner == active_thread(), "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), monitors[i]->owner, monitors[i]->recursion, monitors[i] );
 		monitors[i]->owner        = owner;
 		monitors[i]->recursion    = 0;
@@ -755,5 +755,5 @@
 		//regardless of if we are ready to baton pass,
 		//we need to set the monitor as in use
-		/* paranoid */ verifyf( !this->owner || kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+		/* paranoid */ verifyf( !this->owner || active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
 		__set_owner( this,  urgent->owner->waiting_thread );
 
@@ -764,5 +764,5 @@
 	// Get the next thread in the entry_queue
 	$thread * new_owner = pop_head( this->entry_queue );
-	/* paranoid */ verifyf( !this->owner || kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+	/* paranoid */ verifyf( !this->owner || active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
 	/* paranoid */ verify( !new_owner || new_owner->link.next == 0p );
 	__set_owner( this, new_owner );
@@ -892,5 +892,5 @@
 
 static inline void brand_condition( condition & this ) {
-	$thread * thrd = TL_GET( this_thread );
+	$thread * thrd = active_thread();
 	if( !this.monitors ) {
 		// __cfaabi_dbg_print_safe( "Branding\n" );
Index: libcfa/src/concurrency/mutex.cfa
===================================================================
--- libcfa/src/concurrency/mutex.cfa	(revision 16ba4a6fa0eb60a0cd27fb96882e37dbf94834dd)
+++ libcfa/src/concurrency/mutex.cfa	(revision 18f0b707a42dceef0ea6f927ff708cc6bd645627)
@@ -40,5 +40,5 @@
 	lock( lock __cfaabi_dbg_ctx2 );
 	if( is_locked ) {
-		append( blocked_threads, kernelTLS.this_thread );
+		append( blocked_threads, active_thread() );
 		unlock( lock );
 		park();
@@ -86,14 +86,14 @@
 	lock( lock __cfaabi_dbg_ctx2 );
 	if( owner == 0p ) {
-		owner = kernelTLS.this_thread;
+		owner = active_thread();
 		recursion_count = 1;
 		unlock( lock );
 	}
-	else if( owner == kernelTLS.this_thread ) {
+	else if( owner == active_thread() ) {
 		recursion_count++;
 		unlock( lock );
 	}
 	else {
-		append( blocked_threads, kernelTLS.this_thread );
+		append( blocked_threads, active_thread() );
 		unlock( lock );
 		park();
@@ -105,9 +105,9 @@
 	lock( lock __cfaabi_dbg_ctx2 );
 	if( owner == 0p ) {
-		owner = kernelTLS.this_thread;
+		owner = active_thread();
 		recursion_count = 1;
 		ret = true;
 	}
-	else if( owner == kernelTLS.this_thread ) {
+	else if( owner == active_thread() ) {
 		recursion_count++;
 		ret = true;
@@ -159,5 +159,5 @@
 void wait(condition_variable & this) {
 	lock( this.lock __cfaabi_dbg_ctx2 );
-	append( this.blocked_threads, kernelTLS.this_thread );
+	append( this.blocked_threads, active_thread() );
 	unlock( this.lock );
 	park();
@@ -167,5 +167,5 @@
 void wait(condition_variable & this, L & l) {
 	lock( this.lock __cfaabi_dbg_ctx2 );
-	append( this.blocked_threads, kernelTLS.this_thread );
+	append( this.blocked_threads, active_thread() );
 	unlock(l);
 	unlock(this.lock);
Index: libcfa/src/concurrency/preemption.cfa
===================================================================
--- libcfa/src/concurrency/preemption.cfa	(revision 16ba4a6fa0eb60a0cd27fb96882e37dbf94834dd)
+++ libcfa/src/concurrency/preemption.cfa	(revision 18f0b707a42dceef0ea6f927ff708cc6bd645627)
@@ -10,6 +10,6 @@
 // Created On       : Mon Jun 5 14:20:42 2017
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Wed Aug 26 16:46:03 2020
-// Update Count     : 53
+// Last Modified On : Fri Nov  6 07:42:13 2020
+// Update Count     : 54
 //
 
@@ -163,11 +163,97 @@
 // Kernel Signal Tools
 //=============================================================================================
-
-__cfaabi_dbg_debug_do( static thread_local void * last_interrupt = 0; )
+// In a user-level threading system, there are a handful of thread-local variables where this problem occurs on the ARM.
+//
+// For each kernel thread running user-level threads, there is a flag variable to indicate if interrupts are
+// enabled/disabled for that kernel thread. Therefore, this variable is made thread local.
+//
+// For example, this code fragment sets the state of the "interrupt" variable in thread-local memory.
+//
+// _Thread_local volatile int interrupts;
+// int main() {
+//     interrupts = 0; // disable interrupts }
+//
+// which generates the following code on the ARM
+//
+// (gdb) disassemble main
+// Dump of assembler code for function main:
+//    0x0000000000000610 <+0>:	mrs	x1, tpidr_el0
+//    0x0000000000000614 <+4>:	mov	w0, #0x0                   	// #0
+//    0x0000000000000618 <+8>:	add	x1, x1, #0x0, lsl #12
+//    0x000000000000061c <+12>:	add	x1, x1, #0x10
+//    0x0000000000000620 <+16>:	str	wzr, [x1]
+//    0x0000000000000624 <+20>:	ret
+//
+// The mrs moves a pointer from coprocessor register tpidr_el0 into register x1.  Register w0 is set to 0. The two adds
+// increase the TLS pointer by the displacement (offset) 0x10, which is the location in the TLS of variable
+// "interrupts".  Finally, 0 is stored into "interrupts" through the pointer in register x1 that points into the
+// TLS. Now once x1 has the pointer to the location of the TLS for kernel thread N, the user thread can be preempted at
+// user-level and put on the user-level ready-queue. When the preempted thread gets to the front of
+// the user-level ready-queue it is run on kernel thread M. It now stores 0 into "interrupts" back on kernel thread N,
+// turning off interrupts on the wrong kernel thread.
+//
+// On the x86, the following code is generated for the same code fragment.
+//
+// (gdb) disassemble main
+// Dump of assembler code for function main:
+//    0x0000000000400420 <+0>:	movl   $0x0,%fs:0xfffffffffffffffc
+//    0x000000000040042c <+12>:	xor    %eax,%eax
+//    0x000000000040042e <+14>:	retq
+//
+// and there is base-displacement addressing used to atomically reset variable "interrupts" off of the TLS pointer in
+// register "fs".
+//
+// Hence, the ARM has base-displacement addressing for the general purpose registers, BUT not for the coprocessor
+// registers. As a result, generating the address for the write into variable "interrupts" is no longer atomic.
+//
+// Note this problem does NOT occur when just using multiple kernel threads because the preemption ALWAYS restarts the
+// thread on the same kernel thread.
+//
+// The obvious question is why does ARM use a coprocessor register to store the TLS pointer given that coprocessor
+// registers are second-class registers with respect to the instruction set. One possible answer is that they did not
+// want to dedicate one of the general registers to hold the TLS pointer and there was a free coprocessor register
+// available.
+
+//----------
+// Returns the preemption_state.enabled flag read from __cfaabi_tls; special-cased (own asm region "check") since used often
+bool __preemption_enabled() {
+	// emit the assembler label opening the "check" region that sigHandler_ctxSwitch tests the interrupted ip against
+	// the "memory" clobber keeps the compiler from moving memory accesses across the label
+	asm volatile("__cfaasm_check_before:":::"memory");
+
+	// access tls as normal; the read sits between the two labels
+	bool enabled = __cfaabi_tls.preemption_state.enabled;
+
+	// emit the assembler label closing the "check" region
+	// the "memory" clobber keeps the compiler from moving memory accesses across the label
+	asm volatile("__cfaasm_check_after:":::"memory");
+	return enabled;
+}
+
+//----------
+// Returns the uintptr_t-sized value located `offset` bytes past the address of __cfaabi_tls
+uintptr_t __cfatls_get( unsigned long int offset ) __attribute__((__noinline__)); // noinline, presumably so the asm labels below are emitted only once -- TODO confirm
+uintptr_t __cfatls_get( unsigned long int offset ) {
+	// emit the assembler label opening the "get" region that sigHandler_ctxSwitch tests the interrupted ip against
+	// the "memory" clobber keeps the compiler from moving memory accesses across the label
+	asm volatile("__cfaasm_get_before:":::"memory");
+
+	// access tls as normal (except for pointer arithmetic): offset is added to the address in bytes
+	uintptr_t val = *(uintptr_t*)((uintptr_t)&__cfaabi_tls + offset);
+
+	// emit the assembler label closing the "get" region
+	// the "memory" clobber keeps the compiler from moving memory accesses across the label
+	asm volatile("__cfaasm_get_after:":::"memory");
+	return val;
+}
 
 extern "C" {
 	// Disable interrupts by incrementing the counter
 	void disable_interrupts() {
-		with( kernelTLS.preemption_state ) {
+		// create a assembler label before
+		// marked as clobber all to avoid movement
+		asm volatile("__cfaasm_disable_before:":::"memory");
+
+		with( __cfaabi_tls.preemption_state ) {
 			#if GCC_VERSION > 50000
 			static_assert(__atomic_always_lock_free(sizeof(enabled), &enabled), "Must be lock-free");
@@ -186,4 +272,8 @@
 			verify( new_val < 65_000u );              // If this triggers someone is disabling interrupts without enabling them
 		}
+
+		// create a assembler label after
+		// marked as clobber all to avoid movement
+		asm volatile("__cfaasm_disable_after:":::"memory");
 	}
 
@@ -191,8 +281,12 @@
 	// If counter reaches 0, execute any pending __cfactx_switch
 	void enable_interrupts( __cfaabi_dbg_ctx_param ) {
-		processor   * proc = kernelTLS.this_processor; // Cache the processor now since interrupts can start happening after the atomic store
+		// create a assembler label before
+		// marked as clobber all to avoid movement
+		asm volatile("__cfaasm_enable_before:":::"memory");
+
+		processor   * proc = __cfaabi_tls.this_processor; // Cache the processor now since interrupts can start happening after the atomic store
 		/* paranoid */ verify( proc );
 
-		with( kernelTLS.preemption_state ){
+		with( __cfaabi_tls.preemption_state ){
 			unsigned short prev = disable_count;
 			disable_count -= 1;
@@ -221,4 +315,8 @@
 		// For debugging purposes : keep track of the last person to enable the interrupts
 		__cfaabi_dbg_debug_do( proc->last_enable = caller; )
+
+		// create a assembler label after
+		// marked as clobber all to avoid movement
+		asm volatile("__cfaasm_enable_after:":::"memory");
 	}
 
@@ -226,19 +324,27 @@
 	// Don't execute any pending __cfactx_switch even if counter reaches 0
 	void enable_interrupts_noPoll() {
-		unsigned short prev = kernelTLS.preemption_state.disable_count;
-		kernelTLS.preemption_state.disable_count -= 1;
+		// create a assembler label before
+		// marked as clobber all to avoid movement
+		asm volatile("__cfaasm_nopoll_before:":::"memory");
+
+		unsigned short prev = __cfaabi_tls.preemption_state.disable_count;
+		__cfaabi_tls.preemption_state.disable_count -= 1;
 		verifyf( prev != 0u, "Incremented from %u\n", prev );                     // If this triggers someone is enabled already enabled interrupts
 		if( prev == 1 ) {
 			#if GCC_VERSION > 50000
-			static_assert(__atomic_always_lock_free(sizeof(kernelTLS.preemption_state.enabled), &kernelTLS.preemption_state.enabled), "Must be lock-free");
+			static_assert(__atomic_always_lock_free(sizeof(__cfaabi_tls.preemption_state.enabled), &__cfaabi_tls.preemption_state.enabled), "Must be lock-free");
 			#endif
 			// Set enabled flag to true
 			// should be atomic to avoid preemption in the middle of the operation.
 			// use memory order RELAXED since there is no inter-thread on this variable requirements
-			__atomic_store_n(&kernelTLS.preemption_state.enabled, true, __ATOMIC_RELAXED);
+			__atomic_store_n(&__cfaabi_tls.preemption_state.enabled, true, __ATOMIC_RELAXED);
 
 			// Signal the compiler that a fence is needed but only for signal handlers
 			__atomic_signal_fence(__ATOMIC_RELEASE);
 		}
+
+		// create a assembler label after
+		// marked as clobber all to avoid movement
+		asm volatile("__cfaasm_nopoll_after:":::"memory");
 	}
 }
@@ -275,5 +381,5 @@
 static void timeout( $thread * this ) {
 	#if !defined( __CFA_NO_STATISTICS__ )
-		kernelTLS.this_stats = this->curr_cluster->stats;
+		kernelTLS().this_stats = this->curr_cluster->stats;
 	#endif
 	unpark( this );
@@ -286,8 +392,8 @@
 static inline bool preemption_ready() {
 	// Check if preemption is safe
-	bool ready = kernelTLS.preemption_state.enabled && ! kernelTLS.preemption_state.in_progress;
+	bool ready = __cfaabi_tls.preemption_state.enabled && ! __cfaabi_tls.preemption_state.in_progress;
 
 	// Adjust the pending flag accordingly
-	kernelTLS.this_processor->pending_preemption = !ready;
+	__cfaabi_tls.this_processor->pending_preemption = !ready;
 	return ready;
 }
@@ -303,6 +409,6 @@
 
 	// Start with preemption disabled until ready
-	kernelTLS.preemption_state.enabled = false;
-	kernelTLS.preemption_state.disable_count = 1;
+	__cfaabi_tls.preemption_state.enabled = false;
+	__cfaabi_tls.preemption_state.disable_count = 1;
 
 	// Initialize the event kernel
@@ -362,9 +468,51 @@
 // Kernel Signal Handlers
 //=============================================================================================
+struct asm_region { // pair of code addresses delimiting a labelled region; produced by __cfaasm_label
+	void * before; // address of the __cfaasm_<label>_before label
+	void * after; // address of the __cfaasm_<label>_after label
+};
+
+//-----------------------------------------------------------------------------
+// Some assembly required
+#if defined( __i386 )
+	#define __cfaasm_label( label ) /* yields an asm_region { &__cfaasm_<label>_before, &__cfaasm_<label>_after } */ \
+		({ \
+			struct asm_region region; \
+			asm( \
+				"movl $__cfaasm_" #label "_before, %[vb]\n\t" \
+				"movl $__cfaasm_" #label "_after , %[va]\n\t" \
+				 : [vb]"=r"(region.before), [va]"=r"(region.after) /* one distinct output operand per label address */ \
+			); \
+			region; \
+		});
+#elif defined( __x86_64 )
+	#ifdef __PIC__
+		#define PLT "@PLT"
+	#else
+		#define PLT ""
+	#endif
+	#define __cfaasm_label( label ) \
+		({ \
+			struct asm_region region; \
+			asm( \
+				"movq $__cfaasm_" #label "_before" PLT ", %[vb]\n\t" \
+				"movq $__cfaasm_" #label "_after"  PLT ", %[va]\n\t" \
+				 : [vb]"=r"(region.before), [va]"=r"(region.after) \
+			); \
+			region; \
+		});
+#elif defined( __aarch64__ )
+	#error __cfaasm_label undefined for arm
+#else
+	#error unknown hardware architecture
+#endif
+
+__cfaabi_dbg_debug_do( static thread_local void * last_interrupt = 0; )
 
 // Context switch signal handler
 // Receives SIGUSR1 signal and causes the current thread to yield
 static void sigHandler_ctxSwitch( __CFA_SIGPARMS__ ) {
-	__cfaabi_dbg_debug_do( last_interrupt = (void *)(cxt->uc_mcontext.CFA_REG_IP); )
+	void * ip = (void *)(cxt->uc_mcontext.CFA_REG_IP);
+	__cfaabi_dbg_debug_do( last_interrupt = ip; )
 
 	// SKULLDUGGERY: if a thread creates a processor and the immediately deletes it,
@@ -372,9 +520,9 @@
 	// before the kernel thread has even started running. When that happens, an interrupt
 	// with a null 'this_processor' will be caught, just ignore it.
-	if(! kernelTLS.this_processor ) return;
+	if(! __cfaabi_tls.this_processor ) return;
 
 	choose(sfp->si_value.sival_int) {
 		case PREEMPT_NORMAL   : ;// Normal case, nothing to do here
-		case PREEMPT_TERMINATE: verify( __atomic_load_n( &kernelTLS.this_processor->do_terminate, __ATOMIC_SEQ_CST ) );
+		case PREEMPT_TERMINATE: verify( __atomic_load_n( &__cfaabi_tls.this_processor->do_terminate, __ATOMIC_SEQ_CST ) );
 		default:
 			abort( "internal error, signal value is %d", sfp->si_value.sival_int );
@@ -384,8 +532,15 @@
 	if( !preemption_ready() ) { return; }
 
-	__cfaabi_dbg_print_buffer_decl( " KERNEL: preempting core %p (%p @ %p).\n", kernelTLS.this_processor, kernelTLS.this_thread, (void *)(cxt->uc_mcontext.CFA_REG_IP) );
+	struct asm_region region;
+	region = __cfaasm_label( get     ); if( ip >= region.before && ip <= region.after ) return;
+	region = __cfaasm_label( check   ); if( ip >= region.before && ip <= region.after ) return;
+	region = __cfaasm_label( disable ); if( ip >= region.before && ip <= region.after ) return;
+	region = __cfaasm_label( enable  ); if( ip >= region.before && ip <= region.after ) return;
+	region = __cfaasm_label( nopoll  ); if( ip >= region.before && ip <= region.after ) return;
+
+	__cfaabi_dbg_print_buffer_decl( " KERNEL: preempting core %p (%p @ %p).\n", __cfaabi_tls.this_processor, __cfaabi_tls.this_thread, (void *)(cxt->uc_mcontext.CFA_REG_IP) );
 
 	// Sync flag : prevent recursive calls to the signal handler
-	kernelTLS.preemption_state.in_progress = true;
+	__cfaabi_tls.preemption_state.in_progress = true;
 
 	// Clear sighandler mask before context switching.
@@ -397,7 +552,6 @@
 	}
 
-	// TODO: this should go in finish action
 	// Clear the in progress flag
-	kernelTLS.preemption_state.in_progress = false;
+	__cfaabi_tls.preemption_state.in_progress = false;
 
 	// Preemption can occur here
@@ -416,5 +570,5 @@
 	id.full_proc = false;
 	id.id = doregister(&id);
-	kernelTLS.this_proc_id = &id;
+	__cfaabi_tls.this_proc_id = &id;
 
 	// Block sigalrms to control when they arrive
@@ -484,5 +638,5 @@
 
 void __cfaabi_check_preemption() {
-	bool ready = kernelTLS.preemption_state.enabled;
+	bool ready = __preemption_enabled();
 	if(!ready) { abort("Preemption should be ready"); }
 
@@ -507,5 +661,5 @@
 #ifdef __CFA_WITH_VERIFY__
 bool __cfaabi_dbg_in_kernel() {
-	return !kernelTLS.preemption_state.enabled;
+	return !__preemption_enabled();
 }
 #endif
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision 16ba4a6fa0eb60a0cd27fb96882e37dbf94834dd)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision 18f0b707a42dceef0ea6f927ff708cc6bd645627)
@@ -150,5 +150,5 @@
 //  queues or removing them.
 uint_fast32_t ready_mutate_lock( void ) with(*__scheduler_lock) {
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 
 	// Step 1 : lock global lock
@@ -166,10 +166,10 @@
 	}
 
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	return s;
 }
 
 void ready_mutate_unlock( uint_fast32_t last_s ) with(*__scheduler_lock) {
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 
 	// Step 1 : release local locks
@@ -188,5 +188,5 @@
 	__atomic_store_n(&lock, (bool)false, __ATOMIC_RELEASE);
 
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 }
 
@@ -252,5 +252,5 @@
 		preferred =
 			//*
-			kernelTLS.this_processor ? kernelTLS.this_processor->id * 4 : -1;
+			kernelTLS().this_processor ? kernelTLS().this_processor->id * 4 : -1;
 			/*/
 			thrd->link.preferred * 4;
@@ -331,5 +331,5 @@
 		// Don't bother trying locally too much
 		int local_tries = 8;
-		preferred = kernelTLS.this_processor->id * 4;
+		preferred = kernelTLS().this_processor->id * 4;
 	#endif
 
Index: libcfa/src/stdlib.hfa
===================================================================
--- libcfa/src/stdlib.hfa	(revision 16ba4a6fa0eb60a0cd27fb96882e37dbf94834dd)
+++ libcfa/src/stdlib.hfa	(revision 18f0b707a42dceef0ea6f927ff708cc6bd645627)
@@ -101,4 +101,23 @@
 		return (T *)pvalloc( sizeof(T) );				// C pvalloc
 	} // pvalloc
+
+	void free( T * addr ) {
+		free( (void *) addr ); 							// C free
+	} // free
+} // distribution
+
+static inline forall( ttype TT | { void free( TT ); } ) {
+	// T* does not take void* and vice-versa
+
+	void free( void * addr, TT rest ) {
+		free( addr );
+		free( rest );
+	} // free
+
+	forall( dtype T | sized(T) )
+	void free( T * addr, TT rest ) {
+		free( addr );
+		free( rest );
+	} // free
 } // distribution
 
Index: tests/alloc.cfa
===================================================================
--- tests/alloc.cfa	(revision 16ba4a6fa0eb60a0cd27fb96882e37dbf94834dd)
+++ tests/alloc.cfa	(revision 18f0b707a42dceef0ea6f927ff708cc6bd645627)
@@ -208,5 +208,14 @@
 
 
+	int const_count, dest_count;
 	struct Struct { int x; double y; };
+	void  ?{}( Struct & a ) {					// construct
+		a.[ x, y ] = [ -1, -1.0 ];
+	}
+	void  ?{}( Struct & a, int x, double y ) {	// initialize
+		a.[ x, y ] = [ x, y ];
+		const_count++;
+	}
+	void ^?{}( Struct & a ) {  dest_count++; }	// destruct
 	Struct st, st1, sta[dim], sta1[dim], * stp, * stp1;
 
@@ -329,18 +338,27 @@
 	printf( "\n" );
 
+	const_count = dest_count = 0;
 	stp = new( 42, 42.5 );
+	assert( const_count == 1 && dest_count == 0 );						// assertion for testing
 	stp1 = new( 42, 42.5 );
+	assert( const_count == 2 && dest_count == 0 );						// assertion for testing
+
 	printf( "CFA new initialize\n%d %g %d %g\n", stp->x, stp->y, stp1->x, stp1->y );
 	delete( stp, stp1 );
+	assert( const_count == 2 && dest_count == 2 );						// assertion for testing
 
 	// new, array types
 	stp = anew( dim, 42, 42.5 );
+	assert( const_count == 2 + dim && dest_count == 2 );				// assertion for testing
 	printf( "CFA array new initialize\n" );
 	for ( i; dim ) { printf( "%d %g, ", stp[i].x, stp[i].y ); }
 	printf( "\n" );
+
 	stp1 = anew( dim, 42, 42.5 );
+	assert( const_count == 2 + 2 * dim && dest_count == 2 );			// assertion for testing
 	for ( i; dim ) { printf( "%d %g, ", stp1[i].x, stp1[i].y ); }
 	printf( "\n" );
 	adelete( stp, stp1 );
+	assert( const_count == 2 + 2 * dim && dest_count == 2 + 2 * dim);	// assertion for testing
 
 	// extras
@@ -354,5 +372,8 @@
 	*ip = 0xdeadbeef;
 	printf( "CFA deep malloc %#x\n", *ip );
-	free( ip );
+
+	dp = alloc(5.0`fill); // just for testing multiple free
+	assert(*dp == 5.0);
+	free( ip, dp );
 
 #ifdef ERR1
Index: tests/malloc.cfa
===================================================================
--- tests/malloc.cfa	(revision 16ba4a6fa0eb60a0cd27fb96882e37dbf94834dd)
+++ tests/malloc.cfa	(revision 18f0b707a42dceef0ea6f927ff708cc6bd645627)
@@ -319,5 +319,5 @@
 	free(ip);
 
-	free( 0p ); // sanity check
+	free( (void*) 0p ); // sanity check
 	free( NULL ); // sanity check
 
Index: tools/stat.py
===================================================================
--- tools/stat.py	(revision 16ba4a6fa0eb60a0cd27fb96882e37dbf94834dd)
+++ tools/stat.py	(revision 18f0b707a42dceef0ea6f927ff708cc6bd645627)
@@ -1,3 +1,3 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 
 import sys
@@ -17,5 +17,5 @@
 		avg = numpy.mean  (content)
 		std = numpy.std   (content)
-		print "median {0:.1f} avg {1:.1f} stddev {2:.1f}".format( med, avg, std )
+		print("median {0:.1f} avg {1:.1f} stddev {2:.1f}".format( med, avg, std ))