Index: benchmark/readyQ/bench.go
===================================================================
--- benchmark/readyQ/bench.go	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
+++ benchmark/readyQ/bench.go	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -0,0 +1,74 @@
+package main
+
+import (
+	"bufio"
+	"flag"
+	"fmt"
+	"os"
+	"runtime"
+	"sync/atomic"
+	"time"
+)
+
+var clock_mode bool
+var threads_left int64
+var stop int32
+var duration float64
+var stop_count uint64
+var nprocs int
+var nthreads int
+
+func fflush(f *bufio.Writer) { // emit a carriage return and push it out immediately
+	f.WriteString("\r")
+	f.Flush()
+}
+
+func wait(start time.Time, is_tty bool) { // blocks until the run is over; prints progress when is_tty
+	f := bufio.NewWriter(os.Stdout)
+	tdur := time.Duration(duration * float64(time.Second)) // scale before converting so fractional seconds survive
+	for { // poll every 100ms until the stop condition of the active mode holds
+		time.Sleep(100 * time.Millisecond)
+		end := time.Now()
+		delta := end.Sub(start)
+		if is_tty {
+			fmt.Printf(" %.1f", delta.Seconds())
+			fflush(f)
+		}
+		if clock_mode && delta >= tdur { // timed run: requested duration has elapsed
+			break
+		} else if !clock_mode && atomic.LoadInt64(&threads_left) == 0 { // counted run: every worker reported done
+			break
+		}
+	}
+}
+
+// bench_init parses the common benchmark flags into the package-level settings.
+func bench_init() {
+	procsFlag := flag.Int("p", 1, "The number of processors")
+	threadsFlag := flag.Int("t", 1, "The number of threads")
+	secondsFlag := flag.Float64("d", 0, "Duration of the experiment in seconds")
+	itersFlag := flag.Uint64("i", 0, "Duration of the experiment in iterations")
+	flag.Parse()
+
+	nprocs, nthreads = *procsFlag, *threadsFlag
+	duration, stop_count = *secondsFlag, *itersFlag
+
+	// Pick the stop criterion: wall-clock time or iteration count, never both.
+	switch {
+	case duration > 0 && stop_count > 0:
+		panic("--duration and --iterations cannot be used together\n")
+	case duration > 0:
+		clock_mode = true
+		stop_count = 0xFFFFFFFFFFFFFFFF
+		fmt.Printf("Running for %f seconds\n", duration)
+	case stop_count > 0:
+		clock_mode = false
+		fmt.Printf("Running for %d iterations\n", stop_count)
+	default:
+		duration = 5
+		clock_mode = true
+		fmt.Printf("Running for %f seconds\n", duration)
+	}
+
+	runtime.GOMAXPROCS(nprocs)
+}
Index: benchmark/readyQ/cycle.cfa
===================================================================
--- benchmark/readyQ/cycle.cfa	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ benchmark/readyQ/cycle.cfa	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -1,21 +1,34 @@
 #include "rq_bench.hfa"
 
-thread Partner {
-	Partner * partner;
+struct Partner {
 	unsigned long long count;
+	unsigned long long blocks;
+	bench_sem self;
+	bench_sem * next;
 };
 
 void ?{}( Partner & this ) {
-	((thread&)this){ bench_cluster };
+	this.count = this.blocks = 0;
 }
 
-void main( Partner & this ) {
-	this.count = 0;
+thread BThrd {	// benchmark thread; its counters and semaphores live in the referenced Partner
+	Partner & partner;
+};
+
+void ?{}( BThrd & this, Partner * partner ) {	// builds a benchmark thread bound to the given Partner
+	((thread&)this){ bench_cluster };	// construct the thread part, passing bench_cluster
+	&this.partner = partner;	// bind the reference member to the caller's Partner
+}
+
+void ^?{}( BThrd & mutex this ) {}
+
+void main( BThrd & thrd ) with(thrd.partner) {
+	count = 0;
 	for() {
-		park();
-		unpark( *this.partner );
-		this.count ++;
+		blocks += wait( self );
+		post( *next );
+		count ++;
 		if( clock_mode && stop) break;
-		if(!clock_mode && this.count >= stop_count) break;
+		if(!clock_mode && count >= stop_count) break;
 	}
 
@@ -33,4 +46,5 @@
 	{
 		unsigned long long global_counter = 0;
+		unsigned long long global_blocks  = 0;
 		unsigned tthreads = nthreads * ring_size;
 		Time start, end;
@@ -38,8 +52,13 @@
 		{
 			threads_left = tthreads;
-			Partner threads[tthreads];
+			BThrd * threads[tthreads];
+			Partner thddata[tthreads];
 			for(i; tthreads) {
 				unsigned pi = (i + nthreads) % tthreads;
-				threads[i].partner = &threads[pi];
+				thddata[i].next = &thddata[pi].self;
+			}
+			for(int i = 0; i < tthreads; i++) {
+				threads[i] = malloc();
+				(*threads[i]){ &thddata[i] };
 			}
 			printf("Starting\n");
@@ -49,5 +68,5 @@
 
 			for(i; nthreads) {
-				unpark( threads[i] );
+				post( thddata[i].self );
 			}
 			wait(start, is_tty);
@@ -58,19 +77,23 @@
 
 			for(i; tthreads) {
-				global_counter += join( threads[i] ).count;
+				Partner & partner = join( *threads[i] ).partner;
+				global_counter += partner.count;
+				global_blocks  += partner.blocks;
+				delete(threads[i]);
 			}
 		}
 
-		printf("Duration (ms)       : %'ld\n", (end - start)`ms);
-		printf("Number of processors: %'d\n", nprocs);
-		printf("Number of threads   : %'d\n", tthreads);
-		printf("Cycle size (# thrds): %'d\n", ring_size);
-		printf("Yields per second   : %'18.2lf\n", ((double)global_counter) / (end - start)`s);
-		printf("ns per yields       : %'18.2lf\n", ((double)(end - start)`ns) / global_counter);
-		printf("Total yields        : %'15llu\n", global_counter);
-		printf("Yields per threads  : %'15llu\n", global_counter / tthreads);
-		printf("Yields per procs    : %'15llu\n", global_counter / nprocs);
-		printf("Yields/sec/procs    : %'18.2lf\n", (((double)global_counter) / nprocs) / (end - start)`s);
-		printf("ns per yields/procs : %'18.2lf\n", ((double)(end - start)`ns) / (global_counter / nprocs));
+		printf("Duration (ms)        : %'lf\n", (end - start)`dms);
+		printf("Number of processors : %'d\n", nprocs);
+		printf("Number of threads    : %'d\n", tthreads);
+		printf("Cycle size (# thrds) : %'d\n", ring_size);
+		printf("Total Operations(ops): %'15llu\n", global_counter);
+		printf("Total blocks         : %'15llu\n", global_blocks);
+		printf("Ops per second       : %'18.2lf\n", ((double)global_counter) / (end - start)`ds);
+		printf("ns per ops           : %'18.2lf\n", (end - start)`dns / global_counter);
+		printf("Ops per threads      : %'15llu\n", global_counter / tthreads);
+		printf("Ops per procs        : %'15llu\n", global_counter / nprocs);
+		printf("Ops/sec/procs        : %'18.2lf\n", (((double)global_counter) / nprocs) / (end - start)`ds);
+		printf("ns per ops/procs     : %'18.2lf\n", (end - start)`dns / (global_counter / nprocs));
 		fflush(stdout);
 	}
Index: benchmark/readyQ/cycle.cpp
===================================================================
--- benchmark/readyQ/cycle.cpp	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ benchmark/readyQ/cycle.cpp	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -0,0 +1,86 @@
+
+#include "rq_bench.hpp"
+
+struct Partner {	// per-fibre state for one member of a ring
+	unsigned long long count  = 0;	// loop iterations completed by partner_main
+	unsigned long long blocks = 0;	// sum of the values returned by self.wait()
+	bench_sem self;	// semaphore this partner waits on
+	bench_sem * next;	// semaphore posted after each wake-up; wired up in main
+};
+
+void partner_main( Partner * self ) {
+	Partner & me = *self;
+	me.count = 0;
+	// wait for our turn, wake the next partner, then check the stop condition
+	do {
+		me.blocks += me.self.wait();
+		me.next->post();
+		me.count += 1;
+	} while( clock_mode ? !stop : me.count < stop_count );
+
+	__atomic_fetch_add(&threads_left, -1, __ATOMIC_SEQ_CST);	// report completion to wait()
+}
+
+int main(int argc, char * argv[]) {	// cycle benchmark: rings of fibres pass a wake-up around via bench_sem
+	unsigned ring_size = 2;	// default cycle length, overridable with -r/--ringsize
+	option_t opt[] = {
+		BENCH_OPT,
+		{ 'r', "ringsize", "Number of threads in a cycle", ring_size }
+	};
+	BENCH_OPT_PARSE("cforall cycle benchmark");
+
+	{
+		unsigned long long global_counter = 0;
+		unsigned long long global_blocks  = 0;
+		unsigned tthreads = nthreads * ring_size;	// nthreads rings of ring_size fibres each
+		uint64_t start, end;
+		FibreInit(1, nprocs);
+		{
+			threads_left = tthreads;
+			Fibre * threads[tthreads];
+			Partner thddata[tthreads];
+			for(int i = 0; i < tthreads; i++) {
+				unsigned pi = (i + nthreads) % tthreads;	// successor is nthreads slots ahead, wrapping around
+				thddata[i].next = &thddata[pi].self;
+			}
+			for(int i = 0; i < tthreads; i++) {
+				threads[i] = new Fibre( reinterpret_cast<void (*)(void *)>(partner_main), &thddata[i] );
+			}
+			printf("Starting\n");
+
+			bool is_tty = isatty(STDOUT_FILENO);
+			start = getTimeNsec();
+
+			for(int i = 0; i < nthreads; i++) {	// the first nthreads partners each sit in a distinct ring
+				thddata[i].self.post();
+			}
+			wait(start, is_tty);
+
+			stop = true;	// partner_main checks this flag in clock mode
+			end = getTimeNsec();
+			printf("\nDone\n");
+
+			for(int i = 0; i < tthreads; i++) {
+				fibre_join( threads[i], nullptr );
+				global_counter += thddata[i].count;
+				global_blocks  += thddata[i].blocks;
+			}
+		}
+
+		printf("Duration (ms)        : %'ld\n", to_miliseconds(end - start));
+		printf("Number of processors : %'d\n", nprocs);
+		printf("Number of threads    : %'d\n", tthreads);
+		printf("Cycle size (# thrds) : %'d\n", ring_size);
+		printf("Total Operations(ops): %'15llu\n", global_counter);
+		printf("Total blocks         : %'15llu\n", global_blocks);
+		printf("Ops per second       : %'18.2lf\n", ((double)global_counter) / to_fseconds(end - start));
+		printf("ns per ops           : %'18.2lf\n", ((double)(end - start)) / global_counter);
+		printf("Ops per threads      : %'15llu\n", global_counter / tthreads);
+		printf("Ops per procs        : %'15llu\n", global_counter / nprocs);
+		printf("Ops/sec/procs        : %'18.2lf\n", (((double)global_counter) / nprocs) / to_fseconds(end - start));
+		printf("ns per ops/procs     : %'18.2lf\n", ((double)(end - start)) / (global_counter / nprocs));	// NOTE(review): inner division is integral
+		fflush(stdout);
+	}
+
+	return 0;
+}
Index: benchmark/readyQ/cycle.go
===================================================================
--- benchmark/readyQ/cycle.go	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ benchmark/readyQ/cycle.go	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -2,9 +2,6 @@
 
 import (
-	"bufio"
 	"flag"
 	"fmt"
-	"os"
-	"runtime"
 	"sync/atomic"
 	"time"
@@ -12,34 +9,4 @@
 	"golang.org/x/text/message"
 )
-
-var clock_mode bool
-var threads_left int64
-var stop int32
-var duration float64
-var stop_count uint64
-
-func fflush(f *bufio.Writer) {
-	defer f.Flush()
-	f.Write([]byte("\r"))
-}
-
-func wait(start time.Time, is_tty bool) {
-	f := bufio.NewWriter(os.Stdout)
-	tdur := time.Duration(duration)
-	for true {
-		time.Sleep(100 * time.Millisecond)
-		end := time.Now()
-		delta := end.Sub(start)
-		if is_tty {
-			fmt.Printf(" %.1f",delta.Seconds())
-			fflush(f)
-		}
-		if clock_mode && delta >= (tdur * time.Second) {
-			break
-		} else if !clock_mode && atomic.LoadInt64(&threads_left) == 0 {
-			break
-		}
-	}
-}
 
 func partner(result chan uint64, mine chan int, next chan int) {
@@ -58,38 +25,12 @@
 
 func main() {
-	var nprocs int
-	var nthreads int
 	var ring_size int
 
-	nprocsOpt := flag.Int("p", 1, "The number of processors")
-	nthreadsOpt := flag.Int("t", 1, "The number of threads")
 	ring_sizeOpt := flag.Int("r", 2, "The number of threads per cycles")
-	durationOpt := flag.Float64("d", 0, "Duration of the experiment in seconds")
-	stopOpt := flag.Uint64("i", 0, "Duration of the experiment in iterations")
 
-	flag.Parse()
+	bench_init()
 
-	nprocs = *nprocsOpt
-	nthreads = *nthreadsOpt
 	ring_size = *ring_sizeOpt
-	duration = *durationOpt
-	stop_count = *stopOpt
 
-	if duration > 0 && stop_count > 0 {
-		panic(fmt.Sprintf("--duration and --iterations cannot be used together\n"))
-	} else if duration > 0 {
-		clock_mode = true
-		stop_count = 0xFFFFFFFFFFFFFFFF
-		fmt.Printf("Running for %f seconds\n", duration)
-	} else if stop_count > 0 {
-		clock_mode = false
-		fmt.Printf("Running for %d iterations\n", stop_count)
-	} else {
-		duration = 5
-		clock_mode = true
-		fmt.Printf("Running for %f seconds\n", duration)
-	}
-
-	runtime.GOMAXPROCS(nprocs)
 	tthreads := nthreads * ring_size
 	threads_left = int64(tthreads)
@@ -126,15 +67,15 @@
 
 	p := message.NewPrinter(language.English)
-	p.Printf("Duration (ms)       : %f\n", delta.Seconds());
-	p.Printf("Number of processors: %d\n", nprocs);
-	p.Printf("Number of threads   : %d\n", tthreads);
-	p.Printf("Cycle size (# thrds): %d\n", ring_size);
-	p.Printf("Yields per second   : %18.2f\n", float64(global_counter) / delta.Seconds())
-	p.Printf("ns per yields       : %18.2f\n", float64(delta.Nanoseconds()) / float64(global_counter))
-	p.Printf("Total yields        : %15d\n", global_counter)
-	p.Printf("Yields per threads  : %15d\n", global_counter / uint64(tthreads))
-	p.Printf("Yields per procs    : %15d\n", global_counter / uint64(nprocs))
-	p.Printf("Yields/sec/procs    : %18.2f\n", (float64(global_counter) / float64(nprocs)) / delta.Seconds())
-	p.Printf("ns per yields/procs : %18.2f\n", float64(delta.Nanoseconds()) / (float64(global_counter) / float64(nprocs)))
+	p.Printf("Duration (ms)        : %f\n", delta.Seconds() * 1000);
+	p.Printf("Number of processors : %d\n", nprocs);
+	p.Printf("Number of threads    : %d\n", tthreads);
+	p.Printf("Cycle size (# thrds) : %d\n", ring_size);
+	p.Printf("Total Operations(ops): %15d\n", global_counter)
+	p.Printf("Ops per second       : %18.2f\n", float64(global_counter) / delta.Seconds())
+	p.Printf("ns per ops           : %18.2f\n", float64(delta.Nanoseconds()) / float64(global_counter))
+	p.Printf("Ops per threads      : %15d\n", global_counter / uint64(tthreads))
+	p.Printf("Ops per procs        : %15d\n", global_counter / uint64(nprocs))
+	p.Printf("Ops/sec/procs        : %18.2f\n", (float64(global_counter) / float64(nprocs)) / delta.Seconds())
+	p.Printf("ns per ops/procs     : %18.2f\n", float64(delta.Nanoseconds()) / (float64(global_counter) / float64(nprocs)))
 
 }
Index: benchmark/readyQ/rq_bench.hfa
===================================================================
--- benchmark/readyQ/rq_bench.hfa	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ benchmark/readyQ/rq_bench.hfa	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -66,5 +66,5 @@
 
 void ^?{}( BenchCluster & this ) {
-	adelete( this.nprocs, this.procs );
+	adelete( this.procs );
 	^(this.cl){};
 }
@@ -87,2 +87,51 @@
 	}
 }
+
+struct __attribute__((aligned(128))) bench_sem {	// single-token semaphore, 128-byte aligned
+	struct $thread * volatile ptr;	// 0p: empty, 1p: one post pending, otherwise: the waiting thread
+};
+
+static inline {
+	void  ?{}(bench_sem & this) {
+		this.ptr = 0p;	// start empty: no pending post, no waiter
+	}
+
+	void ^?{}(bench_sem & this) {}
+
+	bool wait(bench_sem & this) {	// returns false if a pending post was consumed, true if the caller parked
+		for() {
+			struct $thread * expected = this.ptr;
+			if(expected == 1p) {	// a post is pending: try to consume it
+				if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+					return false;
+				}
+			}
+			else {	// nothing pending: register as the waiter, then park
+				/* paranoid */ verify( expected == 0p );
+				if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+					park();
+					return true;
+				}
+			}
+
+		}
+	}
+
+	bool post(bench_sem & this) {	// returns true only when a registered waiter was unparked
+		for() {
+			struct $thread * expected = this.ptr;
+			if(expected == 1p) return false;	// a post is already pending; it is not accumulated
+			if(expected == 0p) {	// no waiter yet: leave the token behind
+				if(__atomic_compare_exchange_n(&this.ptr, &expected, 1p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+					return false;
+				}
+			}
+			else {	// ptr holds a waiter: clear the slot, then wake it
+				if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+					unpark( expected );
+					return true;
+				}
+			}
+		}
+	}
+}
Index: benchmark/readyQ/rq_bench.hpp
===================================================================
--- benchmark/readyQ/rq_bench.hpp	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
+++ benchmark/readyQ/rq_bench.hpp	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -0,0 +1,386 @@
+#include <cassert>
+
+#include <time.h>										// timespec
+#include <sys/time.h>									// timeval
+
+enum { TIMEGRAN = 1000000000LL };					// nanosecond granularity, except for timeval
+
+
+#include <libfibre/fibre.h>
+
+
+
+volatile bool stop = false;
+bool clock_mode;
+double duration = -1;
+unsigned long long stop_count = 0;
+unsigned nprocs = 1;
+unsigned nthreads = 1;
+
+volatile unsigned long long threads_left;
+
+#define BENCH_OPT \
+	{'d', "duration",  "Duration of the experiments in seconds", duration }, \
+	{'i', "iterations",  "Number of iterations of the experiments", stop_count }, \
+	{'t', "nthreads",  "Number of threads to use", nthreads }, \
+	{'p', "nprocs",    "Number of processors to use", nprocs }
+
+#define BENCH_OPT_PARSE(name) \
+	{ \
+		int opt_cnt = sizeof(opt) / sizeof(option_t); \
+		char **left; \
+		parse_args( argc, argv, opt, opt_cnt, "[OPTIONS]...\n" name, &left ); \
+		if(duration > 0 && stop_count > 0) { \
+			fprintf(stderr, "--duration and --iterations cannot be used together\n"); \
+			print_args_usage(argc, argv, opt, opt_cnt, "[OPTIONS]...\n" name, true); \
+		} else if(duration > 0) { \
+			clock_mode = true; \
+			stop_count = 0xFFFFFFFFFFFFFFFF; \
+			printf("Running for %lf seconds\n", duration); \
+		} else if(stop_count > 0) { \
+			clock_mode = false; \
+			printf("Running for %llu iterations\n", stop_count); \
+		} else { \
+			duration = 5; clock_mode = true;\
+			printf("Running for %lf seconds\n", duration); \
+		} \
+	}
+
+uint64_t getTimeNsec() {	// CLOCK_REALTIME reading expressed in nanoseconds
+	timespec now;
+	clock_gettime( CLOCK_REALTIME, &now );
+	return now.tv_nsec + (int64_t)now.tv_sec * TIMEGRAN;
+}
+
+uint64_t to_miliseconds( uint64_t durtn ) { return durtn / (TIMEGRAN / 1000LL); }	// nanoseconds -> whole milliseconds
+double to_fseconds(uint64_t durtn ) { return durtn / (double)TIMEGRAN; }	// nanoseconds -> fractional seconds
+uint64_t from_fseconds(double sec) { return sec * TIMEGRAN; }	// seconds -> nanoseconds
+
+// Poll every 100ms until the run is over: elapsed time in clock mode, all fibres done otherwise.
+void wait(const uint64_t & start, bool is_tty) {
+	for(;;) {
+		Fibre::usleep(100000);
+		const uint64_t elapsed = getTimeNsec() - start;
+		if(is_tty) {
+			// progress display; the carriage return sends the cursor back to the line start
+			printf(" %.1f\r", to_fseconds(elapsed));
+			fflush(stdout);
+		}
+		// the mode decides which stop condition applies
+		const bool finished = clock_mode
+			? elapsed >= from_fseconds(duration)
+			: threads_left == 0;
+		if( finished ) break;
+	}
+}
+
+class __attribute__((aligned(128))) bench_sem {	// single-token semaphore, 128-byte aligned
+	Fibre * volatile ptr = nullptr;	// nullptr: empty, 1: one post pending, otherwise: the waiting fibre
+public:
+	inline bool wait() {	// returns false if a pending post was consumed, true if the caller parked
+		static Fibre * const ready  = reinterpret_cast<Fibre * const>(1ull);	// sentinel for "post pending"
+		for(;;) {
+			Fibre * expected = this->ptr;
+			if(expected == ready) {	// a post is pending: try to consume it
+				if(__atomic_compare_exchange_n(&this->ptr, &expected, nullptr, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+					return false;
+				}
+			}
+			else {	// nothing pending: register as the waiter, then park
+				/* paranoid */ assert( expected == nullptr );
+				if(__atomic_compare_exchange_n(&this->ptr, &expected, fibre_self(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+					fibre_park();
+					return true;
+				}
+			}
+
+		}
+	}
+
+	inline bool post() {	// returns true only when a registered waiter was unparked
+		static Fibre * const ready  = reinterpret_cast<Fibre * const>(1ull);	// sentinel for "post pending"
+		for(;;) {
+			Fibre * expected = this->ptr;
+			if(expected == ready) return false;	// a post is already pending; it is not accumulated
+			if(expected == nullptr) {	// no waiter yet: leave the token behind
+				if(__atomic_compare_exchange_n(&this->ptr, &expected, ready, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+					return false;
+				}
+			}
+			else {	// ptr holds a waiter: clear the slot, then wake it
+				if(__atomic_compare_exchange_n(&this->ptr, &expected, nullptr, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+					fibre_unpark( expected );
+					return true;
+				}
+			}
+		}
+	}
+};
+
+// ==========================================================================================
+#include <cstring>
+
+//-----------------------------------------------------------------------------
+// Typed argument parsing
+bool parse_yesno(const char * arg, bool & value ) {
+	// Only the exact spellings "yes" and "no" are recognised.
+	const bool yes = strcmp(arg, "yes") == 0;
+	const bool no  = !yes && strcmp(arg, "no") == 0;
+
+	if(!yes && !no) {
+		// anything else is a parse failure and leaves value untouched
+		return false;
+	}
+
+	value = yes;
+	return true;
+}
+
+bool parse_settrue (const char *, bool & value ) {	// flag-style option: argument text is ignored, value becomes true
+	value = true;
+	return true;
+}
+
+bool parse_setfalse(const char *, bool & value )  {	// flag-style option: argument text is ignored, value becomes false
+	value = false;
+	return true;
+}
+
+bool parse(const char * arg, const char * & value ) {	// string option: stores the argument pointer itself, no copy
+	value = arg;
+	return true;
+}
+
+bool parse(const char * arg, int & value) {	// decimal int option; false on trailing junk or out-of-range input
+	char * end;
+	long long int r = strtoll(arg, &end, 10);	// keep full width so out-of-range input is detectable
+	if(*end != '\0') return false;
+	if(r < INT_MIN || r > INT_MAX) return false;	// reject instead of silently truncating, like the unsigned overloads
+	value = r;
+	return true;
+}
+
+bool parse(const char * arg, unsigned & value) {
+	char * rest;
+	const unsigned long long int parsed = strtoull(arg, &rest, 10);
+	// the whole argument must be consumed and the result must fit in `unsigned`
+	const bool ok = *rest == '\0' && parsed <= UINT_MAX;
+	if(ok) value = parsed;
+
+	return ok;
+}
+
+bool parse(const char * arg, unsigned long & value) {
+	char * rest;
+	const unsigned long long int parsed = strtoull(arg, &rest, 10);
+	// the whole argument must be consumed and the result must fit in `unsigned long`
+	const bool ok = *rest == '\0' && parsed <= ULONG_MAX;
+	if(ok) value = parsed;
+
+	return ok;
+}
+
+bool parse(const char * arg, unsigned long long & value) {
+        char * rest;
+        const unsigned long long int parsed = strtoull(arg, &rest, 10);
+        // the whole argument must be consumed; the range test mirrors the narrower overloads
+        const bool ok = *rest == '\0' && parsed <= ULLONG_MAX;
+        if(ok) value = parsed;
+
+        return ok;
+}
+
+bool parse(const char * arg, double & value) {
+	char * rest;
+	const double parsed = strtod(arg, &rest);
+	// trailing characters after the number make the whole argument invalid
+	if(*rest != '\0') return false;
+	value = parsed;
+	return true;
+}
+
+//-----------------------------------------------------------------------------
+struct option_t {	// one command-line option: names, help text, destination and parser
+      char short_name;	// single-character form, also used as the getopt_long val
+      const char * long_name;	// long form; parse_args skips the long entry when this is null
+      const char * help;	// text printed by usage()
+      void * variable;	// type-erased address of the destination variable
+      bool (*parse_fun)(const char *, void * );	// type-erased parser; returns false on bad input
+
+	template<typename T>
+	inline option_t( char short_name, const char * long_name, const char * help, T & variable ) {	// picks the parse() overload matching T
+		this->short_name = short_name;
+		this->long_name  = long_name;
+		this->help       = help;
+		this->variable   = reinterpret_cast<void*>(&variable);
+		this->parse_fun  = reinterpret_cast<bool (*)(const char *, void * )>(static_cast<bool (*)(const char *, T & )>(parse));
+	}
+
+	template<typename T>
+	inline option_t( char short_name, const char * long_name, const char * help, T & variable, bool (*parse)(const char *, T & )) {	// caller supplies the parser
+		this->short_name = short_name;
+		this->long_name  = long_name;
+		this->help       = help;
+		this->variable   = reinterpret_cast<void*>(&variable);
+		this->parse_fun  = reinterpret_cast<bool (*)(const char *, void * )>(parse);
+	}
+};
+
+extern option_t last_option;
+
+
+//-----------------------------------------------------------------------------
+#include <cstdint>
+#include <climits>
+#include <errno.h>
+#include <unistd.h>
+extern "C" {
+	#include <getopt.h>
+	#include <sys/ioctl.h>
+
+	extern FILE * stderr;
+	extern FILE * stdout;
+
+	extern int fileno(FILE *stream);
+
+	extern int fprintf ( FILE * stream, const char * format, ... );
+
+	extern          long long int strtoll (const char* str, char** endptr, int base);
+	extern unsigned long long int strtoull(const char* str, char** endptr, int base);
+	extern                 double strtod  (const char* str, char** endptr);
+}
+
+static void usage(char * cmd, option_t options[], size_t opt_count, const char * usage, FILE * out)  __attribute__ ((noreturn));
+
+//-----------------------------------------------------------------------------
+// getopt_long wrapping
+void parse_args(	// getopt_long driver: fills each option's variable, prints usage and exits on error or -h
+	int argc,
+	char * argv[],
+	option_t options[],
+	size_t opt_count,
+	const char * usage_msg,
+	char ** * left	// if non-null, receives the first non-option argument position
+) {
+	struct option optarr[opt_count + 2];	// every long option, plus "help" and the all-zero terminator
+	{
+		int idx = 0;
+		for(int i = 0; i < opt_count; i++) {
+			if(options[i].long_name) {
+				optarr[idx].name = options[i].long_name;
+				optarr[idx].flag = nullptr;
+				optarr[idx].val  = options[i].short_name;
+				if(    ((intptr_t)options[i].parse_fun) == ((intptr_t)parse_settrue)
+				    || ((intptr_t)options[i].parse_fun) == ((intptr_t)parse_setfalse) ) {
+					optarr[idx].has_arg = no_argument;
+				} else {
+					optarr[idx].has_arg = required_argument;
+				}
+				idx++;
+			}
+		}
+		optarr[idx+0].name = "help";
+		optarr[idx+0].has_arg = no_argument;
+		optarr[idx+0].flag = 0;
+		optarr[idx+0].val = 'h';
+		optarr[idx+1].name = 0;
+		optarr[idx+1].has_arg = no_argument;
+		optarr[idx+1].flag = 0;
+		optarr[idx+1].val = 0;
+	}
+
+	char optstring[opt_count * 2 + 2];	// at most "x:" per option, plus 'h' and the terminator
+	for(auto & o : optstring) {
+		o = '\0';
+	}
+	{
+		int idx = 0;
+		for(int i = 0; i < opt_count; i++) {
+			optstring[idx] = options[i].short_name;
+			idx++;
+			if(    ((intptr_t)options[i].parse_fun) != ((intptr_t)parse_settrue)
+			    && ((intptr_t)options[i].parse_fun) != ((intptr_t)parse_setfalse) ) {
+				optstring[idx] = ':';	// option takes a required argument
+				idx++;
+			}
+		}
+		optstring[idx+0] = 'h';
+		optstring[idx+1] = '\0';
+	}
+
+	FILE * out = stderr;
+	for(;;) {
+		int idx = 0;
+		int opt = getopt_long(argc, argv, optstring, optarr, &idx);
+		switch(opt) {
+			case -1:	// no more options
+				if(left != nullptr) *left = argv + optind;
+				return;
+			case 'h':	// help goes to stdout, then shares the usage path below
+				out = stdout;
+			case '?':
+				usage(argv[0], options, opt_count, usage_msg, out);
+			default:
+				for(int i = 0; i < opt_count; i++) {
+					if(opt == options[i].short_name) {
+						const char * arg = optarg ? optarg : "";
+						bool success = options[i].parse_fun( arg, options[i].variable );
+						if(success) goto NEXT_ARG;
+
+						fprintf(out, "Argument '%s' for option %c could not be parsed\n\n", arg, (char)opt);
+						usage(argv[0], options, opt_count, usage_msg, out);
+					}
+				}
+				std::abort();	// getopt_long returned a character no option declares
+		}
+		NEXT_ARG:;
+	}
+}
+
+//-----------------------------------------------------------------------------
+// Print usage
+static void printopt(FILE * out, int width, int max, char sn, const char * ln, const char * help) {
+	const int room = max - (11 + width);	// columns left for help text after the option names
+	const int hwidth = room > 0 ? room : max;	// no room left: fall back to the full width
+	fprintf(out, "  -%c, --%-*s   %.*s\n", sn, width, ln, hwidth, help);
+	for(;;) {	// print the rest of the help text in hwidth-sized chunks under the first line
+		const size_t left = strlen(help);
+		help += left < (unsigned long)hwidth ? left : (unsigned long)hwidth;
+		if(*help == '\0') break;
+		fprintf(out, "%*s%.*s\n", width + 11, "", hwidth, help);
+	}
+}
+
+__attribute__((noreturn)) void print_args_usage(int , char * argv[], option_t options[], size_t opt_count, const char * usage_msg, bool error) {	// public entry to usage(); argc is unused
+	usage(argv[0], options, opt_count, usage_msg, error ? stderr : stdout);	// errors go to stderr, plain help to stdout
+}
+
+static __attribute__((noreturn)) void usage(char * cmd, option_t options[], size_t opt_count, const char * help, FILE * out) {
+	// width of the long-name column: the longest long option name
+	int width = 0;
+	for(int i = 0; i < opt_count; i++) {
+		const char * name = options[i].long_name;
+		if(!name) continue;
+		int len = strlen(name);
+		width = len > width ? len : width;
+	}
+
+	// wrap help text at the terminal width when writing to a tty, at a huge width otherwise
+	int max_width = 1000000;
+	const int outfd = fileno(out);
+	if(isatty(outfd)) {
+		struct winsize size;
+		if(ioctl(outfd, TIOCGWINSZ, &size) < 0) abort(); // "ioctl error: (%d) %s\n", (int)errno, strerror(errno)
+		max_width = size.ws_col;
+	}
+
+	fprintf(out, "Usage:\n  %s %s\n", cmd, help);
+
+	for(int i = 0; i < opt_count; i++) {
+		const option_t & o = options[i];
+		printopt(out, width, max_width, o.short_name, o.long_name, o.help);
+	}
+	fprintf(out, "  -%c, --%-*s   %s\n", 'h', width, "help", "print this help message");
+	exit(out == stdout ? 0 : 1);	// plain help request succeeds, anything else is an error
+}
+
Index: benchmark/rmit.py
===================================================================
--- benchmark/rmit.py	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ benchmark/rmit.py	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -10,4 +10,5 @@
 
 import argparse
+import datetime
 import itertools
 import os
@@ -101,4 +102,16 @@
 	return nopts
 
+def actions_eta(actions):	# sum, in seconds, of the value following every '-d' across all actions
+	time = 0
+	for a in actions:
+		i = 0
+		while i < len(a):
+			if a[i] == '-d':
+				i += 1
+				if i != len(a):	# a trailing '-d' with no value contributes nothing
+					time += float(a[i])	# durations may be fractional, which int() would reject
+			i += 1
+	return time
+
 if __name__ == "__main__":
 	# ================================================================================
@@ -160,7 +173,10 @@
 	# ================================================================================
 	# Prepare to run
+
+	# find expected time
+	time = actions_eta(actions)
+	print("Running {} trials{}".format(len(actions), "" if time == 0 else " (expecting to take {})".format(str(datetime.timedelta(seconds=int(time)))) ))
+
 	random.shuffle(actions)
-
-	print("Running {} trials".format(len(actions)))
 	result = []
 
@@ -209,5 +225,8 @@
 			d = [r[0], r[1]]
 			for k in headers[2:]:
-				d.append(r[2][k])
+				try:
+					d.append(r[2][k])
+				except:
+					d.append(0.0)
 
 			data.append(d)
Index: configure.ac
===================================================================
--- configure.ac	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ configure.ac	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -28,12 +28,14 @@
 # New AST toggling support
 AH_TEMPLATE([CFA_USE_NEW_AST],[Sets whether or not to use the new-ast, this is adefault value and can be overrided by --old-ast and --new-ast])
+DEFAULT_NEW_AST="False"
 AC_ARG_ENABLE(new-ast,
 	[  --enable-new-ast     whether or not to use new ast as the default AST algorithm],
 	[case "${enableval}" in
-		yes) newast=true ;;
-		no)  newast=false ;;
+		yes) newast=true ; DEFAULT_NEW_AST="True"  ;;
+		no)  newast=false; DEFAULT_NEW_AST="False" ;;
 		*) AC_MSG_ERROR([bad value ${enableval} for --enable-new-ast]) ;;
 	esac],[newast=false])
 AC_DEFINE_UNQUOTED([CFA_USE_NEW_AST], $newast)
+AC_SUBST(DEFAULT_NEW_AST)
 
 #==============================================================================
Index: libcfa/src/concurrency/alarm.cfa
===================================================================
--- libcfa/src/concurrency/alarm.cfa	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ libcfa/src/concurrency/alarm.cfa	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -45,10 +45,10 @@
 //=============================================================================================
 
-void ?{}( alarm_node_t & this, $thread * thrd, Time alarm, Duration period ) with( this ) {
+void ?{}( alarm_node_t & this, $thread * thrd, Time alarm, Duration period) with( this ) {
 	this.thrd = thrd;
 	this.alarm = alarm;
 	this.period = period;
 	set = false;
-	kernel_alarm = false;
+	type = User;
 }
 
@@ -58,5 +58,13 @@
 	this.period = period;
 	set = false;
-	kernel_alarm = true;
+	type = Kernel;
+}
+void ?{}( alarm_node_t & this, $thread * thrd, Time alarm, Duration period, Alarm_Callback callback ) with( this ) {	// callback-style alarm
+	this.thrd = thrd;
+	this.alarm = alarm;
+	this.period = period;
+	this.callback = callback;	// function pointer taking the alarm node
+	set = false;	// not registered yet
+	type = Callback;	// the other constructors shown here set User or Kernel
+}
 
Index: libcfa/src/concurrency/alarm.hfa
===================================================================
--- libcfa/src/concurrency/alarm.hfa	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ libcfa/src/concurrency/alarm.hfa	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -39,4 +39,10 @@
 //=============================================================================================
 
+enum alarm_type{ Kernel = 0, User = 1, Callback = 2 };
+
+struct alarm_node_t;
+
+typedef void (*Alarm_Callback)(alarm_node_t & );
+
 struct alarm_node_t {
 	Time alarm;				// time when alarm goes off
@@ -50,6 +56,8 @@
 	};
 
+	Alarm_Callback callback;
+
 	bool set		:1;		// whether or not the alarm has be registered
-	bool kernel_alarm	:1;		// true if this is not a user defined alarm
+	enum alarm_type type;		// kind of alarm: Kernel, User or Callback
 };
 DLISTED_MGD_IMPL_OUT(alarm_node_t)
@@ -57,4 +65,5 @@
 void ?{}( alarm_node_t & this, $thread * thrd, Time alarm, Duration period );
 void ?{}( alarm_node_t & this, processor   * proc, Time alarm, Duration period );
+void ?{}( alarm_node_t & this, $thread * thrd, Time alarm, Duration period, Alarm_Callback callback );
 void ^?{}( alarm_node_t & this );
 
Index: libcfa/src/concurrency/coroutine.cfa
===================================================================
--- libcfa/src/concurrency/coroutine.cfa	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ libcfa/src/concurrency/coroutine.cfa	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -134,5 +134,5 @@
 void ^?{}($coroutine& this) {
 	if(this.state != Halted && this.state != Start && this.state != Primed) {
-		$coroutine * src = TL_GET( this_thread )->curr_cor;
+		$coroutine * src = active_coroutine();
 		$coroutine * dst = &this;
 
@@ -240,5 +240,5 @@
 
 	struct $coroutine * __cfactx_cor_finish(void) {
-		struct $coroutine * cor = kernelTLS.this_thread->curr_cor;
+		struct $coroutine * cor = active_coroutine();
 
 		if(cor->state == Primed) {
Index: libcfa/src/concurrency/coroutine.hfa
===================================================================
--- libcfa/src/concurrency/coroutine.hfa	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ libcfa/src/concurrency/coroutine.hfa	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -63,5 +63,5 @@
 void prime(T & cor);
 
-static inline struct $coroutine * active_coroutine() { return TL_GET( this_thread )->curr_cor; }
+static inline struct $coroutine * active_coroutine() { return active_thread()->curr_cor; }
 
 //-----------------------------------------------------------------------------
@@ -87,5 +87,5 @@
 
 	// set new coroutine that task is executing
-	TL_GET( this_thread )->curr_cor = dst;
+	active_thread()->curr_cor = dst;
 
 	// context switch to specified coroutine
@@ -112,5 +112,5 @@
 		// will also migrate which means this value will
 		// stay in syn with the TLS
-		$coroutine * src = TL_GET( this_thread )->curr_cor;
+		$coroutine * src = active_coroutine();
 
 		assertf( src->last != 0,
@@ -138,12 +138,12 @@
 	// will also migrate which means this value will
 	// stay in syn with the TLS
-	$coroutine * src = TL_GET( this_thread )->curr_cor;
+	$coroutine * src = active_coroutine();
 	$coroutine * dst = get_coroutine(cor);
 
 	if( unlikely(dst->context.SP == 0p) ) {
-		TL_GET( this_thread )->curr_cor = dst;
+		active_thread()->curr_cor = dst;
 		__stack_prepare(&dst->stack, 65000);
 		__cfactx_start(main, dst, cor, __cfactx_invoke_coroutine);
-		TL_GET( this_thread )->curr_cor = src;
+		active_thread()->curr_cor = src;
 	}
 
@@ -175,5 +175,5 @@
 	// will also migrate which means this value will
 	// stay in syn with the TLS
-	$coroutine * src = TL_GET( this_thread )->curr_cor;
+	$coroutine * src = active_coroutine();
 
 	// not resuming self ?
Index: libcfa/src/concurrency/exception.cfa
===================================================================
--- libcfa/src/concurrency/exception.cfa	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ libcfa/src/concurrency/exception.cfa	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -72,5 +72,5 @@
 	void * stop_param;
 
-	struct $thread * this_thread = TL_GET( this_thread );
+	struct $thread * this_thread = active_thread();
 	if ( &this_thread->self_cor != this_thread->curr_cor ) {
 		struct $coroutine * cor = this_thread->curr_cor;
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ libcfa/src/concurrency/io.cfa	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -76,5 +76,5 @@
 
 	static inline bool next( __leaderlock_t & this ) {
-		/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+		/* paranoid */ verify( ! __preemption_enabled() );
 		struct $thread * nextt;
 		for() {
@@ -168,5 +168,5 @@
 	// This is NOT thread-safe
 	static [int, bool] __drain_io( & struct __io_data ring ) {
-		/* paranoid */ verify( !kernelTLS.preemption_state.enabled );
+		/* paranoid */ verify( ! __preemption_enabled() );
 
 		unsigned to_submit = 0;
@@ -404,5 +404,5 @@
 					return;
 				}
-				/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+				/* paranoid */ verify( ! __preemption_enabled() );
 				__STATS__( true,
 					io.submit_q.leader += 1;
@@ -442,5 +442,5 @@
 
 			#if defined(LEADER_LOCK)
-				/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+				/* paranoid */ verify( ! __preemption_enabled() );
 				next(ring.submit_q.submit_lock);
 			#else
Index: libcfa/src/concurrency/io/setup.cfa
===================================================================
--- libcfa/src/concurrency/io/setup.cfa	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ libcfa/src/concurrency/io/setup.cfa	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -149,4 +149,5 @@
 		id.full_proc = false;
 		id.id = doregister(&id);
+		__cfaabi_tls.this_proc_id = &id;
 		__cfaabi_dbg_print_safe( "Kernel : IO poller thread starting\n" );
 
@@ -178,7 +179,7 @@
 				__cfadbg_print_safe(io_core, "Kernel I/O : Unparking io poller %p\n", io_ctx);
 				#if !defined( __CFA_NO_STATISTICS__ )
-					kernelTLS.this_stats = io_ctx->self.curr_cluster->stats;
+					__cfaabi_tls.this_stats = io_ctx->self.curr_cluster->stats;
 				#endif
-				__post( io_ctx->sem, &id );
+				post( io_ctx->sem );
 			}
 		}
@@ -235,5 +236,5 @@
 			if( thrd.state == Ready || thrd.preempted != __NO_PREEMPTION ) {
 
-				ready_schedule_lock( (struct __processor_id_t *)active_processor() );
+				ready_schedule_lock();
 
 					// This is the tricky case
@@ -253,5 +254,5 @@
 					thrd.preempted = __NO_PREEMPTION;
 
-				ready_schedule_unlock( (struct __processor_id_t *)active_processor() );
+				ready_schedule_unlock();
 
 				// Pretend like the thread was blocked all along
@@ -275,5 +276,5 @@
 			}
 		} else {
-			unpark( &thrd );
+			post( this.thrd.sem );
 		}
 
Index: libcfa/src/concurrency/iocall.cfa
===================================================================
--- libcfa/src/concurrency/iocall.cfa	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ 	(revision )
@@ -1,621 +1,0 @@
-//
-// Cforall Version 1.0.0 Copyright (C) 2020 University of Waterloo
-//
-// The contents of this file are covered under the licence agreement in the
-// file "LICENCE" distributed with Cforall.
-//
-// iocall.cfa --
-//
-// Author           : Thierry Delisle
-// Created On       : Wed Jul  1 14:51:00 2020
-// Last Modified By :
-// Last Modified On :
-// Update Count     :
-//
-
-#define __cforall_thread__
-
-#include "bits/defs.hfa"
-#include "kernel.hfa"
-
-//=============================================================================================
-// I/O uring backend
-//=============================================================================================
-
-#if defined(CFA_HAVE_LINUX_IO_URING_H)
-	#include <assert.h>
-	#include <stdint.h>
-	#include <errno.h>
-	#include <linux/io_uring.h>
-
-	#include "kernel/fwd.hfa"
-	#include "io/types.hfa"
-
-	extern [* struct io_uring_sqe, __u32] __submit_alloc( struct __io_data & ring, __u64 data );
-	extern void __submit( struct io_context * ctx, __u32 idx ) __attribute__((nonnull (1)));
-
-	static inline void ?{}(struct io_uring_sqe & this, __u8 opcode, int fd) {
-		this.opcode = opcode;
-		#if !defined(IOSQE_ASYNC)
-			this.flags = 0;
-		#else
-			this.flags = IOSQE_ASYNC;
-		#endif
-		this.ioprio = 0;
-		this.fd = fd;
-		this.off = 0;
-		this.addr = 0;
-		this.len = 0;
-		this.rw_flags = 0;
-		this.__pad2[0] = this.__pad2[1] = this.__pad2[2] = 0;
-	}
-
-	static inline void ?{}(struct io_uring_sqe & this, __u8 opcode, int fd, void * addr, __u32 len, __u64 off ) {
-		(this){ opcode, fd };
-		this.off = off;
-		this.addr = (__u64)(uintptr_t)addr;
-		this.len = len;
-	}
-
-	static inline io_context * __get_io_context( void ) {
-		cluster * cltr = active_cluster();
-		/* paranoid */ verifyf( cltr, "No active cluster for io operation\n");
-		assertf( cltr->io.cnt > 0, "Cluster %p has no default io contexts and no context was specified\n", cltr );
-		/* paranoid */ verifyf( cltr->io.ctxs, "default io contexts for cluster %p are missing\n", cltr);
-		return &cltr->io.ctxs[ __tls_rand() % cltr->io.cnt ];
-	}
-
-
-	#if defined(CFA_HAVE_IOSQE_FIXED_FILE) && defined(CFA_HAVE_IOSQE_IO_DRAIN) && defined(CFA_HAVE_IOSQE_ASYNC)
-		#define REGULAR_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_DRAIN | IOSQE_ASYNC)
-	#elif defined(CFA_HAVE_IOSQE_FIXED_FILE) && defined(CFA_HAVE_IOSQE_ASYNC)
-		#define REGULAR_FLAGS (IOSQE_FIXED_FILE | IOSQE_ASYNC)
-	#elif defined(CFA_HAVE_IOSQE_FIXED_FILE) && defined(CFA_HAVE_IOSQE_IO_DRAIN)
-		#define REGULAR_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_DRAIN)
-	#elif defined(CFA_HAVE_IOSQE_IO_DRAIN) && defined(CFA_HAVE_IOSQE_ASYNC)
-		#define REGULAR_FLAGS (IOSQE_IO_DRAIN | IOSQE_ASYNC)
-	#elif defined(CFA_HAVE_IOSQE_FIXED_FILE)
-		#define REGULAR_FLAGS (IOSQE_FIXED_FILE)
-	#elif defined(CFA_HAVE_IOSQE_IO_DRAIN)
-		#define REGULAR_FLAGS (IOSQE_IO_DRAIN)
-	#elif defined(CFA_HAVE_IOSQE_ASYNC)
-		#define REGULAR_FLAGS (IOSQE_ASYNC)
-	#else
-		#define REGULAR_FLAGS (0)
-	#endif
-
-	#if defined(CFA_HAVE_IOSQE_IO_LINK) && defined(CFA_HAVE_IOSQE_IO_HARDLINK)
-		#define LINK_FLAGS (IOSQE_IO_LINK | IOSQE_IO_HARDLINK)
-	#elif defined(CFA_HAVE_IOSQE_IO_LINK)
-		#define LINK_FLAGS (IOSQE_IO_LINK)
-	#elif defined(CFA_HAVE_IOSQE_IO_HARDLINK)
-		#define LINK_FLAGS (IOSQE_IO_HARDLINK)
-	#else
-		#define LINK_FLAGS (0)
-	#endif
-
-	#if defined(CFA_HAVE_SPLICE_F_FD_IN_FIXED)
-		#define SPLICE_FLAGS (SPLICE_F_FD_IN_FIXED)
-	#else
-		#define SPLICE_FLAGS (0)
-	#endif
-
-	#define __submit_prelude \
-		if( 0 != (submit_flags & LINK_FLAGS) ) { errno = ENOTSUP; return -1; } \
-		(void)timeout; (void)cancellation; \
-		if( !context ) context = __get_io_context(); \
-		__io_user_data_t data = { 0 }; \
-		struct __io_data & ring = *context->thrd.ring; \
-		struct io_uring_sqe * sqe; \
-		__u32 idx; \
-		__u8 sflags = REGULAR_FLAGS & submit_flags; \
-		[sqe, idx] = __submit_alloc( ring, (__u64)(uintptr_t)&data ); \
-		sqe->flags = sflags;
-
-	#define __submit_wait \
-		/*__cfaabi_bits_print_safe( STDERR_FILENO, "Preparing user data %p for %p\n", &data, data.thrd );*/ \
-		verify( sqe->user_data == (__u64)(uintptr_t)&data ); \
-		__submit( context, idx ); \
-		wait( data.sem ); \
-		if( data.result < 0 ) { \
-			errno = -data.result; \
-			return -1; \
-		} \
-		return data.result;
-#endif
-
-//=============================================================================================
-// I/O Forwards
-//=============================================================================================
-#include <time.hfa>
-
-// Some forward declarations
-#include <errno.h>
-#include <unistd.h>
-
-extern "C" {
-	#include <sys/types.h>
-	#include <sys/socket.h>
-	#include <sys/syscall.h>
-
-#if defined(HAVE_PREADV2)
-	struct iovec;
-	extern ssize_t preadv2 (int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
-#endif
-#if defined(HAVE_PWRITEV2)
-	struct iovec;
-	extern ssize_t pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
-#endif
-
-	extern int fsync(int fd);
-
-	#if __OFF_T_MATCHES_OFF64_T
-		typedef __off64_t off_t;
-	#else
-		typedef __off_t off_t;
-	#endif
-	typedef __off64_t off64_t;
-	extern int sync_file_range(int fd, off64_t offset, off64_t nbytes, unsigned int flags);
-
-	struct msghdr;
-	struct sockaddr;
-	extern ssize_t sendmsg(int sockfd, const struct msghdr *msg, int flags);
-	extern ssize_t recvmsg(int sockfd, struct msghdr *msg, int flags);
-	extern ssize_t send(int sockfd, const void *buf, size_t len, int flags);
-	extern ssize_t recv(int sockfd, void *buf, size_t len, int flags);
-	extern int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
-	extern int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
-
-	extern int fallocate(int fd, int mode, off_t offset, off_t len);
-	extern int posix_fadvise(int fd, off_t offset, off_t len, int advice);
-	extern int madvise(void *addr, size_t length, int advice);
-
-	extern int openat(int dirfd, const char *pathname, int flags, mode_t mode);
-	extern int close(int fd);
-
-	extern ssize_t read (int fd, void *buf, size_t count);
-
-	extern ssize_t splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags);
-	extern ssize_t tee(int fd_in, int fd_out, size_t len, unsigned int flags);
-}
-
-//=============================================================================================
-// I/O Interface
-//=============================================================================================
-
-//-----------------------------------------------------------------------------
-// Asynchronous operations
-#if defined(HAVE_PREADV2)
-	ssize_t cfa_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-		#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_READV)
-			return preadv2(fd, iov, iovcnt, offset, flags);
-		#else
-			__submit_prelude
-
-			sqe->opcode = IORING_OP_READV;
-			sqe->ioprio = 0;
-			sqe->fd = fd;
-			sqe->off = offset;
-			sqe->addr = (__u64)iov;
-			sqe->len = iovcnt;
-			sqe->rw_flags = 0;
-			sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
-
-			__submit_wait
-		#endif
-	}
-#endif
-
-#if defined(HAVE_PWRITEV2)
-	ssize_t cfa_pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-		#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_WRITEV)
-			return pwritev2(fd, iov, iovcnt, offset, flags);
-		#else
-			__submit_prelude
-
-			sqe->opcode = IORING_OP_WRITEV;
-			sqe->ioprio = 0;
-			sqe->fd = fd;
-			sqe->off = offset;
-			sqe->addr = (__u64)iov;
-			sqe->len = iovcnt;
-			sqe->rw_flags = 0;
-			sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
-
-			__submit_wait
-		#endif
-	}
-#endif
-
-int cfa_fsync(int fd, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_FSYNC)
-		return fsync(fd);
-	#else
-		__submit_prelude
-
-		sqe->opcode = IORING_OP_FSYNC;
-		sqe->ioprio = 0;
-		sqe->fd = fd;
-		sqe->off = 0;
-		sqe->addr = 0;
-		sqe->len = 0;
-		sqe->rw_flags = 0;
-		sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
-
-		__submit_wait
-	#endif
-}
-
-int cfa_sync_file_range(int fd, off64_t offset, off64_t nbytes, unsigned int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_SYNC_FILE_RANGE)
-		return sync_file_range(fd, offset, nbytes, flags);
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_SYNC_FILE_RANGE, fd };
-		sqe->off = offset;
-		sqe->len = nbytes;
-		sqe->sync_range_flags = flags;
-
-		__submit_wait
-	#endif
-}
-
-
-ssize_t cfa_sendmsg(int sockfd, const struct msghdr *msg, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_SENDMSG)
-		return sendmsg(sockfd, msg, flags);
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_SENDMSG, sockfd, msg, 1, 0 };
-		sqe->msg_flags = flags;
-
-		__submit_wait
-	#endif
-}
-
-ssize_t cfa_recvmsg(int sockfd, struct msghdr *msg, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_RECVMSG)
-		return recvmsg(sockfd, msg, flags);
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_RECVMSG, sockfd, msg, 1, 0 };
-		sqe->msg_flags = flags;
-
-		__submit_wait
-	#endif
-}
-
-ssize_t cfa_send(int sockfd, const void *buf, size_t len, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_SEND)
-		return send( sockfd, buf, len, flags );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_SEND, sockfd };
-		sqe->addr = (__u64)buf;
-		sqe->len = len;
-		sqe->msg_flags = flags;
-
-		__submit_wait
-	#endif
-}
-
-ssize_t cfa_recv(int sockfd, void *buf, size_t len, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_RECV)
-		return recv( sockfd, buf, len, flags );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_RECV, sockfd };
-		sqe->addr = (__u64)buf;
-		sqe->len = len;
-		sqe->msg_flags = flags;
-
-		__submit_wait
-	#endif
-}
-
-int cfa_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_ACCEPT)
-		return accept4( sockfd, addr, addrlen, flags );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_ACCEPT, sockfd };
-		sqe->addr  = (__u64)addr;
-		sqe->addr2 = (__u64)addrlen;
-		sqe->accept_flags = flags;
-
-		__submit_wait
-	#endif
-}
-
-int cfa_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_CONNECT)
-		return connect( sockfd, addr, addrlen );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_CONNECT, sockfd };
-		sqe->addr = (__u64)addr;
-		sqe->off  = (__u64)addrlen;
-
-		__submit_wait
-	#endif
-}
-
-int cfa_fallocate(int fd, int mode, off_t offset, off_t len, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_FALLOCATE)
-		return fallocate( fd, mode, offset, len );
-	#else
-		__submit_prelude
-
-		#warning FALLOCATE documentation for linux 5.7 is incorrect, and does not handle mode
-
-		(*sqe){ IORING_OP_FALLOCATE, fd };
-		sqe->off = offset;
-		sqe->len = mode;
-		sqe->addr = len;
-
-		__submit_wait
-	#endif
-}
-
-int cfa_fadvise(int fd, off_t offset, off_t len, int advice, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_FADVISE)
-		return posix_fadvise( fd, offset, len, advice );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_FADVISE, fd };
-		sqe->off = (__u64)offset;
-		sqe->len = len;
-		sqe->fadvise_advice = advice;
-
-		__submit_wait
-	#endif
-}
-
-int cfa_madvise(void *addr, size_t length, int advice, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_MADVISE)
-		return madvise( addr, length, advice );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_MADVISE, 0 };
-		sqe->addr = (__u64)addr;
-		sqe->len = length;
-		sqe->fadvise_advice = advice;
-
-		__submit_wait
-	#endif
-}
-
-int cfa_openat(int dirfd, const char *pathname, int flags, mode_t mode, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_OPENAT)
-		return openat( dirfd, pathname, flags, mode );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_OPENAT, dirfd };
-		sqe->addr = (__u64)pathname;
-		sqe->open_flags = flags;
-		sqe->len = mode;
-
-		__submit_wait
-	#endif
-}
-
-int cfa_close(int fd, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_CLOSE)
-		return close( fd );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_CLOSE, fd };
-
-		__submit_wait
-	#endif
-}
-
-// Forward declare in case it is not supported
-struct statx;
-int cfa_statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_STATX)
-		#if defined(__NR_statx)
-			return syscall( __NR_statx, dirfd, pathname, flags, mask, statxbuf );
-		#else
-			errno = ENOTSUP;
-			return -1;
-		#endif
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_STATX, dirfd, pathname, mask, (__u64)statxbuf };
-		sqe->statx_flags = flags;
-
-		__submit_wait
-	#endif
-}
-
-ssize_t cfa_read(int fd, void *buf, size_t count, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_READ)
-		return read( fd, buf, count );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_READ, fd, buf, count, 0 };
-
-		__submit_wait
-	#endif
-}
-
-ssize_t cfa_write(int fd, void *buf, size_t count, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_WRITE)
-		return read( fd, buf, count );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_WRITE, fd, buf, count, 0 };
-
-		__submit_wait
-	#endif
-}
-
-ssize_t cfa_splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_SPLICE)
-		return splice( fd_in, off_in, fd_out, off_out, len, flags );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_SPLICE, fd_out };
-		if( off_out ) {
-			sqe->off = *off_out;
-		}
-		else {
-			sqe->off = (__u64)-1;
-		}
-		sqe->len = len;
-		sqe->splice_fd_in  = fd_in;
-		if( off_in ) {
-			sqe->splice_off_in = *off_in;
-		}
-		else {
-			sqe->splice_off_in = (__u64)-1;
-		}
-		sqe->splice_flags  = flags | (SPLICE_FLAGS & submit_flags);
-
-		__submit_wait
-	#endif
-}
-
-ssize_t cfa_tee(int fd_in, int fd_out, size_t len, unsigned int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_TEE)
-		return tee( fd_in, fd_out, len, flags );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_TEE, fd_out, 0p, len, 0 };
-		sqe->splice_fd_in = fd_in;
-		sqe->splice_flags  = flags | (SPLICE_FLAGS & submit_flags);
-
-		__submit_wait
-	#endif
-}
-
-//-----------------------------------------------------------------------------
-// Check if a function is asynchronous
-
-// Macro magic to reduce the size of the following switch case
-#define IS_DEFINED_APPLY(f, ...) f(__VA_ARGS__)
-#define IS_DEFINED_SECOND(first, second, ...) second
-#define IS_DEFINED_TEST(expansion) _CFA_IO_FEATURE_##expansion
-#define IS_DEFINED(macro) IS_DEFINED_APPLY( IS_DEFINED_SECOND,IS_DEFINED_TEST(macro) false, true)
-
-bool has_user_level_blocking( fptr_t func ) {
-	#if defined(CFA_HAVE_LINUX_IO_URING_H)
-		#if defined(HAVE_PREADV2)
-			if( /*func == (fptr_t)preadv2 || */
-				func == (fptr_t)cfa_preadv2 )
-				#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_READV ,
-				return IS_DEFINED(CFA_HAVE_IORING_OP_READV);
-		#endif
-
-		#if defined(HAVE_PWRITEV2)
-			if( /*func == (fptr_t)pwritev2 || */
-				func == (fptr_t)cfa_pwritev2 )
-				#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_WRITEV ,
-				return IS_DEFINED(CFA_HAVE_IORING_OP_WRITEV);
-		#endif
-
-		if( /*func == (fptr_t)fsync || */
-			func == (fptr_t)cfa_fsync )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_FSYNC ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_FSYNC);
-
-		if( /*func == (fptr_t)ync_file_range || */
-			func == (fptr_t)cfa_sync_file_range )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_SYNC_FILE_RANGE ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_SYNC_FILE_RANGE);
-
-		if( /*func == (fptr_t)sendmsg || */
-			func == (fptr_t)cfa_sendmsg )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_SENDMSG ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_SENDMSG);
-
-		if( /*func == (fptr_t)recvmsg || */
-			func == (fptr_t)cfa_recvmsg )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_RECVMSG ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_RECVMSG);
-
-		if( /*func == (fptr_t)send || */
-			func == (fptr_t)cfa_send )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_SEND ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_SEND);
-
-		if( /*func == (fptr_t)recv || */
-			func == (fptr_t)cfa_recv )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_RECV ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_RECV);
-
-		if( /*func == (fptr_t)accept4 || */
-			func == (fptr_t)cfa_accept4 )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_ACCEPT ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_ACCEPT);
-
-		if( /*func == (fptr_t)connect || */
-			func == (fptr_t)cfa_connect )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_CONNECT ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_CONNECT);
-
-		if( /*func == (fptr_t)fallocate || */
-			func == (fptr_t)cfa_fallocate )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_FALLOCATE ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_FALLOCATE);
-
-		if( /*func == (fptr_t)posix_fadvise || */
-			func == (fptr_t)cfa_fadvise )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_FADVISE ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_FADVISE);
-
-		if( /*func == (fptr_t)madvise || */
-			func == (fptr_t)cfa_madvise )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_MADVISE ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_MADVISE);
-
-		if( /*func == (fptr_t)openat || */
-			func == (fptr_t)cfa_openat )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_OPENAT ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_OPENAT);
-
-		if( /*func == (fptr_t)close || */
-			func == (fptr_t)cfa_close )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_CLOSE ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_CLOSE);
-
-		if( /*func == (fptr_t)read || */
-			func == (fptr_t)cfa_read )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_READ ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_READ);
-
-		if( /*func == (fptr_t)write || */
-			func == (fptr_t)cfa_write )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_WRITE ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_WRITE);
-
-		if( /*func == (fptr_t)splice || */
-			func == (fptr_t)cfa_splice )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_SPLICE ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_SPLICE);
-
-		if( /*func == (fptr_t)tee || */
-			func == (fptr_t)cfa_tee )
-			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_TEE ,
-			return IS_DEFINED(CFA_HAVE_IORING_OP_TEE);
-	#endif
-
-	return false;
-}
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ libcfa/src/concurrency/kernel.cfa	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -108,5 +108,5 @@
 static $thread * __next_thread_slow(cluster * this);
 static void __run_thread(processor * this, $thread * dst);
-static void __wake_one(struct __processor_id_t * id, cluster * cltr);
+static void __wake_one(cluster * cltr);
 
 static void push  (__cluster_idles & idles, processor & proc);
@@ -122,6 +122,6 @@
 	// Because of a bug, we couldn't initialized the seed on construction
 	// Do it here
-	kernelTLS.rand_seed ^= rdtscl();
-	kernelTLS.ready_rng.fwd_seed = 25214903917_l64u * (rdtscl() ^ (uintptr_t)&runner);
+	__cfaabi_tls.rand_seed ^= rdtscl();
+	__cfaabi_tls.ready_rng.fwd_seed = 25214903917_l64u * (rdtscl() ^ (uintptr_t)&runner);
 	__tls_rand_advance_bck();
 
@@ -217,5 +217,5 @@
 		// and it make sense for it to be set in all other cases except here
 		// fake it
-		kernelTLS.this_thread = mainThread;
+		__cfaabi_tls.this_thread = mainThread;
 	}
 
@@ -230,5 +230,5 @@
 // from the processor coroutine to the target thread
 static void __run_thread(processor * this, $thread * thrd_dst) {
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	/* paranoid */ verifyf( thrd_dst->state == Ready || thrd_dst->preempted != __NO_PREEMPTION, "state : %d, preempted %d\n", thrd_dst->state, thrd_dst->preempted);
 	/* paranoid */ verifyf( thrd_dst->link.next == 0p, "Expected null got %p", thrd_dst->link.next );
@@ -247,8 +247,8 @@
 
 		// Update global state
-		kernelTLS.this_thread = thrd_dst;
-
-		/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-		/* paranoid */ verify( kernelTLS.this_thread == thrd_dst );
+		kernelTLS().this_thread = thrd_dst;
+
+		/* paranoid */ verify( ! __preemption_enabled() );
+		/* paranoid */ verify( kernelTLS().this_thread == thrd_dst );
 		/* paranoid */ verify( thrd_dst->context.SP );
 		/* paranoid */ verify( thrd_dst->state != Halted );
@@ -267,9 +267,9 @@
 		/* paranoid */ verifyf( ((uintptr_t)thrd_dst->context.SP) < ((uintptr_t)__get_stack(thrd_dst->curr_cor)->base ), "ERROR : Destination $thread %p has been corrupted.\n StackPointer too small.\n", thrd_dst );
 		/* paranoid */ verify( thrd_dst->context.SP );
-		/* paranoid */ verify( kernelTLS.this_thread == thrd_dst );
-		/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+		/* paranoid */ verify( kernelTLS().this_thread == thrd_dst );
+		/* paranoid */ verify( ! __preemption_enabled() );
 
 		// Reset global state
-		kernelTLS.this_thread = 0p;
+		kernelTLS().this_thread = 0p;
 
 		// We just finished running a thread, there are a few things that could have happened.
@@ -282,5 +282,5 @@
 		if(unlikely(thrd_dst->preempted != __NO_PREEMPTION)) {
 			// The thread was preempted, reschedule it and reset the flag
-			__schedule_thread( (__processor_id_t*)this, thrd_dst );
+			__schedule_thread( thrd_dst );
 			break RUNNING;
 		}
@@ -315,15 +315,15 @@
 	proc_cor->state = Active;
 
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 }
 
 // KERNEL_ONLY
 void returnToKernel() {
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-	$coroutine * proc_cor = get_coroutine(kernelTLS.this_processor->runner);
-	$thread * thrd_src = kernelTLS.this_thread;
+	/* paranoid */ verify( ! __preemption_enabled() );
+	$coroutine * proc_cor = get_coroutine(kernelTLS().this_processor->runner);
+	$thread * thrd_src = kernelTLS().this_thread;
 
 	#if !defined(__CFA_NO_STATISTICS__)
-		struct processor * last_proc = kernelTLS.this_processor;
+		struct processor * last_proc = kernelTLS().this_processor;
 	#endif
 
@@ -345,10 +345,10 @@
 
 	#if !defined(__CFA_NO_STATISTICS__)
-		if(last_proc != kernelTLS.this_processor) {
+		if(last_proc != kernelTLS().this_processor) {
 			__tls_stats()->ready.threads.migration++;
 		}
 	#endif
 
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	/* paranoid */ verifyf( ((uintptr_t)thrd_src->context.SP) < ((uintptr_t)__get_stack(thrd_src->curr_cor)->base ), "ERROR : Returning $thread %p has been corrupted.\n StackPointer too small.\n", thrd_src );
 	/* paranoid */ verifyf( ((uintptr_t)thrd_src->context.SP) > ((uintptr_t)__get_stack(thrd_src->curr_cor)->limit), "ERROR : Returning $thread %p has been corrupted.\n StackPointer too large.\n", thrd_src );
@@ -358,8 +358,9 @@
 // Scheduler routines
 // KERNEL ONLY
-void __schedule_thread( struct __processor_id_t * id, $thread * thrd ) {
+void __schedule_thread( $thread * thrd ) {
+	/* paranoid */ verify( ! __preemption_enabled() );
 	/* paranoid */ verify( thrd );
 	/* paranoid */ verify( thrd->state != Halted );
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( kernelTLS().this_proc_id );
 	/* paranoid */ #if defined( __CFA_WITH_VERIFY__ )
 	/* paranoid */ 	if( thrd->state == Blocked || thrd->state == Start ) assertf( thrd->preempted == __NO_PREEMPTION,
@@ -374,21 +375,23 @@
 	if (thrd->preempted == __NO_PREEMPTION) thrd->state = Ready;
 
-	ready_schedule_lock  ( id );
+	ready_schedule_lock();
 		push( thrd->curr_cluster, thrd );
-		__wake_one(id, thrd->curr_cluster);
-	ready_schedule_unlock( id );
-
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+		__wake_one(thrd->curr_cluster);
+	ready_schedule_unlock();
+
+	/* paranoid */ verify( ! __preemption_enabled() );
 }
 
 // KERNEL ONLY
 static inline $thread * __next_thread(cluster * this) with( *this ) {
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-
-	ready_schedule_lock  ( (__processor_id_t*)kernelTLS.this_processor );
+	/* paranoid */ verify( ! __preemption_enabled() );
+	/* paranoid */ verify( kernelTLS().this_proc_id );
+
+	ready_schedule_lock();
 		$thread * thrd = pop( this );
-	ready_schedule_unlock( (__processor_id_t*)kernelTLS.this_processor );
-
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	ready_schedule_unlock();
+
+	/* paranoid */ verify( kernelTLS().this_proc_id );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	return thrd;
 }
@@ -396,16 +399,19 @@
 // KERNEL ONLY
 static inline $thread * __next_thread_slow(cluster * this) with( *this ) {
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-
-	ready_schedule_lock  ( (__processor_id_t*)kernelTLS.this_processor );
+	/* paranoid */ verify( ! __preemption_enabled() );
+	/* paranoid */ verify( kernelTLS().this_proc_id );
+
+	ready_schedule_lock();
 		$thread * thrd = pop_slow( this );
-	ready_schedule_unlock( (__processor_id_t*)kernelTLS.this_processor );
-
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	ready_schedule_unlock();
+
+	/* paranoid */ verify( kernelTLS().this_proc_id );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	return thrd;
 }
 
-// KERNEL ONLY unpark with out disabling interrupts
-void __unpark(  struct __processor_id_t * id, $thread * thrd ) {
+void unpark( $thread * thrd ) {
+	if( !thrd ) return;
+
 	int old_ticket = __atomic_fetch_add(&thrd->ticket, 1, __ATOMIC_SEQ_CST);
 	switch(old_ticket) {
@@ -417,6 +423,20 @@
 			/* paranoid */ verify( thrd->state == Blocked );
 
-			// Wake lost the race,
-			__schedule_thread( id, thrd );
+			{
+				/* paranoid */ verify( publicTLS_get(this_proc_id) );
+				bool full = publicTLS_get(this_proc_id)->full_proc;
+				if(full) disable_interrupts();
+
+				/* paranoid */ verify( ! __preemption_enabled() );
+
+				// Wake lost the race,
+				__schedule_thread( thrd );
+
+				/* paranoid */ verify( ! __preemption_enabled() );
+
+				if(full) enable_interrupts( __cfaabi_dbg_ctx );
+				/* paranoid */ verify( publicTLS_get(this_proc_id) );
+			}
+
 			break;
 		default:
@@ -426,23 +446,15 @@
 }
 
-void unpark( $thread * thrd ) {
-	if( !thrd ) return;
-
+void park( void ) {
+	/* paranoid */ verify( __preemption_enabled() );
 	disable_interrupts();
-	__unpark( (__processor_id_t*)kernelTLS.this_processor, thrd );
+	/* paranoid */ verify( ! __preemption_enabled() );
+	/* paranoid */ verify( kernelTLS().this_thread->preempted == __NO_PREEMPTION );
+
+	returnToKernel();
+
+	/* paranoid */ verify( ! __preemption_enabled() );
 	enable_interrupts( __cfaabi_dbg_ctx );
-}
-
-void park( void ) {
-	/* paranoid */ verify( kernelTLS.preemption_state.enabled );
-	disable_interrupts();
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-	/* paranoid */ verify( kernelTLS.this_thread->preempted == __NO_PREEMPTION );
-
-	returnToKernel();
-
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-	enable_interrupts( __cfaabi_dbg_ctx );
-	/* paranoid */ verify( kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( __preemption_enabled() );
 
 }
@@ -453,5 +465,5 @@
 	// Should never return
 	void __cfactx_thrd_leave() {
-		$thread * thrd = TL_GET( this_thread );
+		$thread * thrd = active_thread();
 		$monitor * this = &thrd->self_mon;
 
@@ -462,9 +474,9 @@
 
 		thrd->state = Halted;
-
+		if( TICKET_RUNNING != thrd->ticket ) { abort( "Thread terminated with pending unpark" ); }
 		if( thrd != this->owner || this->recursion != 1) { abort( "Thread internal monitor has unbalanced recursion" ); }
 
 		// Leave the thread
-		/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+		/* paranoid */ verify( ! __preemption_enabled() );
 		returnToKernel();
 		abort();
@@ -476,9 +488,9 @@
 // KERNEL ONLY
 bool force_yield( __Preemption_Reason reason ) {
-	/* paranoid */ verify( kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( __preemption_enabled() );
 	disable_interrupts();
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-
-	$thread * thrd = kernelTLS.this_thread;
+	/* paranoid */ verify( ! __preemption_enabled() );
+
+	$thread * thrd = kernelTLS().this_thread;
 	/* paranoid */ verify(thrd->state == Active);
 
@@ -494,7 +506,7 @@
 	}
 
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	enable_interrupts_noPoll();
-	/* paranoid */ verify( kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( __preemption_enabled() );
 
 	return preempted;
@@ -505,7 +517,7 @@
 //=============================================================================================
 // Wake a thread from the front if there are any
-static void __wake_one(struct __processor_id_t * id, cluster * this) {
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-	/* paranoid */ verify( ready_schedule_islocked( id ) );
+static void __wake_one(cluster * this) {
+	/* paranoid */ verify( ! __preemption_enabled() );
+	/* paranoid */ verify( ready_schedule_islocked() );
 
 	// Check if there is a sleeping processor
@@ -525,6 +537,6 @@
 	#endif
 
-	/* paranoid */ verify( ready_schedule_islocked( id ) );
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ready_schedule_islocked() );
+	/* paranoid */ verify( ! __preemption_enabled() );
 
 	return;
@@ -536,5 +548,5 @@
 
 	disable_interrupts();
-		/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+		/* paranoid */ verify( ! __preemption_enabled() );
 		post( this->idle );
 	enable_interrupts( __cfaabi_dbg_ctx );
@@ -542,5 +554,5 @@
 
 static void push  (__cluster_idles & this, processor & proc) {
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	lock( this );
 		this.idle++;
@@ -549,9 +561,9 @@
 		insert_first(this.list, proc);
 	unlock( this );
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 }
 
 static void remove(__cluster_idles & this, processor & proc) {
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	lock( this );
 		this.idle--;
@@ -560,5 +572,5 @@
 		remove(proc);
 	unlock( this );
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 }
 
@@ -604,5 +616,5 @@
 	}
 
-	return kernelTLS.this_thread;
+	return __cfaabi_tls.this_thread;
 }
 
@@ -629,5 +641,5 @@
 
 int kernel_abort_lastframe( void ) __attribute__ ((__nothrow__)) {
-	return get_coroutine(kernelTLS.this_thread) == get_coroutine(mainThread) ? 4 : 2;
+	return get_coroutine(kernelTLS().this_thread) == get_coroutine(mainThread) ? 4 : 2;
 }
 
@@ -661,5 +673,5 @@
 	if ( count < 0 ) {
 		// queue current task
-		append( waiting, kernelTLS.this_thread );
+		append( waiting, active_thread() );
 
 		// atomically release spin lock and block
@@ -711,5 +723,5 @@
 		void __cfaabi_dbg_record_lock(__spinlock_t & this, const char prev_name[]) {
 			this.prev_name = prev_name;
-			this.prev_thrd = kernelTLS.this_thread;
+			this.prev_thrd = kernelTLS().this_thread;
 		}
 	}
@@ -728,4 +740,8 @@
 		this.print_halts = true;
 	}
+
+	void print_stats_now( cluster & this, int flags ) {	// print this cluster's statistics right away
+		__print_stats( this.stats, flags, true, this.name, (void*)&this );	// honour the caller's flags, not the at-exit mask
+	}
 #endif
 // Local Variables: //
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ libcfa/src/concurrency/kernel.hfa	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -275,8 +275,10 @@
 static inline [cluster *&, cluster *& ] __get( cluster & this ) __attribute__((const)) { return this.node.[next, prev]; }
 
-static inline struct processor * active_processor() { return TL_GET( this_processor ); } // UNSAFE
-static inline struct cluster   * active_cluster  () { return TL_GET( this_processor )->cltr; }
+static inline struct processor * active_processor() { return publicTLS_get( this_processor ); } // UNSAFE
+static inline struct cluster   * active_cluster  () { return publicTLS_get( this_processor )->cltr; }
 
 #if !defined(__CFA_NO_STATISTICS__)
+	void print_stats_now( cluster & this, int flags );
+
 	static inline void print_stats_at_exit( cluster & this, int flags ) {
 		this.print_stats |= flags;
Index: libcfa/src/concurrency/kernel/fwd.hfa
===================================================================
--- libcfa/src/concurrency/kernel/fwd.hfa	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ libcfa/src/concurrency/kernel/fwd.hfa	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -35,7 +35,8 @@
 	extern "Cforall" {
 		extern __attribute__((aligned(128))) thread_local struct KernelThreadData {
-			struct $thread    * volatile this_thread;
-			struct processor  * volatile this_processor;
-			struct __stats_t  * volatile this_stats;
+			struct $thread          * volatile this_thread;
+			struct processor        * volatile this_processor;
+			struct __processor_id_t * volatile this_proc_id;
+			struct __stats_t        * volatile this_stats;
 
 			struct {
@@ -54,13 +55,24 @@
 				uint64_t bck_seed;
 			} ready_rng;
-		} kernelTLS __attribute__ ((tls_model ( "initial-exec" )));
+		} __cfaabi_tls __attribute__ ((tls_model ( "initial-exec" )));
 
+		extern bool __preemption_enabled();
 
+		static inline KernelThreadData & kernelTLS( void ) {
+			/* paranoid */ verify( ! __preemption_enabled() );
+			return __cfaabi_tls;
+		}
+
+		extern uintptr_t __cfatls_get( unsigned long int member );
+		// #define publicTLS_get( member ) ((typeof(__cfaabi_tls.member))__cfatls_get( __builtin_offsetof(KernelThreadData, member) ))
+		#define publicTLS_get( member ) (__cfaabi_tls.member)
+		// extern forall(otype T) T __cfatls_get( T * member, T value );
+		// #define publicTLS_set( member, value ) __cfatls_set( (typeof(member)*)__builtin_offsetof(KernelThreadData, member), value );
 
 		static inline uint64_t __tls_rand() {
 			#if defined(__SIZEOF_INT128__)
-				return __lehmer64( kernelTLS.rand_seed );
+				return __lehmer64( kernelTLS().rand_seed );
 			#else
-				return __xorshift64( kernelTLS.rand_seed );
+				return __xorshift64( kernelTLS().rand_seed );
 			#endif
 		}
@@ -74,11 +86,11 @@
 		static inline unsigned __tls_rand_fwd() {
 
-			kernelTLS.ready_rng.fwd_seed = (A * kernelTLS.ready_rng.fwd_seed + C) & (M - 1);
-			return kernelTLS.ready_rng.fwd_seed >> D;
+			kernelTLS().ready_rng.fwd_seed = (A * kernelTLS().ready_rng.fwd_seed + C) & (M - 1);
+			return kernelTLS().ready_rng.fwd_seed >> D;
 		}
 
 		static inline unsigned __tls_rand_bck() {
-			unsigned int r = kernelTLS.ready_rng.bck_seed >> D;
-			kernelTLS.ready_rng.bck_seed = AI * (kernelTLS.ready_rng.bck_seed - C) & (M - 1);
+			unsigned int r = kernelTLS().ready_rng.bck_seed >> D;
+			kernelTLS().ready_rng.bck_seed = AI * (kernelTLS().ready_rng.bck_seed - C) & (M - 1);
 			return r;
 		}
@@ -91,25 +103,9 @@
 
 		static inline void __tls_rand_advance_bck(void) {
-			kernelTLS.ready_rng.bck_seed = kernelTLS.ready_rng.fwd_seed;
+			kernelTLS().ready_rng.bck_seed = kernelTLS().ready_rng.fwd_seed;
 		}
 	}
 
-	#if 0 // def __ARM_ARCH
-		// function prototypes are only really used by these macros on ARM
-		void disable_global_interrupts();
-		void enable_global_interrupts();
 
-		#define TL_GET( member ) ( { __typeof__( kernelTLS.member ) target; \
-			disable_global_interrupts(); \
-			target = kernelTLS.member; \
-			enable_global_interrupts(); \
-			target; } )
-		#define TL_SET( member, value ) disable_global_interrupts(); \
-			kernelTLS.member = value; \
-			enable_global_interrupts();
-	#else
-		#define TL_GET( member ) kernelTLS.member
-		#define TL_SET( member, value ) kernelTLS.member = value;
-	#endif
 
 	extern void disable_interrupts();
@@ -120,5 +116,9 @@
 		extern void park( void );
 		extern void unpark( struct $thread * this );
-		static inline struct $thread * active_thread () { return TL_GET( this_thread ); }
+		static inline struct $thread * active_thread () {
+			struct $thread * t = publicTLS_get( this_thread );
+			/* paranoid */ verify( t );
+			return t;
+		}
 
 		extern bool force_yield( enum __Preemption_Reason );
@@ -139,7 +139,7 @@
 		#if !defined(__CFA_NO_STATISTICS__)
 			static inline struct __stats_t * __tls_stats() {
-				/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-				/* paranoid */ verify( kernelTLS.this_stats );
-				return kernelTLS.this_stats;
+				/* paranoid */ verify( ! __preemption_enabled() );
+				/* paranoid */ verify( kernelTLS().this_stats );
+				return kernelTLS().this_stats;
 			}
 
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -118,6 +118,7 @@
 //-----------------------------------------------------------------------------
 // Global state
-thread_local struct KernelThreadData kernelTLS __attribute__ ((tls_model ( "initial-exec" ))) @= {
+thread_local struct KernelThreadData __cfaabi_tls __attribute__ ((tls_model ( "initial-exec" ))) @= {
 	NULL,												// cannot use 0p
+	NULL,
 	NULL,
 	NULL,
@@ -155,5 +156,5 @@
 // Kernel boot procedures
 static void __kernel_startup(void) {
-	verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	__cfadbg_print_safe(runtime_core, "Kernel : Starting\n");
 
@@ -211,10 +212,11 @@
 
 	//initialize the global state variables
-	kernelTLS.this_processor = mainProcessor;
-	kernelTLS.this_thread    = mainThread;
+	__cfaabi_tls.this_processor = mainProcessor;
+	__cfaabi_tls.this_proc_id   = (__processor_id_t*)mainProcessor;
+	__cfaabi_tls.this_thread    = mainThread;
 
 	#if !defined( __CFA_NO_STATISTICS__ )
-		kernelTLS.this_stats = (__stats_t *)& storage_mainProcStats;
-		__init_stats( kernelTLS.this_stats );
+		__cfaabi_tls.this_stats = (__stats_t *)& storage_mainProcStats;
+		__init_stats( __cfaabi_tls.this_stats );
 	#endif
 
@@ -227,10 +229,10 @@
 	// Add the main thread to the ready queue
 	// once resume is called on mainProcessor->runner the mainThread needs to be scheduled like any normal thread
-	__schedule_thread((__processor_id_t *)mainProcessor, mainThread);
+	__schedule_thread(mainThread);
 
 	// SKULLDUGGERY: Force a context switch to the main processor to set the main thread's context to the current UNIX
 	// context. Hence, the main thread does not begin through __cfactx_invoke_thread, like all other threads. The trick here is that
 	// mainThread is on the ready queue when this call is made.
-	__kernel_first_resume( kernelTLS.this_processor );
+	__kernel_first_resume( __cfaabi_tls.this_processor );
 
 
@@ -249,7 +251,8 @@
 	__cfadbg_print_safe(runtime_core, "Kernel : Started\n--------------------------------------------------\n\n");
 
-	verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	enable_interrupts( __cfaabi_dbg_ctx );
-	verify( TL_GET( preemption_state.enabled ) );
+	/* paranoid */ verify( __preemption_enabled() );
+
 }
 
@@ -260,7 +263,7 @@
 	mainCluster->io.ctxs = 0p;
 
-	/* paranoid */ verify( TL_GET( preemption_state.enabled ) );
+	/* paranoid */ verify( __preemption_enabled() );
 	disable_interrupts();
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 
 	__cfadbg_print_safe(runtime_core, "\n--------------------------------------------------\nKernel : Shutting down\n");
@@ -270,5 +273,5 @@
 	// which is currently here
 	__atomic_store_n(&mainProcessor->do_terminate, true, __ATOMIC_RELEASE);
-	__kernel_last_resume( kernelTLS.this_processor );
+	__kernel_last_resume( __cfaabi_tls.this_processor );
 	mainThread->self_cor.state = Halted;
 
@@ -319,11 +322,12 @@
 		__stats_t local_stats;
 		__init_stats( &local_stats );
-		kernelTLS.this_stats = &local_stats;
+		__cfaabi_tls.this_stats = &local_stats;
 	#endif
 
 	processor * proc = (processor *) arg;
-	kernelTLS.this_processor = proc;
-	kernelTLS.this_thread    = 0p;
-	kernelTLS.preemption_state.[enabled, disable_count] = [false, 1];
+	__cfaabi_tls.this_processor = proc;
+	__cfaabi_tls.this_proc_id   = (__processor_id_t*)proc;
+	__cfaabi_tls.this_thread    = 0p;
+	__cfaabi_tls.preemption_state.[enabled, disable_count] = [false, 1];
 	// SKULLDUGGERY: We want to create a context for the processor coroutine
 	// which is needed for the 2-step context switch. However, there is no reason
@@ -337,5 +341,5 @@
 
 	//Set global state
-	kernelTLS.this_thread = 0p;
+	__cfaabi_tls.this_thread = 0p;
 
 	//We now have a proper context from which to schedule threads
@@ -367,11 +371,11 @@
 	$coroutine * dst = get_coroutine(this->runner);
 
-	verify( ! kernelTLS.preemption_state.enabled );
-
-	kernelTLS.this_thread->curr_cor = dst;
+	/* paranoid */ verify( ! __preemption_enabled() );
+
+	__cfaabi_tls.this_thread->curr_cor = dst;
 	__stack_prepare( &dst->stack, 65000 );
 	__cfactx_start(main, dst, this->runner, __cfactx_invoke_coroutine);
 
-	verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 
 	dst->last = &src->self_cor;
@@ -391,5 +395,5 @@
 	/* paranoid */ verify(src->state == Active);
 
-	verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 }
 
@@ -399,7 +403,7 @@
 	$coroutine * dst = get_coroutine(this->runner);
 
-	verify( ! kernelTLS.preemption_state.enabled );
-	verify( dst->starter == src );
-	verify( dst->context.SP );
+	/* paranoid */ verify( ! __preemption_enabled() );
+	/* paranoid */ verify( dst->starter == src );
+	/* paranoid */ verify( dst->context.SP );
 
 	// SKULLDUGGERY in debug the processors check that the
@@ -543,5 +547,5 @@
 
 		P( terminated );
-		verify( kernelTLS.this_processor != &this);
+		/* paranoid */ verify( active_processor() != &this);
 	}
 
@@ -693,5 +697,5 @@
 #if defined(__CFA_WITH_VERIFY__)
 static bool verify_fwd_bck_rng(void) {
-	kernelTLS.ready_rng.fwd_seed = 25214903917_l64u * (rdtscl() ^ (uintptr_t)&verify_fwd_bck_rng);
+	__cfaabi_tls.ready_rng.fwd_seed = 25214903917_l64u * (rdtscl() ^ (uintptr_t)&verify_fwd_bck_rng);
 
 	unsigned values[10];
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -33,9 +33,11 @@
 }
 
-void __schedule_thread( struct __processor_id_t *, $thread * )
+void __schedule_thread( $thread * )
 #if defined(NDEBUG) || (!defined(__CFA_DEBUG__) && !defined(__CFA_VERIFY__))
-	__attribute__((nonnull (2)))
+	__attribute__((nonnull (1)))
 #endif
 ;
+
+extern bool __preemption_enabled();
 
 //release/wake-up the following resources
@@ -63,28 +65,7 @@
 )
 
-// KERNEL ONLY unpark with out disabling interrupts
-void __unpark( struct __processor_id_t *, $thread * thrd );
-
 #define TICKET_BLOCKED (-1) // thread is blocked
 #define TICKET_RUNNING ( 0) // thread is running
 #define TICKET_UNBLOCK ( 1) // thread should ignore next block
-
-static inline bool __post(single_sem & this, struct __processor_id_t * id) {
-	for() {
-		struct $thread * expected = this.ptr;
-		if(expected == 1p) return false;
-		if(expected == 0p) {
-			if(__atomic_compare_exchange_n(&this.ptr, &expected, 1p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-				return false;
-			}
-		}
-		else {
-			if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-				__unpark( id, expected );
-				return true;
-			}
-		}
-	}
-}
 
 //-----------------------------------------------------------------------------
@@ -201,7 +182,10 @@
 // Reader side : acquire when using the ready queue to schedule but not
 //  creating/destroying queues
-static inline void ready_schedule_lock( struct __processor_id_t * proc) with(*__scheduler_lock) {
-	unsigned iproc = proc->id;
-	/*paranoid*/ verify(data[iproc].handle == proc);
+static inline void ready_schedule_lock(void) with(*__scheduler_lock) {
+	/* paranoid */ verify( ! __preemption_enabled() );
+	/* paranoid */ verify( kernelTLS().this_proc_id );
+
+	unsigned iproc = kernelTLS().this_proc_id->id;
+	/*paranoid*/ verify(data[iproc].handle == kernelTLS().this_proc_id);
 	/*paranoid*/ verify(iproc < ready);
 
@@ -225,7 +209,10 @@
 }
 
-static inline void ready_schedule_unlock( struct __processor_id_t * proc) with(*__scheduler_lock) {
-	unsigned iproc = proc->id;
-	/*paranoid*/ verify(data[iproc].handle == proc);
+static inline void ready_schedule_unlock(void) with(*__scheduler_lock) {
+	/* paranoid */ verify( ! __preemption_enabled() );
+	/* paranoid */ verify( kernelTLS().this_proc_id );
+
+	unsigned iproc = kernelTLS().this_proc_id->id;
+	/*paranoid*/ verify(data[iproc].handle == kernelTLS().this_proc_id);
 	/*paranoid*/ verify(iproc < ready);
 	/*paranoid*/ verify(data[iproc].lock);
@@ -239,5 +226,8 @@
 
 #ifdef __CFA_WITH_VERIFY__
-	static inline bool ready_schedule_islocked( struct __processor_id_t * proc) {
+	static inline bool ready_schedule_islocked(void) {
+		/* paranoid */ verify( ! __preemption_enabled() );
+		/*paranoid*/ verify( kernelTLS().this_proc_id );
+		__processor_id_t * proc = kernelTLS().this_proc_id;
 		return __scheduler_lock->data[proc->id].owned;
 	}
Index: libcfa/src/concurrency/locks.cfa
===================================================================
--- libcfa/src/concurrency/locks.cfa	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ libcfa/src/concurrency/locks.cfa	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -15,4 +15,5 @@
 		this.t = t;
 		this.lock = 0p;
+		this.listed = false;
 	}
 
@@ -21,4 +22,5 @@
 		this.info = info;
 		this.lock = 0p;
+		this.listed = false;
 	}
 
@@ -74,18 +76,19 @@
 
 void lock( blocking_lock & this ) with( this ) {
+	$thread * thrd = active_thread();
 	lock( lock __cfaabi_dbg_ctx2 );
-	if ( owner == kernelTLS.this_thread && !multi_acquisition) {
+	if ( owner == thrd && !multi_acquisition) {
 		fprintf(stderr, "A single acquisition lock holder attempted to reacquire the lock resulting in a deadlock."); // Possibly throw instead
-		exit(EXIT_FAILURE);
-	} else if ( owner != 0p && owner != kernelTLS.this_thread ) {
-		append( blocked_threads, kernelTLS.this_thread );
+    	exit(EXIT_FAILURE);
+	} else if ( owner != 0p && owner != thrd ) {
+		append( blocked_threads, thrd );
 		wait_count++;
 		unlock( lock );
-		park( __cfaabi_dbg_ctx );
-	} else if ( owner == kernelTLS.this_thread && multi_acquisition ) {
+		park( );
+	} else if ( owner == thrd && multi_acquisition ) {
 		recursion_count++;
 		unlock( lock );
 	} else {
-		owner = kernelTLS.this_thread;
+		owner = thrd;
 		recursion_count = 1;
 		unlock( lock );
@@ -94,11 +97,12 @@
 
 bool try_lock( blocking_lock & this ) with( this ) {
+	$thread * thrd = active_thread();
 	bool ret = false;
 	lock( lock __cfaabi_dbg_ctx2 );
 	if ( owner == 0p ) {
-		owner = kernelTLS.this_thread;
+		owner = thrd;
 		if ( multi_acquisition ) recursion_count = 1;
 		ret = true;
-	} else if ( owner == kernelTLS.this_thread && multi_acquisition ) {
+	} else if ( owner == thrd && multi_acquisition ) {
 		recursion_count++;
 		ret = true;
@@ -113,5 +117,5 @@
 		fprintf( stderr, "There was an attempt to release a lock that isn't held" );
 		return;
-	} else if ( strict_owner && owner != kernelTLS.this_thread ) {
+	} else if ( strict_owner && owner != active_thread() ) {	// only a non-owner release is an error
 		fprintf( stderr, "A thread other than the owner attempted to release an owner lock" );
 		return;
@@ -123,5 +127,5 @@
 		recursion_count = ( thrd && multi_acquisition ? 1 : 0 );
 		wait_count--;
-		unpark( thrd __cfaabi_dbg_ctx2 );
+		unpark( thrd );
 	}
 	unlock( lock );
@@ -150,5 +154,8 @@
 		owner = t;
 		if ( multi_acquisition ) recursion_count = 1;
-		unpark( t __cfaabi_dbg_ctx2 );
+		#if !defined( __CFA_NO_STATISTICS__ )
+			__cfaabi_tls.this_stats = t->curr_cluster->stats;	// kernelTLS is now an accessor function; write the TLS block directly
+		#endif
+		unpark( t );
 		unlock( lock );
 	}
@@ -159,5 +166,5 @@
 	if ( owner == 0p ){ // no owner implies lock isn't held
 		fprintf( stderr, "A lock that is not held was passed to a synchronization lock" );
-	} else if ( strict_owner && owner != kernelTLS.this_thread ) {
+	} else if ( strict_owner && owner != active_thread() ) {	// only a non-owner hand-off is an error
 		fprintf( stderr, "A thread other than the owner of a lock passed it to a synchronization lock" );
 	} else {
@@ -166,5 +173,5 @@
 		recursion_count = ( thrd && multi_acquisition ? 1 : 0 );
 		wait_count--;
-		unpark( thrd __cfaabi_dbg_ctx2 );
+		unpark( thrd );
 	}
 	unlock( lock );
@@ -175,7 +182,5 @@
 ///////////////////////////////////////////////////////////////////
 
-// In an ideal world this may not be necessary
-// Is it possible for nominal inheritance to inherit traits??
-// If that occurs we would avoid all this extra code
+// This is temporary until an inheritance bug is fixed
 
 void lock( mutex_lock & this ){
@@ -228,21 +233,38 @@
 
 ///////////////////////////////////////////////////////////////////
-//// Synchronization Locks
+//// condition variable
 ///////////////////////////////////////////////////////////////////
 
 forall(dtype L | is_blocking_lock(L)) {
-	void ?{}( synchronization_lock(L) & this, bool reacquire_after_signal ){
+
+	void timeout_handler ( alarm_node_wrap(L) & this ) with( this ) {
+		// Alarm callback: takes a timed-out waiter off the condition's queue. Called from the kernel, so it cannot block, but it can spin.
+		lock( cond->lock __cfaabi_dbg_ctx2 );
+		if ( (*i)->listed ) {			// is thread still on the queue (i.e. not already signalled)
+			info_thread(L) * copy = *i;
+			remove( cond->blocked_threads, i );		// remove this thread O(1)
+			cond->count--;					// same counter that counter() reports and notify_one/notify_all decrement
+			if( !copy->lock ) {
+				// cond->lock stays held across unpark (as in notify_one); it is released exactly once, at the end
+				#if !defined( __CFA_NO_STATISTICS__ )
+					#warning unprotected access to tls TODO discuss this
+					__cfaabi_tls.this_stats = copy->t->curr_cluster->stats;
+				#endif
+				unpark( copy->t );
+			} else {
+				add_(*copy->lock, copy->t);			// call lock's add_
+			}
+		}
+		unlock( cond->lock );
+	}
+
+	void alarm_node_wrap_cast( alarm_node_t & a ) {
+		timeout_handler( (alarm_node_wrap(L) &)a );
+	}
+
+	void ?{}( condition_variable(L) & this ){
 		this.lock{};
 		this.blocked_threads{};
 		this.count = 0;
-		this.reacquire_after_signal = reacquire_after_signal;
-	}
-
-	void ^?{}( synchronization_lock(L) & this ){
-		// default
-	}
-
-	void ?{}( condition_variable(L) & this ){
-		((synchronization_lock(L) &)this){ true };
 	}
 
@@ -251,23 +273,23 @@
 	}
 
-	void ?{}( thread_queue(L) & this ){
-		((synchronization_lock(L) &)this){ false };
-	}
-
-	void ^?{}( thread_queue(L) & this ){
+	void ?{}( alarm_node_wrap(L) & this, $thread * thrd, Time alarm, Duration period, Alarm_Callback callback ) {
+		this.alarm_node{ thrd, alarm, period, callback };
+	}
+
+	void ^?{}( alarm_node_wrap(L) & this ) {
 		// default
 	}
 
-	bool notify_one( synchronization_lock(L) & this ) with( this ) {
+	bool notify_one( condition_variable(L) & this ) with( this ) {
 		lock( lock __cfaabi_dbg_ctx2 );
 		bool ret = !!blocked_threads;
 		info_thread(L) * popped = pop_head( blocked_threads );
+		popped->listed = false;
 		if(popped != 0p) {
-			if( reacquire_after_signal ){
+			count--;
+			if (popped->lock) {
 				add_(*popped->lock, popped->t);
 			} else {
-				unpark(
-					popped->t __cfaabi_dbg_ctx2
-				);
+				unpark(popped->t);
 			}
 		}
@@ -276,16 +298,16 @@
 	}
 
-	bool notify_all( synchronization_lock(L) & this ) with(this) {
+	bool notify_all( condition_variable(L) & this ) with(this) {
 		lock( lock __cfaabi_dbg_ctx2 );
 		bool ret = blocked_threads ? true : false;
 		while( blocked_threads ) {
 			info_thread(L) * popped = pop_head( blocked_threads );
+			popped->listed = false;
 			if(popped != 0p){
-				if( reacquire_after_signal ){
+				count--;
+				if (popped->lock) {
 					add_(*popped->lock, popped->t);
 				} else {
-					unpark(
-						popped->t __cfaabi_dbg_ctx2
-					);
+					unpark(popped->t);
 				}
 			}
@@ -295,134 +317,127 @@
 	}
 
-	uintptr_t front( synchronization_lock(L) & this ) with(this) {
-		return (*peek(blocked_threads)).info;
-	}
-
-	bool empty( synchronization_lock(L) & this ) with(this) {
+	uintptr_t front( condition_variable(L) & this ) with(this) {
+		if(!blocked_threads) return NULL;
+		return peek(blocked_threads)->info;
+	}
+
+	bool empty( condition_variable(L) & this ) with(this) {
 		return blocked_threads ? false : true;
 	}
 
-	int counter( synchronization_lock(L) & this ) with(this) {
+	int counter( condition_variable(L) & this ) with(this) {
 		return count;
 	}
 
-	void queue_info_thread( synchronization_lock(L) & this, info_thread(L) & i ) with(this) {
-		lock( lock __cfaabi_dbg_ctx2 );
-		append( blocked_threads, &i );
-		count++;
-		unlock( lock );
-		park( __cfaabi_dbg_ctx );
-	}
-
-
-	void wait( synchronization_lock(L) & this ) with(this) {
-		info_thread( L ) i = { kernelTLS.this_thread };
-		queue_info_thread( this, i );
-	}
-
-	void wait( synchronization_lock(L) & this, uintptr_t info ) with(this) {
-		info_thread( L ) i = { kernelTLS.this_thread, info };
-		queue_info_thread( this, i );
-	}
-	// I still need to implement the time delay wait routines
-	bool wait( synchronization_lock(L) & this, Duration duration ) with(this) {
-		timeval tv = { time(0) };
-		Time t = { tv };
-		return wait( this, t + duration );
-	}
-
-	bool wait( synchronization_lock(L) & this, uintptr_t info, Duration duration ) with(this) {
-		// TODO: ADD INFO
-		return wait( this, duration );
-	}
-
-	bool wait( synchronization_lock(L) & this, Time time ) with(this) {
-		return false; //default
-	}
-
-	bool wait( synchronization_lock(L) & this, uintptr_t info, Time time ) with(this) {
-		// TODO: ADD INFO
-		return wait( this, time );
-	}
-
-	void queue_info_thread_unlock( synchronization_lock(L) & this, L & l, info_thread(L) & i ) with(this) {
+	// helper for wait()s without a timeout
+	void queue_info_thread( condition_variable(L) & this, info_thread(L) & i ) with(this) {
 		lock( lock __cfaabi_dbg_ctx2 );
 		append( this.blocked_threads, &i );
 		count++;
-		i.lock = &l;
-		size_t recursion_count = get_recursion_count(l);
-		remove_( l );
-		unlock( lock );
-		park( __cfaabi_dbg_ctx ); // blocks here
-
-		set_recursion_count(l, recursion_count); // resets recursion count here after waking
-	}
-
-	void wait( synchronization_lock(L) & this, L & l ) with(this) {
-		info_thread(L) i = { kernelTLS.this_thread };
-		queue_info_thread_unlock( this, l, i );
-	}
-
-	void wait( synchronization_lock(L) & this, L & l, uintptr_t info ) with(this) {
-		info_thread(L) i = { kernelTLS.this_thread, info };
-		queue_info_thread_unlock( this, l, i );
-	}
-
-	bool wait( synchronization_lock(L) & this, L & l, Duration duration ) with(this) {
-		timeval tv = { time(0) };
-		Time t = { tv };
-		return wait( this, l, t + duration );
-	}
-
-	bool wait( synchronization_lock(L) & this, L & l, uintptr_t info, Duration duration ) with(this) {
-		// TODO: ADD INFO
-		return wait( this, l, duration );
-	}
-
-	bool wait( synchronization_lock(L) & this, L & l, Time time ) with(this) {
-		return false; //default
-	}
-
-	bool wait( synchronization_lock(L) & this, L & l, uintptr_t info, Time time ) with(this) {
-		// TODO: ADD INFO
-		return wait( this, l, time );
-	}
-}
-
-///////////////////////////////////////////////////////////////////
-//// condition lock alternative approach
-///////////////////////////////////////////////////////////////////
-
-// the solution below is less efficient but does not require the lock to have a specific add/remove routine
-
-///////////////////////////////////////////////////////////////////
-//// is_simple_lock
-///////////////////////////////////////////////////////////////////
-
-forall(dtype L | is_simple_lock(L)) {
-	void ?{}( condition_lock(L) & this ){
-		// default
-	}
-
-	void ^?{}( condition_lock(L) & this ){
-		// default
-	}
-
-	bool notify_one( condition_lock(L) & this ) with(this) {
-		return notify_one( c_var );
-	}
-
-	bool notify_all( condition_lock(L) & this ) with(this) {
-		return notify_all( c_var );
-	}
-
-	void wait( condition_lock(L) & this, L & l ) with(this) {
-		lock( m_lock );
-		size_t recursion = get_recursion_count( l );
-		unlock( l );
-		wait( c_var, m_lock );
-		lock( l );
-		set_recursion_count( l , recursion );
-		unlock( m_lock );
-	}
-}
+		i.listed = true;
+		size_t recursion_count;
+		if (i.lock) {
+			recursion_count = get_recursion_count(*i.lock);
+			remove_( *i.lock );
+		}
+
+		unlock( lock );
+		park( ); // blocks here
+
+		if (i.lock) set_recursion_count(*i.lock, recursion_count); // resets recursion count here after waking
+	}
+
+	// helper for wait()s with a timeout
+	void queue_info_thread_timeout( condition_variable(L) & this, info_thread(L) & info, Time t ) with(this) {
+		lock( lock __cfaabi_dbg_ctx2 );
+
+		info_thread(L) * queue_ptr = &info;
+
+		alarm_node_wrap(L) node_wrap = { info.t, t, 0`s, alarm_node_wrap_cast };
+		node_wrap.cond = &this;
+		node_wrap.i = &queue_ptr;
+
+		register_self( &node_wrap.alarm_node );
+
+		append( blocked_threads, queue_ptr );
+		info.listed = true;
+		count++;
+
+		size_t recursion_count;
+		if (info.lock) {
+			recursion_count = get_recursion_count(*info.lock);
+			remove_( *info.lock );
+		}
+
+		unlock( lock );
+		park();
+
+		if (info.lock) set_recursion_count(*info.lock, recursion_count);
+	}
+
+	void wait( condition_variable(L) & this ) with(this) {
+		info_thread( L ) i = { active_thread() };
+		queue_info_thread( this, i );
+	}
+
+	void wait( condition_variable(L) & this, uintptr_t info ) with(this) {
+		info_thread( L ) i = { active_thread(), info };
+		queue_info_thread( this, i );
+	}
+
+	void wait( condition_variable(L) & this, Duration duration ) with(this) {
+		info_thread( L ) i = { active_thread() };
+		queue_info_thread_timeout(this, i, __kernel_get_time() + duration );
+	}
+
+	void wait( condition_variable(L) & this, uintptr_t info, Duration duration ) with(this) {
+		info_thread( L ) i = { active_thread(), info };
+		queue_info_thread_timeout(this, i, __kernel_get_time() + duration );
+	}
+
+	void wait( condition_variable(L) & this, Time time ) with(this) {
+		info_thread( L ) i = { active_thread() };
+		queue_info_thread_timeout(this, i, time);
+	}
+
+	void wait( condition_variable(L) & this, uintptr_t info, Time time ) with(this) {
+		info_thread( L ) i = { active_thread(), info };
+		queue_info_thread_timeout(this, i, time);
+	}
+
+	void wait( condition_variable(L) & this, L & l ) with(this) {
+		info_thread(L) i = { active_thread() };
+		i.lock = &l;
+		queue_info_thread( this, i );
+	}
+
+	void wait( condition_variable(L) & this, L & l, uintptr_t info ) with(this) {
+		info_thread(L) i = { active_thread(), info };
+		i.lock = &l;
+		queue_info_thread( this, i );
+	}
+
+	void wait( condition_variable(L) & this, L & l, Duration duration ) with(this) {
+		info_thread(L) i = { active_thread() };
+		i.lock = &l;
+		queue_info_thread_timeout(this, i, __kernel_get_time() + duration );
+	}
+
+	void wait( condition_variable(L) & this, L & l, uintptr_t info, Duration duration ) with(this) {
+		info_thread(L) i = { active_thread(), info };
+		i.lock = &l;
+		queue_info_thread_timeout(this, i, __kernel_get_time() + duration );
+	}
+
+	void wait( condition_variable(L) & this, L & l, Time time ) with(this) {
+		info_thread(L) i = { active_thread() };
+		i.lock = &l;
+		queue_info_thread_timeout(this, i, time );
+	}
+
+	void wait( condition_variable(L) & this, L & l, uintptr_t info, Time time ) with(this) {
+		info_thread(L) i = { active_thread(), info };
+		i.lock = &l;
+		queue_info_thread_timeout(this, i, time );
+	}
+}
Index: libcfa/src/concurrency/locks.hfa
===================================================================
--- libcfa/src/concurrency/locks.hfa	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ libcfa/src/concurrency/locks.hfa	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -1,2 +1,4 @@
+#pragma once
+
 #include <stdbool.h>
 
@@ -10,4 +12,5 @@
 #include "time.hfa"
 #include <sys/time.h>
+#include "alarm.hfa"
 
 ///////////////////////////////////////////////////////////////////
@@ -32,4 +35,5 @@
 		info_thread(L) * next;
 		L * lock;
+		bool listed;					// true if info_thread is on queue, false otherwise;
 	};
 
@@ -119,5 +123,5 @@
 ///////////////////////////////////////////////////////////////////
 forall(dtype L | is_blocking_lock(L)) {
-	struct synchronization_lock {
+	struct condition_variable {
 		// Spin lock used for mutual exclusion
 		__spinlock_t lock;
@@ -128,84 +132,45 @@
 		// Count of current blocked threads
 		int count;
-
-		// If true threads will reacquire the lock they block on upon waking
-		bool reacquire_after_signal;
 	};
-
-	struct condition_variable {
-		inline synchronization_lock(L);
-	};
-
-	struct thread_queue {
-		inline synchronization_lock(L);
-	};
-
-
-	void ?{}( synchronization_lock(L) & this, bool multi_acquisition, bool strict_owner );
-	void ^?{}( synchronization_lock(L) & this );
 
 	void ?{}( condition_variable(L) & this );
 	void ^?{}( condition_variable(L) & this );
 
-	void ?{}( thread_queue(L) & this );
-	void ^?{}( thread_queue(L) & this );
+	struct alarm_node_wrap {
+		alarm_node_t alarm_node;
 
-	bool notify_one( synchronization_lock(L) & this );
-	bool notify_all( synchronization_lock(L) & this );
+		condition_variable(L) * cond;
 
-	uintptr_t front( synchronization_lock(L) & this );
-
-	bool empty( synchronization_lock(L) & this );
-	int counter( synchronization_lock(L) & this );
-
-	// wait functions that are not passed a mutex lock
-	void wait( synchronization_lock(L) & this );
-	void wait( synchronization_lock(L) & this, uintptr_t info );
-	bool wait( synchronization_lock(L) & this, Duration duration );
-	bool wait( synchronization_lock(L) & this, uintptr_t info, Duration duration );
-	bool wait( synchronization_lock(L) & this, Time time );
-	bool wait( synchronization_lock(L) & this, uintptr_t info, Time time );
-
-	// wait functions that are passed a lock
-	bool notify_one( synchronization_lock(L) & this, L & l );
-	bool notify_all( synchronization_lock(L) & this, L & l );
-
-	void wait( synchronization_lock(L) & this, L & l );
-	void wait( synchronization_lock(L) & this, L & l, uintptr_t info );
-	bool wait( synchronization_lock(L) & this, L & l, Duration duration );
-	bool wait( synchronization_lock(L) & this, L & l, uintptr_t info, Duration duration );
-	bool wait( synchronization_lock(L) & this, L & l, Time time );
-	bool wait( synchronization_lock(L) & this, L & l, uintptr_t info, Time time );
-}
-
-///////////////////////////////////////////////////////////////////
-//// condition lock alternative approach
-///////////////////////////////////////////////////////////////////
-
-
-///////////////////////////////////////////////////////////////////
-//// is_simple_lock
-///////////////////////////////////////////////////////////////////
-
-trait is_simple_lock(dtype L | sized(L)) {
-	void lock( L & );		// For synchronization locks to use when acquiring
-	void unlock( L & );    // For synchronization locks to use when releasing
-	size_t get_recursion_count( L & ); // to get recursion count for cond lock to reset after waking
-	void set_recursion_count( L &, size_t recursion ); // to set recursion count after getting signalled;
-};
-
-forall(dtype L | is_simple_lock(L)) {
-	struct condition_lock {
-		// Spin lock used for mutual exclusion
-		mutex_lock m_lock;
-
-		condition_variable( mutex_lock ) c_var;
+		info_thread(L) ** i;
 	};
 
-	void ?{}( condition_lock(L) & this );
-	void ^?{}( condition_lock(L) & this );
+	void ?{}( alarm_node_wrap(L) & this, $thread * thrd, Time alarm, Duration period, Alarm_Callback callback );
+	void ^?{}( alarm_node_wrap(L) & this );
 
-	bool notify_one( condition_lock(L) & this );
-	bool notify_all( condition_lock(L) & this );
-	void wait( condition_lock(L) & this, L & l );
+	void alarm_node_callback( alarm_node_wrap(L) & this );
+
+	void alarm_node_wrap_cast( alarm_node_t & a );
+
+	bool notify_one( condition_variable(L) & this );
+	bool notify_all( condition_variable(L) & this );
+
+	uintptr_t front( condition_variable(L) & this );
+
+	bool empty( condition_variable(L) & this );
+	int counter( condition_variable(L) & this );
+
+	// TODO: look into changing timeout routines to return bool showing if signalled or woken by kernel
+	void wait( condition_variable(L) & this );
+	void wait( condition_variable(L) & this, uintptr_t info );
+	void wait( condition_variable(L) & this, Duration duration );
+	void wait( condition_variable(L) & this, uintptr_t info, Duration duration );
+	void wait( condition_variable(L) & this, Time time );
+	void wait( condition_variable(L) & this, uintptr_t info, Time time );
+
+	void wait( condition_variable(L) & this, L & l );
+	void wait( condition_variable(L) & this, L & l, uintptr_t info );
+	void wait( condition_variable(L) & this, L & l, Duration duration );
+	void wait( condition_variable(L) & this, L & l, uintptr_t info, Duration duration );
+	void wait( condition_variable(L) & this, L & l, Time time );
+	void wait( condition_variable(L) & this, L & l, uintptr_t info, Time time );
 }
Index: libcfa/src/concurrency/monitor.cfa
===================================================================
--- libcfa/src/concurrency/monitor.cfa	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ libcfa/src/concurrency/monitor.cfa	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -82,8 +82,8 @@
 // Enter single monitor
 static void __enter( $monitor * this, const __monitor_group_t & group ) {
+	$thread * thrd = active_thread();
+
 	// Lock the monitor spinlock
 	lock( this->lock __cfaabi_dbg_ctx2 );
-	// Interrupts disable inside critical section
-	$thread * thrd = kernelTLS.this_thread;
 
 	__cfaabi_dbg_print_safe( "Kernel : %10p Entering mon %p (%p)\n", thrd, this, this->owner);
@@ -126,5 +126,5 @@
 		__cfaabi_dbg_print_safe( "Kernel : %10p Entered  mon %p\n", thrd, this);
 
-		/* paranoid */ verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+		/* paranoid */ verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
 		return;
 	}
@@ -132,5 +132,5 @@
 	__cfaabi_dbg_print_safe( "Kernel : %10p Entered  mon %p\n", thrd, this);
 
-	/* paranoid */ verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+	/* paranoid */ verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
 	/* paranoid */ verify( this->lock.lock );
 
@@ -141,8 +141,8 @@
 
 static void __dtor_enter( $monitor * this, fptr_t func, bool join ) {
+	$thread * thrd = active_thread();
+
 	// Lock the monitor spinlock
 	lock( this->lock __cfaabi_dbg_ctx2 );
-	// Interrupts disable inside critical section
-	$thread * thrd = kernelTLS.this_thread;
 
 	__cfaabi_dbg_print_safe( "Kernel : %10p Entering dtor for mon %p (%p)\n", thrd, this, this->owner);
@@ -155,5 +155,5 @@
 		__set_owner( this, thrd );
 
-		verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+		verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
 
 		unlock( this->lock );
@@ -174,5 +174,5 @@
 		this->owner = thrd;
 
-		verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+		verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
 
 		unlock( this->lock );
@@ -200,5 +200,5 @@
 
 		// Release the next thread
-		/* paranoid */ verifyf( urgent->owner->waiting_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+		/* paranoid */ verifyf( urgent->owner->waiting_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
 		unpark( urgent->owner->waiting_thread );
 
@@ -207,5 +207,5 @@
 
 		// Some one was waiting for us, enter
-		/* paranoid */ verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+		/* paranoid */ verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
 	}
 	else {
@@ -224,5 +224,5 @@
 		park();
 
-		/* paranoid */ verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+		/* paranoid */ verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
 		return;
 	}
@@ -237,7 +237,7 @@
 	lock( this->lock __cfaabi_dbg_ctx2 );
 
-	__cfaabi_dbg_print_safe( "Kernel : %10p Leaving mon %p (%p)\n", kernelTLS.this_thread, this, this->owner);
-
-	/* paranoid */ verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+	__cfaabi_dbg_print_safe( "Kernel : %10p Leaving mon %p (%p)\n", active_thread(), this, this->owner);
+
+	/* paranoid */ verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
 
 	// Leaving a recursion level, decrement the counter
@@ -270,6 +270,6 @@
 void __dtor_leave( $monitor * this, bool join ) {
 	__cfaabi_dbg_debug_do(
-		if( TL_GET( this_thread ) != this->owner ) {
-			abort( "Destroyed monitor %p has inconsistent owner, expected %p got %p.\n", this, TL_GET( this_thread ), this->owner);
+		if( active_thread() != this->owner ) {
+			abort( "Destroyed monitor %p has inconsistent owner, expected %p got %p.\n", this, active_thread(), this->owner);
 		}
 		if( this->recursion != 1  && !join ) {
@@ -287,5 +287,5 @@
 	/* paranoid */ verify( this->lock.lock );
 	/* paranoid */ verifyf( thrd == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", thrd, this->owner, this->recursion, this );
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	/* paranoid */ verify( thrd->state == Halted );
 	/* paranoid */ verify( this->recursion == 1 );
@@ -303,5 +303,5 @@
 	// Unpark the next owner if needed
 	/* paranoid */ verifyf( !new_owner || new_owner == this->owner, "Expected owner to be %p, got %p (m: %p)", new_owner, this->owner, this );
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	/* paranoid */ verify( thrd->state == Halted );
 	unpark( new_owner );
@@ -327,5 +327,5 @@
 // Sorts monitors before entering
 void ?{}( monitor_guard_t & this, $monitor * m [], __lock_size_t count, fptr_t func ) {
-	$thread * thrd = TL_GET( this_thread );
+	$thread * thrd = active_thread();
 
 	// Store current array
@@ -362,5 +362,5 @@
 
 	// Restore thread context
-	TL_GET( this_thread )->monitors = this.prev;
+	active_thread()->monitors = this.prev;
 }
 
@@ -369,5 +369,5 @@
 void ?{}( monitor_dtor_guard_t & this, $monitor * m [], fptr_t func, bool join ) {
 	// optimization
-	$thread * thrd = TL_GET( this_thread );
+	$thread * thrd = active_thread();
 
 	// Store current array
@@ -392,5 +392,5 @@
 
 	// Restore thread context
-	TL_GET( this_thread )->monitors = this.prev;
+	active_thread()->monitors = this.prev;
 }
 
@@ -432,5 +432,5 @@
 
 	// Create the node specific to this wait operation
-	wait_ctx( TL_GET( this_thread ), user_info );
+	wait_ctx( active_thread(), user_info );
 
 	// Append the current wait operation to the ones already queued on the condition
@@ -483,5 +483,5 @@
 	//Some more checking in debug
 	__cfaabi_dbg_debug_do(
-		$thread * this_thrd = TL_GET( this_thread );
+		$thread * this_thrd = active_thread();
 		if ( this.monitor_count != this_thrd->monitors.size ) {
 			abort( "Signal on condition %p made with different number of monitor(s), expected %zi got %zi", &this, this.monitor_count, this_thrd->monitors.size );
@@ -531,5 +531,5 @@
 
 	// Create the node specific to this wait operation
-	wait_ctx_primed( kernelTLS.this_thread, 0 )
+	wait_ctx_primed( active_thread(), 0 )
 
 	//save contexts
@@ -630,5 +630,5 @@
 
 				// Create the node specific to this wait operation
-				wait_ctx_primed( kernelTLS.this_thread, 0 );
+				wait_ctx_primed( active_thread(), 0 );
 
 				// Save monitor states
@@ -682,5 +682,5 @@
 
 	// Create the node specific to this wait operation
-	wait_ctx_primed( kernelTLS.this_thread, 0 );
+	wait_ctx_primed( active_thread(), 0 );
 
 	monitor_save;
@@ -688,5 +688,5 @@
 
 	for( __lock_size_t i = 0; i < count; i++) {
-		verify( monitors[i]->owner == kernelTLS.this_thread );
+		verify( monitors[i]->owner == active_thread() );
 	}
 
@@ -724,10 +724,10 @@
 static inline void __set_owner( $monitor * monitors [], __lock_size_t count, $thread * owner ) {
 	/* paranoid */ verify ( monitors[0]->lock.lock );
-	/* paranoid */ verifyf( monitors[0]->owner == kernelTLS.this_thread, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, monitors[0]->owner, monitors[0]->recursion, monitors[0] );
+	/* paranoid */ verifyf( monitors[0]->owner == active_thread(), "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), monitors[0]->owner, monitors[0]->recursion, monitors[0] );
 	monitors[0]->owner        = owner;
 	monitors[0]->recursion    = 1;
 	for( __lock_size_t i = 1; i < count; i++ ) {
 		/* paranoid */ verify ( monitors[i]->lock.lock );
-		/* paranoid */ verifyf( monitors[i]->owner == kernelTLS.this_thread, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, monitors[i]->owner, monitors[i]->recursion, monitors[i] );
+		/* paranoid */ verifyf( monitors[i]->owner == active_thread(), "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), monitors[i]->owner, monitors[i]->recursion, monitors[i] );
 		monitors[i]->owner        = owner;
 		monitors[i]->recursion    = 0;
@@ -755,5 +755,5 @@
 		//regardless of if we are ready to baton pass,
 		//we need to set the monitor as in use
-		/* paranoid */ verifyf( !this->owner || kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+		/* paranoid */ verifyf( !this->owner || active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
 		__set_owner( this,  urgent->owner->waiting_thread );
 
@@ -764,5 +764,5 @@
 	// Get the next thread in the entry_queue
 	$thread * new_owner = pop_head( this->entry_queue );
-	/* paranoid */ verifyf( !this->owner || kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+	/* paranoid */ verifyf( !this->owner || active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
 	/* paranoid */ verify( !new_owner || new_owner->link.next == 0p );
 	__set_owner( this, new_owner );
@@ -892,5 +892,5 @@
 
 static inline void brand_condition( condition & this ) {
-	$thread * thrd = TL_GET( this_thread );
+	$thread * thrd = active_thread();
 	if( !this.monitors ) {
 		// __cfaabi_dbg_print_safe( "Branding\n" );
Index: libcfa/src/concurrency/mutex.cfa
===================================================================
--- libcfa/src/concurrency/mutex.cfa	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ libcfa/src/concurrency/mutex.cfa	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -40,5 +40,5 @@
 	lock( lock __cfaabi_dbg_ctx2 );
 	if( is_locked ) {
-		append( blocked_threads, kernelTLS.this_thread );
+		append( blocked_threads, active_thread() );
 		unlock( lock );
 		park();
@@ -86,14 +86,14 @@
 	lock( lock __cfaabi_dbg_ctx2 );
 	if( owner == 0p ) {
-		owner = kernelTLS.this_thread;
+		owner = active_thread();
 		recursion_count = 1;
 		unlock( lock );
 	}
-	else if( owner == kernelTLS.this_thread ) {
+	else if( owner == active_thread() ) {
 		recursion_count++;
 		unlock( lock );
 	}
 	else {
-		append( blocked_threads, kernelTLS.this_thread );
+		append( blocked_threads, active_thread() );
 		unlock( lock );
 		park();
@@ -105,9 +105,9 @@
 	lock( lock __cfaabi_dbg_ctx2 );
 	if( owner == 0p ) {
-		owner = kernelTLS.this_thread;
+		owner = active_thread();
 		recursion_count = 1;
 		ret = true;
 	}
-	else if( owner == kernelTLS.this_thread ) {
+	else if( owner == active_thread() ) {
 		recursion_count++;
 		ret = true;
@@ -159,5 +159,5 @@
 void wait(condition_variable & this) {
 	lock( this.lock __cfaabi_dbg_ctx2 );
-	append( this.blocked_threads, kernelTLS.this_thread );
+	append( this.blocked_threads, active_thread() );
 	unlock( this.lock );
 	park();
@@ -167,5 +167,5 @@
 void wait(condition_variable & this, L & l) {
 	lock( this.lock __cfaabi_dbg_ctx2 );
-	append( this.blocked_threads, kernelTLS.this_thread );
+	append( this.blocked_threads, active_thread() );
 	unlock(l);
 	unlock(this.lock);
Index: libcfa/src/concurrency/preemption.cfa
===================================================================
--- libcfa/src/concurrency/preemption.cfa	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ libcfa/src/concurrency/preemption.cfa	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -10,6 +10,6 @@
 // Created On       : Mon Jun 5 14:20:42 2017
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Wed Aug 26 16:46:03 2020
-// Update Count     : 53
+// Last Modified On : Fri Nov  6 07:42:13 2020
+// Update Count     : 54
 //
 
@@ -38,5 +38,5 @@
 // FwdDeclarations : timeout handlers
 static void preempt( processor   * this );
-static void timeout( struct __processor_id_t * id, $thread * this );
+static void timeout( $thread * this );
 
 // FwdDeclarations : Signal handlers
@@ -91,5 +91,5 @@
 
 // Tick one frame of the Discrete Event Simulation for alarms
-static void tick_preemption( struct __processor_id_t * id ) {
+static void tick_preemption(void) {
 	alarm_node_t * node = 0p;							// Used in the while loop but cannot be declared in the while condition
 	alarm_list_t * alarms = &event_kernel->alarms;		// Local copy for ease of reading
@@ -105,9 +105,12 @@
 
 		// Check if this is a kernel
-		if( node->kernel_alarm ) {
+		if( node->type == Kernel ) {
 			preempt( node->proc );
 		}
+		else if( node->type == User ) {
+			timeout( node->thrd );
+		}
 		else {
-			timeout( id, node->thrd );
+			node->callback(*node);
 		}
 
@@ -160,11 +163,97 @@
 // Kernel Signal Tools
 //=============================================================================================
-
-__cfaabi_dbg_debug_do( static thread_local void * last_interrupt = 0; )
+// In a user-level threading system, there are a handful of thread-local variables where this problem occurs on the ARM.
+//
+// For each kernel thread running user-level threads, there is a flag variable to indicate if interrupts are
+// enabled/disabled for that kernel thread. Therefore, this variable is made thread local.
+//
+// For example, this code fragment sets the state of the "interrupt" variable in thread-local memory.
+//
+// _Thread_local volatile int interrupts;
+// int main() {
+//     interrupts = 0; // disable interrupts }
+//
+// which generates the following code on the ARM
+//
+// (gdb) disassemble main
+// Dump of assembler code for function main:
+//    0x0000000000000610 <+0>:	mrs	x1, tpidr_el0
+//    0x0000000000000614 <+4>:	mov	w0, #0x0                   	// #0
+//    0x0000000000000618 <+8>:	add	x1, x1, #0x0, lsl #12
+//    0x000000000000061c <+12>:	add	x1, x1, #0x10
+//    0x0000000000000620 <+16>:	str	wzr, [x1]
+//    0x0000000000000624 <+20>:	ret
+//
+// The mrs moves a pointer from coprocessor register tpidr_el0 into register x1.  Register w0 is set to 0. The two adds
+// increase the TLS pointer with the displacement (offset) 0x10, which is the location in the TLS of variable
+// "interrupts".  Finally, 0 is stored into "interrupts" through the pointer in register x1 that points into the
+// TLS. Now once x1 has the pointer to the location of the TLS for kernel thread N, the thread can be preempted at
+// user level and the user thread is put on the user-level ready-queue. When the preempted thread gets to the front of
+// the user-level ready-queue it is run on kernel thread M. It now stores 0 into "interrupts" back on kernel thread N,
+// turning off interrupts on the wrong kernel thread.
+//
+// On the x86, the following code is generated for the same code fragment.
+//
+// (gdb) disassemble main
+// Dump of assembler code for function main:
+//    0x0000000000400420 <+0>:	movl   $0x0,%fs:0xfffffffffffffffc
+//    0x000000000040042c <+12>:	xor    %eax,%eax
+//    0x000000000040042e <+14>:	retq
+//
+// and there is base-displacement addressing used to atomically reset variable "interrupts" off of the TLS pointer in
+// register "fs".
+//
+// Hence, the ARM has base-displacement addressing for the general-purpose registers, BUT not for the coprocessor
+// registers. As a result, generating the address for the write into variable "interrupts" is no longer atomic.
+//
+// Note this problem does NOT occur when just using multiple kernel threads because the preemption ALWAYS restarts the
+// thread on the same kernel thread.
+//
+// The obvious question is why does ARM use a coprocessor register to store the TLS pointer given that coprocessor
+// registers are second-class registers with respect to the instruction set. One possible answer is that they did not
+// want to dedicate one of the general registers to hold the TLS pointer and there was a free coprocessor register
+// available.
+
+//----------
+// special case for preemption since used often
+bool __preemption_enabled() {	// returns the value of __cfaabi_tls.preemption_state.enabled
+	// create an assembler label (__cfaasm_check_before) just before the TLS access;
+	// the "memory" clobber keeps the compiler from moving memory accesses across it
+	asm volatile("__cfaasm_check_before:":::"memory");
+
+	// access TLS as normal; the read sits between the two labels
+	bool enabled = __cfaabi_tls.preemption_state.enabled;
+
+	// create an assembler label (__cfaasm_check_after) just after the TLS access;
+	// the "memory" clobber keeps the compiler from moving memory accesses across it
+	asm volatile("__cfaasm_check_after:":::"memory");
+	return enabled;
+}
+
+//----------
+// Get data from the TLS block
+uintptr_t __cfatls_get( unsigned long int offset ) __attribute__((__noinline__)); // noinline to avoid problems
+uintptr_t __cfatls_get( unsigned long int offset ) {	// returns the uintptr_t stored `offset` bytes into __cfaabi_tls
+	// create an assembler label (__cfaasm_get_before) just before the TLS access;
+	// the "memory" clobber keeps the compiler from moving memory accesses across it
+	asm volatile("__cfaasm_get_before:":::"memory");
+
+	// access TLS as normal, except the field is located by byte offset from the start of __cfaabi_tls
+	uintptr_t val = *(uintptr_t*)((uintptr_t)&__cfaabi_tls + offset);
+
+	// create an assembler label (__cfaasm_get_after) just after the TLS access;
+	// the "memory" clobber keeps the compiler from moving memory accesses across it
+	asm volatile("__cfaasm_get_after:":::"memory");
+	return val;
+}
 
 extern "C" {
 	// Disable interrupts by incrementing the counter
 	void disable_interrupts() {
-		with( kernelTLS.preemption_state ) {
+		// create an assembler label before
+		// the "memory" clobber keeps the compiler from moving memory accesses across it
+		asm volatile("__cfaasm_disable_before:":::"memory");
+
+		with( __cfaabi_tls.preemption_state ) {
 			#if GCC_VERSION > 50000
 			static_assert(__atomic_always_lock_free(sizeof(enabled), &enabled), "Must be lock-free");
@@ -183,4 +272,8 @@
 			verify( new_val < 65_000u );              // If this triggers someone is disabling interrupts without enabling them
 		}
+
+		// create an assembler label after
+		// marked as clobber all to avoid movement
+		asm volatile("__cfaasm_disable_after:":::"memory");
 	}
 
@@ -188,8 +281,12 @@
 	// If counter reaches 0, execute any pending __cfactx_switch
 	void enable_interrupts( __cfaabi_dbg_ctx_param ) {
-		processor   * proc = kernelTLS.this_processor; // Cache the processor now since interrupts can start happening after the atomic store
+		// create an assembler label before
+		// marked as clobber all to avoid movement
+		asm volatile("__cfaasm_enable_before:":::"memory");
+
+		processor   * proc = __cfaabi_tls.this_processor; // Cache the processor now since interrupts can start happening after the atomic store
 		/* paranoid */ verify( proc );
 
-		with( kernelTLS.preemption_state ){
+		with( __cfaabi_tls.preemption_state ){
 			unsigned short prev = disable_count;
 			disable_count -= 1;
@@ -218,4 +315,8 @@
 		// For debugging purposes : keep track of the last person to enable the interrupts
 		__cfaabi_dbg_debug_do( proc->last_enable = caller; )
+
+		// create an assembler label after
+		// marked as clobber all to avoid movement
+		asm volatile("__cfaasm_enable_after:":::"memory");
 	}
 
@@ -223,19 +324,27 @@
 	// Don't execute any pending __cfactx_switch even if counter reaches 0
 	void enable_interrupts_noPoll() {
-		unsigned short prev = kernelTLS.preemption_state.disable_count;
-		kernelTLS.preemption_state.disable_count -= 1;
+		// emit an assembler label at the start of the region
+		// volatile with a "memory" clobber to avoid movement
+		asm volatile("__cfaasm_nopoll_before:":::"memory");
+
+		unsigned short prev = __cfaabi_tls.preemption_state.disable_count;
+		__cfaabi_tls.preemption_state.disable_count -= 1;
 		verifyf( prev != 0u, "Incremented from %u\n", prev );                     // If this triggers someone is enabled already enabled interrupts
 		if( prev == 1 ) {
 			#if GCC_VERSION > 50000
-			static_assert(__atomic_always_lock_free(sizeof(kernelTLS.preemption_state.enabled), &kernelTLS.preemption_state.enabled), "Must be lock-free");
+			static_assert(__atomic_always_lock_free(sizeof(__cfaabi_tls.preemption_state.enabled), &__cfaabi_tls.preemption_state.enabled), "Must be lock-free");
 			#endif
 			// Set enabled flag to true
 			// should be atomic to avoid preemption in the middle of the operation.
 			// use memory order RELAXED since there is no inter-thread on this variable requirements
-			__atomic_store_n(&kernelTLS.preemption_state.enabled, true, __ATOMIC_RELAXED);
+			__atomic_store_n(&__cfaabi_tls.preemption_state.enabled, true, __ATOMIC_RELAXED);
 
 			// Signal the compiler that a fence is needed but only for signal handlers
 			__atomic_signal_fence(__ATOMIC_RELEASE);
 		}
+
+		// emit an assembler label at the end of the region
+		// volatile with a "memory" clobber to avoid movement
+		asm volatile("__cfaasm_nopoll_after:":::"memory");
 	}
 }
@@ -270,9 +379,9 @@
 
 // reserved for future use
-static void timeout( struct __processor_id_t * id, $thread * this ) {
+static void timeout( $thread * this ) {
 	#if !defined( __CFA_NO_STATISTICS__ )
-		kernelTLS.this_stats = this->curr_cluster->stats;
+		kernelTLS().this_stats = this->curr_cluster->stats;
 	#endif
-	__unpark( id, this );
+	unpark( this );
 }
 
@@ -283,8 +392,8 @@
 static inline bool preemption_ready() {
 	// Check if preemption is safe
-	bool ready = kernelTLS.preemption_state.enabled && ! kernelTLS.preemption_state.in_progress;
+	bool ready = __cfaabi_tls.preemption_state.enabled && ! __cfaabi_tls.preemption_state.in_progress;
 
 	// Adjust the pending flag accordingly
-	kernelTLS.this_processor->pending_preemption = !ready;
+	__cfaabi_tls.this_processor->pending_preemption = !ready;
 	return ready;
 }
@@ -300,6 +409,6 @@
 
 	// Start with preemption disabled until ready
-	kernelTLS.preemption_state.enabled = false;
-	kernelTLS.preemption_state.disable_count = 1;
+	__cfaabi_tls.preemption_state.enabled = false;
+	__cfaabi_tls.preemption_state.disable_count = 1;
 
 	// Initialize the event kernel
@@ -359,9 +468,51 @@
 // Kernel Signal Handlers
 //=============================================================================================
+struct asm_region {
+	void * before;
+	void * after;
+};
+
+//-----------------------------------------------------------------------------
+// Some assembly required
+#if defined( __i386 )
+	#define __cfaasm_label( label ) /* yields the addresses of the __cfaasm_<label>_before/_after labels */ \
+		({ \
+			struct asm_region region; \
+			asm( \
+				"movl $__cfaasm_" #label "_before, %[vb]\n\t" \
+				"movl $__cfaasm_" #label "_after , %[va]\n\t" \
+				 : [vb]"=r"(region.before), [va]"=r"(region.after) /* each named output bound exactly once */ \
+			); \
+			region; \
+		});
+#elif defined( __x86_64 )
+	#ifdef __PIC__
+		#define PLT "@PLT"
+	#else
+		#define PLT ""
+	#endif
+	#define __cfaasm_label( label ) \
+		({ \
+			struct asm_region region; \
+			asm( \
+				"movq $__cfaasm_" #label "_before" PLT ", %[vb]\n\t" \
+				"movq $__cfaasm_" #label "_after"  PLT ", %[va]\n\t" \
+				 : [vb]"=r"(region.before), [va]"=r"(region.after) \
+			); \
+			region; \
+		});
+#elif defined( __aarch64__ )
+	#error __cfaasm_label undefined for arm
+#else
+	#error unknown hardware architecture
+#endif
+
+__cfaabi_dbg_debug_do( static thread_local void * last_interrupt = 0; )
 
 // Context switch signal handler
 // Receives SIGUSR1 signal and causes the current thread to yield
 static void sigHandler_ctxSwitch( __CFA_SIGPARMS__ ) {
-	__cfaabi_dbg_debug_do( last_interrupt = (void *)(cxt->uc_mcontext.CFA_REG_IP); )
+	void * ip = (void *)(cxt->uc_mcontext.CFA_REG_IP);
+	__cfaabi_dbg_debug_do( last_interrupt = ip; )
 
 	// SKULLDUGGERY: if a thread creates a processor and the immediately deletes it,
@@ -369,9 +520,9 @@
 	// before the kernel thread has even started running. When that happens, an interrupt
 	// with a null 'this_processor' will be caught, just ignore it.
-	if(! kernelTLS.this_processor ) return;
+	if(! __cfaabi_tls.this_processor ) return;
 
 	choose(sfp->si_value.sival_int) {
 		case PREEMPT_NORMAL   : ;// Normal case, nothing to do here
-		case PREEMPT_TERMINATE: verify( __atomic_load_n( &kernelTLS.this_processor->do_terminate, __ATOMIC_SEQ_CST ) );
+		case PREEMPT_TERMINATE: verify( __atomic_load_n( &__cfaabi_tls.this_processor->do_terminate, __ATOMIC_SEQ_CST ) );
 		default:
 			abort( "internal error, signal value is %d", sfp->si_value.sival_int );
@@ -381,8 +532,15 @@
 	if( !preemption_ready() ) { return; }
 
-	__cfaabi_dbg_print_buffer_decl( " KERNEL: preempting core %p (%p @ %p).\n", kernelTLS.this_processor, kernelTLS.this_thread, (void *)(cxt->uc_mcontext.CFA_REG_IP) );
+	struct asm_region region;
+	region = __cfaasm_label( get     ); if( ip >= region.before && ip <= region.after ) return;
+	region = __cfaasm_label( check   ); if( ip >= region.before && ip <= region.after ) return;
+	region = __cfaasm_label( disable ); if( ip >= region.before && ip <= region.after ) return;
+	region = __cfaasm_label( enable  ); if( ip >= region.before && ip <= region.after ) return;
+	region = __cfaasm_label( nopoll  ); if( ip >= region.before && ip <= region.after ) return;
+
+	__cfaabi_dbg_print_buffer_decl( " KERNEL: preempting core %p (%p @ %p).\n", __cfaabi_tls.this_processor, __cfaabi_tls.this_thread, (void *)(cxt->uc_mcontext.CFA_REG_IP) );
 
 	// Sync flag : prevent recursive calls to the signal handler
-	kernelTLS.preemption_state.in_progress = true;
+	__cfaabi_tls.preemption_state.in_progress = true;
 
 	// Clear sighandler mask before context switching.
@@ -394,7 +552,6 @@
 	}
 
-	// TODO: this should go in finish action
 	// Clear the in progress flag
-	kernelTLS.preemption_state.in_progress = false;
+	__cfaabi_tls.preemption_state.in_progress = false;
 
 	// Preemption can occur here
@@ -413,4 +570,5 @@
 	id.full_proc = false;
 	id.id = doregister(&id);
+	__cfaabi_tls.this_proc_id = &id;
 
 	// Block sigalrms to control when they arrive
@@ -458,5 +616,5 @@
 			// __cfaabi_dbg_print_safe( "Kernel : Preemption thread tick\n" );
 			lock( event_kernel->lock __cfaabi_dbg_ctx2 );
-			tick_preemption( &id );
+			tick_preemption();
 			unlock( event_kernel->lock );
 			break;
@@ -480,5 +638,5 @@
 
 void __cfaabi_check_preemption() {
-	bool ready = kernelTLS.preemption_state.enabled;
+	bool ready = __preemption_enabled();
 	if(!ready) { abort("Preemption should be ready"); }
 
@@ -503,5 +661,5 @@
 #ifdef __CFA_WITH_VERIFY__
 bool __cfaabi_dbg_in_kernel() {
-	return !kernelTLS.preemption_state.enabled;
+	return !__preemption_enabled();
 }
 #endif
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -150,5 +150,5 @@
 //  queues or removing them.
 uint_fast32_t ready_mutate_lock( void ) with(*__scheduler_lock) {
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 
 	// Step 1 : lock global lock
@@ -166,10 +166,10 @@
 	}
 
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 	return s;
 }
 
 void ready_mutate_unlock( uint_fast32_t last_s ) with(*__scheduler_lock) {
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 
 	// Step 1 : release local locks
@@ -188,5 +188,5 @@
 	__atomic_store_n(&lock, (bool)false, __ATOMIC_RELEASE);
 
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ! __preemption_enabled() );
 }
 
@@ -252,5 +252,5 @@
 		preferred =
 			//*
-			kernelTLS.this_processor ? kernelTLS.this_processor->id * 4 : -1;
+			kernelTLS().this_processor ? kernelTLS().this_processor->id * 4 : -1;
 			/*/
 			thrd->link.preferred * 4;
@@ -331,5 +331,5 @@
 		// Don't bother trying locally too much
 		int local_tries = 8;
-		preferred = kernelTLS.this_processor->id * 4;
+		preferred = kernelTLS().this_processor->id * 4;
 	#endif
 
Index: libcfa/src/concurrency/thread.cfa
===================================================================
--- libcfa/src/concurrency/thread.cfa	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ libcfa/src/concurrency/thread.cfa	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -127,5 +127,5 @@
 	verify( this_thrd->context.SP );
 
-	__schedule_thread( (__processor_id_t *)kernelTLS.this_processor, this_thrd);
+	__schedule_thread( this_thrd );
 	enable_interrupts( __cfaabi_dbg_ctx );
 }
Index: src/AST/Convert.cpp
===================================================================
--- src/AST/Convert.cpp	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ src/AST/Convert.cpp	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -25,4 +25,5 @@
 #include "AST/Init.hpp"
 #include "AST/Stmt.hpp"
+#include "AST/TranslationUnit.hpp"
 #include "AST/TypeSubstitution.hpp"
 
@@ -1404,8 +1405,8 @@
 };
 
-std::list< Declaration * > convert( const std::list< ast::ptr< ast::Decl > > && translationUnit ) {
+std::list< Declaration * > convert( const ast::TranslationUnit && translationUnit ) {
 	ConverterNewToOld c;
 	std::list< Declaration * > decls;
-	for(auto d : translationUnit) {
+	for(auto d : translationUnit.decls) {
 		decls.emplace_back( c.decl( d ) );
 	}
@@ -2803,12 +2804,12 @@
 #undef GET_ACCEPT_1
 
-std::list< ast::ptr< ast::Decl > > convert( const std::list< Declaration * > && translationUnit ) {
+ast::TranslationUnit convert( const std::list< Declaration * > && translationUnit ) {
 	ConverterOldToNew c;
-	std::list< ast::ptr< ast::Decl > > decls;
+	ast::TranslationUnit unit;
 	for(auto d : translationUnit) {
 		d->accept( c );
-		decls.emplace_back( c.decl() );
+		unit.decls.emplace_back( c.decl() );
 	}
 	deleteAll(translationUnit);
-	return decls;
+	return unit;
 }
Index: src/AST/Convert.hpp
===================================================================
--- src/AST/Convert.hpp	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ src/AST/Convert.hpp	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -18,11 +18,9 @@
 #include <list>
 
-#include "AST/Node.hpp"
-
 class Declaration;
 namespace ast {
-	class Decl;
+	class TranslationUnit;
 };
 
-std::list< Declaration * > convert( const std::list< ast::ptr< ast::Decl > > && translationUnit );
-std::list< ast::ptr< ast::Decl > > convert( const std::list< Declaration * > && translationUnit );
+std::list< Declaration * > convert( const ast::TranslationUnit && translationUnit );
+ast::TranslationUnit convert( const std::list< Declaration * > && translationUnit );
Index: src/AST/Fwd.hpp
===================================================================
--- src/AST/Fwd.hpp	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ src/AST/Fwd.hpp	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -137,4 +137,6 @@
 typedef unsigned int UniqueId;
 
+class TranslationUnit;
+// TODO: Get from the TranslationUnit:
 extern Type * sizeType;
 extern FunctionDecl * dereferenceOperator;
Index: src/AST/Pass.hpp
===================================================================
--- src/AST/Pass.hpp	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ src/AST/Pass.hpp	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -103,5 +103,5 @@
 	/// Construct and run a pass on a translation unit.
 	template< typename... Args >
-	static void run( std::list< ptr<Decl> > & decls, Args &&... args ) {
+	static void run( TranslationUnit & decls, Args &&... args ) {
 		Pass<core_t> visitor( std::forward<Args>( args )... );
 		accept_all( decls, visitor );
@@ -119,5 +119,5 @@
 	// Versions of the above for older compilers.
 	template< typename... Args >
-	static void run( std::list< ptr<Decl> > & decls ) {
+	static void run( TranslationUnit & decls ) {
 		Pass<core_t> visitor;
 		accept_all( decls, visitor );
@@ -303,4 +303,7 @@
 void accept_all( std::list< ast::ptr<ast::Decl> > &, ast::Pass<core_t> & visitor );
 
+template<typename core_t>
+void accept_all( ast::TranslationUnit &, ast::Pass<core_t> & visitor );
+
 //-------------------------------------------------------------------------------------------------
 // PASS ACCESSORIES
Index: src/AST/Pass.impl.hpp
===================================================================
--- src/AST/Pass.impl.hpp	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ src/AST/Pass.impl.hpp	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -20,4 +20,5 @@
 #include <unordered_map>
 
+#include "AST/TranslationUnit.hpp"
 #include "AST/TypeSubstitution.hpp"
 
@@ -430,4 +431,9 @@
 	pass_visitor_stats.depth--;
 	if ( !errors.isEmpty() ) { throw errors; }
+}
+
+template< typename core_t >
+inline void ast::accept_all( ast::TranslationUnit & unit, ast::Pass< core_t > & visitor ) {
+	return ast::accept_all( unit.decls, visitor );
 }
 
@@ -674,5 +680,5 @@
 const ast::CompoundStmt * ast::Pass< core_t >::visit( const ast::CompoundStmt * node ) {
 	VISIT_START( node );
-	VISIT({
+	VISIT(
 		// Do not enter (or leave) a new scope if atFunctionTop. Remember to save the result.
 		auto guard1 = makeFuncGuard( [this, enterScope = !this->atFunctionTop]() {
@@ -681,9 +687,9 @@
 			if ( leaveScope ) __pass::symtab::leave(core, 0);
 		});
-		ValueGuard< bool > guard2( inFunction );
+		ValueGuard< bool > guard2( atFunctionTop );
+		atFunctionTop = false;
 		guard_scope guard3 { *this };
-		inFunction = false;
 		maybe_accept( node, &CompoundStmt::kids );
-	})
+	)
 	VISIT_END( CompoundStmt, node );
 }
Index: src/AST/Pass.proto.hpp
===================================================================
--- src/AST/Pass.proto.hpp	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ src/AST/Pass.proto.hpp	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -22,4 +22,6 @@
 template<typename core_t>
 class Pass;
+
+class TranslationUnit;
 
 struct PureVisitor;
Index: src/InitTweak/FixGlobalInit.cc
===================================================================
--- src/InitTweak/FixGlobalInit.cc	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ src/InitTweak/FixGlobalInit.cc	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -109,5 +109,5 @@
 	}
 
-	void fixGlobalInit(std::list<ast::ptr<ast::Decl>> & translationUnit, bool inLibrary) {
+	void fixGlobalInit(ast::TranslationUnit & translationUnit, bool inLibrary) {
 		ast::Pass<GlobalFixer_new> fixer;
 		accept_all(translationUnit, fixer);
@@ -119,5 +119,5 @@
 				ast::Storage::Static, ast::Linkage::C, {new ast::Attribute("constructor", std::move(ctorParams))});
 
-			translationUnit.emplace_back( initFunction );
+			translationUnit.decls.emplace_back( initFunction );
 		} // if
 
@@ -128,5 +128,5 @@
 				ast::Storage::Static, ast::Linkage::C, {new ast::Attribute("destructor", std::move(dtorParams))});
 
-			translationUnit.emplace_back(destroyFunction);
+			translationUnit.decls.emplace_back(destroyFunction);
 		} // if
 	}
@@ -183,4 +183,5 @@
 			} // if
 			if ( const ast::Stmt * ctor = ctorInit->ctor ) {
+				addDataSectionAttribute(mutDecl);
 				initStmts.push_back( ctor );
 				mutDecl->init = nullptr;
Index: src/InitTweak/FixGlobalInit.h
===================================================================
--- src/InitTweak/FixGlobalInit.h	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ src/InitTweak/FixGlobalInit.h	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -29,5 +29,5 @@
 	/// function is for library code.
 	void fixGlobalInit( std::list< Declaration * > & translationUnit, bool inLibrary );
-	void fixGlobalInit( std::list< ast::ptr<ast::Decl> > & translationUnit, bool inLibrary );
+	void fixGlobalInit( ast::TranslationUnit & translationUnit, bool inLibrary );
 } // namespace
 
Index: src/InitTweak/FixInit.h
===================================================================
--- src/InitTweak/FixInit.h	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ src/InitTweak/FixInit.h	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -19,7 +19,8 @@
 #include <string>  // for string
 
-#include <AST/Fwd.hpp>
-
 class Declaration;
+namespace ast {
+	class TranslationUnit;
+}
 
 namespace InitTweak {
@@ -27,5 +28,5 @@
 	void fix( std::list< Declaration * > & translationUnit, bool inLibrary );
 
-	void fix( std::list<ast::ptr<ast::Decl>> & translationUnit, bool inLibrary);
+	void fix( ast::TranslationUnit & translationUnit, bool inLibrary);
 } // namespace
 
Index: src/InitTweak/FixInitNew.cpp
===================================================================
--- src/InitTweak/FixInitNew.cpp	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ src/InitTweak/FixInitNew.cpp	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -179,5 +179,5 @@
 	/// expand each object declaration to use its constructor after it is declared.
 	struct FixInit : public ast::WithStmtsToAdd<> {
-		static void fixInitializers( std::list< ast::ptr<ast::Decl> > &translationUnit );
+		static void fixInitializers( ast::TranslationUnit &translationUnit );
 
 		const ast::DeclWithType * postvisit( const ast::ObjectDecl *objDecl );
@@ -225,5 +225,5 @@
 } // namespace
 
-void fix( std::list< ast::ptr<ast::Decl> > & translationUnit, bool inLibrary ) {
+void fix( ast::TranslationUnit & translationUnit, bool inLibrary ) {
 	ast::Pass<SelfAssignChecker>::run( translationUnit );
 
@@ -308,5 +308,5 @@
 	}
 
-	void FixInit::fixInitializers( std::list< ast::ptr<ast::Decl> > & translationUnit ) {
+	void FixInit::fixInitializers( ast::TranslationUnit & translationUnit ) {
 		ast::Pass<FixInit> fixer;
 
@@ -314,9 +314,9 @@
 		// can't use DeclMutator, because sometimes need to insert IfStmt, etc.
 		SemanticErrorException errors;
-		for ( auto i = translationUnit.begin(); i != translationUnit.end(); ++i ) {
+		for ( auto i = translationUnit.decls.begin(); i != translationUnit.decls.end(); ++i ) {
 			try {
 				// maybeAccept( *i, fixer ); translationUnit should never contain null
 				*i = (*i)->accept(fixer);
-				translationUnit.splice( i, fixer.core.staticDtorDecls );
+				translationUnit.decls.splice( i, fixer.core.staticDtorDecls );
 			} catch( SemanticErrorException &e ) {
 				errors.append( e );
@@ -864,4 +864,5 @@
 			if ( const ast::Stmt * ctor = ctorInit->ctor ) {
 				if ( objDecl->storage.is_static ) {
+					addDataSectionAttribute(objDecl);
 					// originally wanted to take advantage of gcc nested functions, but
 					// we get memory errors with this approach. To remedy this, the static
@@ -947,4 +948,5 @@
 						objDecl->name = objDecl->name + staticNamer.newName();
 						objDecl->mangleName = Mangle::mangle( objDecl );
+						objDecl->init = nullptr;
 
 						// xxx - temporary hack: need to return a declaration, but want to hoist the current object out of this scope
Index: src/InitTweak/InitTweak.cc
===================================================================
--- src/InitTweak/InitTweak.cc	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ src/InitTweak/InitTweak.cc	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -1113,3 +1113,8 @@
 	}
 
+	void addDataSectionAttribute( ast::ObjectDecl * objDecl ) {
+		auto strLitT = new ast::PointerType(new ast::BasicType(ast::BasicType::Char));
+		objDecl->attributes.push_back(new ast::Attribute("section", {new ast::ConstantExpr(objDecl->location, strLitT, "\".data#\"", std::nullopt)}));
+	}
+
 }
Index: src/InitTweak/InitTweak.h
===================================================================
--- src/InitTweak/InitTweak.h	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ src/InitTweak/InitTweak.h	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -119,4 +119,6 @@
 	void addDataSectonAttribute( ObjectDecl * objDecl );
 
+	void addDataSectionAttribute( ast::ObjectDecl * objDecl );
+
 	class InitExpander_old {
 	public:
Index: src/ResolvExpr/Resolver.cc
===================================================================
--- src/ResolvExpr/Resolver.cc	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ src/ResolvExpr/Resolver.cc	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -1274,5 +1274,5 @@
 	// size_t Resolver_new::traceId = Stats::Heap::new_stacktrace_id("Resolver");
 
-	void resolve( std::list< ast::ptr< ast::Decl > >& translationUnit ) {
+	void resolve( ast::TranslationUnit& translationUnit ) {
 		ast::Pass< Resolver_new >::run( translationUnit );
 	}
Index: src/ResolvExpr/Resolver.h
===================================================================
--- src/ResolvExpr/Resolver.h	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ src/ResolvExpr/Resolver.h	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -35,4 +35,5 @@
 	class StmtExpr;
 	class SymbolTable;
+	class TranslationUnit;
 	class Type;
 	class TypeEnvironment;
@@ -55,5 +56,5 @@
 
 	/// Checks types and binds syntactic constructs to typed representations
-	void resolve( std::list< ast::ptr<ast::Decl> >& translationUnit );
+	void resolve( ast::TranslationUnit& translationUnit );
 	/// Searches expr and returns the first DeletedExpr found, otherwise nullptr
 	const ast::DeletedExpr * findDeletedExpr( const ast::Expr * expr );
@@ -69,8 +70,8 @@
 		const ast::Expr * untyped, const ast::SymbolTable & symtab);
 	/// Resolves a constructor init expression
-	ast::ptr< ast::Init > resolveCtorInit( 
+	ast::ptr< ast::Init > resolveCtorInit(
 		const ast::ConstructorInit * ctorInit, const ast::SymbolTable & symtab );
 	/// Resolves a statement expression
-	ast::ptr< ast::Expr > resolveStmtExpr( 
+	ast::ptr< ast::Expr > resolveStmtExpr(
 		const ast::StmtExpr * stmtExpr, const ast::SymbolTable & symtab );
 } // namespace ResolvExpr
Index: tests/.expect/castError.oast.txt
===================================================================
--- tests/.expect/castError.oast.txt	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
+++ tests/.expect/castError.oast.txt	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -0,0 +1,70 @@
+castError.cfa:23:1 error: Cannot choose between 3 alternatives for expression
+Explicit Cast of:
+  Name: f
+... to:
+  char Alternatives are:
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: f: function
+        accepting unspecified arguments
+      ... returning nothing
+
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: f: double
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: f: signed int
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+
+castError.cfa:28:1 error: Cannot choose between 2 alternatives for expression
+Generated Cast of:
+  Comma Expression:
+    constant expression (3 3: signed int)
+    Name: v
+... to: nothing Alternatives are:
+Cost ( 0, 0, 2, 0, 0, 0, 0 ): Generated Cast of:
+      Comma Expression:
+        constant expression (3 3: signed int)
+        Variable Expression: v: unsigned char
+    ... to: nothing
+  (types:
+    void 
+  )
+  Environment:
+
+Cost ( 0, 0, 2, 0, 0, 0, 0 ): Generated Cast of:
+      Comma Expression:
+        constant expression (3 3: signed int)
+        Variable Expression: v: signed short int
+    ... to: nothing
+  (types:
+    void 
+  )
+  Environment:
+
+
+castError.cfa:30:1 error: No reasonable alternatives for expression Explicit Cast of:
+  Name: sint
+... to:
+  instance of struct S with body 1
+  ... with parameters
+    char
+
Index: tests/.expect/castError.txt
===================================================================
--- tests/.expect/castError.txt	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ 	(revision )
@@ -1,70 +1,0 @@
-castError.cfa:23:1 error: Cannot choose between 3 alternatives for expression
-Explicit Cast of:
-  Name: f
-... to:
-  char Alternatives are:
-Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
-      Variable Expression: f: function
-        accepting unspecified arguments
-      ... returning nothing
-
-    ... to:
-      char
-  (types:
-    char
-  )
-  Environment:
-
-Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
-      Variable Expression: f: double
-    ... to:
-      char
-  (types:
-    char
-  )
-  Environment:
-
-Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
-      Variable Expression: f: signed int
-    ... to:
-      char
-  (types:
-    char
-  )
-  Environment:
-
-
-castError.cfa:28:1 error: Cannot choose between 2 alternatives for expression
-Generated Cast of:
-  Comma Expression:
-    constant expression (3 3: signed int)
-    Name: v
-... to: nothing Alternatives are:
-Cost ( 0, 0, 2, 0, 0, 0, 0 ): Generated Cast of:
-      Comma Expression:
-        constant expression (3 3: signed int)
-        Variable Expression: v: unsigned char
-    ... to: nothing
-  (types:
-    void 
-  )
-  Environment:
-
-Cost ( 0, 0, 2, 0, 0, 0, 0 ): Generated Cast of:
-      Comma Expression:
-        constant expression (3 3: signed int)
-        Variable Expression: v: signed short int
-    ... to: nothing
-  (types:
-    void 
-  )
-  Environment:
-
-
-castError.cfa:30:1 error: No reasonable alternatives for expression Explicit Cast of:
-  Name: sint
-... to:
-  instance of struct S with body 1
-  ... with parameters
-    char
-
Index: tests/Makefile.am
===================================================================
--- tests/Makefile.am	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ tests/Makefile.am	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -53,5 +53,5 @@
 
 # adjust CC to current flags
-CC = LC_ALL=C $(if $(DISTCC_CFA_PATH),distcc $(DISTCC_CFA_PATH) ${ARCH_FLAGS},$(TARGET_CFA) ${DEBUG_FLAGS} ${ARCH_FLAGS})
+CC = LC_ALL=C $(if $(DISTCC_CFA_PATH),distcc $(DISTCC_CFA_PATH) ${ARCH_FLAGS} ${AST_FLAGS},$(TARGET_CFA) ${DEBUG_FLAGS} ${ARCH_FLAGS} ${AST_FLAGS})
 CFACC = $(CC)
 
@@ -60,5 +60,5 @@
 
 # adjusted CC but without the actual distcc call
-CFACCLOCAL = $(if $(DISTCC_CFA_PATH),$(DISTCC_CFA_PATH) ${ARCH_FLAGS},$(TARGET_CFA) ${DEBUG_FLAGS} ${ARCH_FLAGS})
+CFACCLOCAL = $(if $(DISTCC_CFA_PATH),$(DISTCC_CFA_PATH) ${ARCH_FLAGS} ${AST_FLAGS},$(TARGET_CFA) ${DEBUG_FLAGS} ${ARCH_FLAGS} ${AST_FLAGS})
 CFACCLINK = $(CFACCLOCAL) -quiet $(if $(test), 2> $(test), ) $($(shell echo "${@}_FLAGSLD" | sed 's/-\|\//_/g'))
 
Index: tests/config.py.in
===================================================================
--- tests/config.py.in	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ tests/config.py.in	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -9,2 +9,3 @@
 HOSTARCH = "@host_cpu@"
 DISTRIBUTE = @HAS_DISTCC@
+NEWAST = @DEFAULT_NEW_AST@
Index: tests/meta/.expect/archVast.nast.arm64.txt
===================================================================
--- tests/meta/.expect/archVast.nast.arm64.txt	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
+++ tests/meta/.expect/archVast.nast.arm64.txt	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -0,0 +1,36 @@
+meta/archVast.cfa:28:1 error: Cannot choose between 3 alternatives for expression
+Explicit Cast of:
+  Name: FA64
+... to:
+  char Alternatives are:
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FA64: signed int
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FA64: function
+        accepting unspecified arguments
+      ... returning nothing
+
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FA64: double
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+
Index: tests/meta/.expect/archVast.nast.x64.txt
===================================================================
--- tests/meta/.expect/archVast.nast.x64.txt	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
+++ tests/meta/.expect/archVast.nast.x64.txt	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -0,0 +1,36 @@
+meta/archVast.cfa:28:1 error: Cannot choose between 3 alternatives for expression
+Explicit Cast of:
+  Name: FX64
+... to:
+  char Alternatives are:
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FX64: signed int
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FX64: function
+        accepting unspecified arguments
+      ... returning nothing
+
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FX64: double
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+
Index: tests/meta/.expect/archVast.nast.x86.txt
===================================================================
--- tests/meta/.expect/archVast.nast.x86.txt	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
+++ tests/meta/.expect/archVast.nast.x86.txt	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -0,0 +1,36 @@
+meta/archVast.cfa:28:1 error: Cannot choose between 3 alternatives for expression
+Explicit Cast of:
+  Name: FX86
+... to:
+  char Alternatives are:
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FX86: signed int
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FX86: function
+        accepting unspecified arguments
+      ... returning nothing
+
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FX86: double
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+
Index: tests/meta/.expect/archVast.oast.arm64.txt
===================================================================
--- tests/meta/.expect/archVast.oast.arm64.txt	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
+++ tests/meta/.expect/archVast.oast.arm64.txt	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -0,0 +1,36 @@
+meta/archVast.cfa:28:1 error: Cannot choose between 3 alternatives for expression
+Explicit Cast of:
+  Name: FA64
+... to:
+  char Alternatives are:
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FA64: function
+        accepting unspecified arguments
+      ... returning nothing
+
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FA64: double
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FA64: signed int
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+
Index: tests/meta/.expect/archVast.oast.x64.txt
===================================================================
--- tests/meta/.expect/archVast.oast.x64.txt	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
+++ tests/meta/.expect/archVast.oast.x64.txt	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -0,0 +1,36 @@
+meta/archVast.cfa:28:1 error: Cannot choose between 3 alternatives for expression
+Explicit Cast of:
+  Name: FX64
+... to:
+  char Alternatives are:
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FX64: function
+        accepting unspecified arguments
+      ... returning nothing
+
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FX64: double
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FX64: signed int
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+
Index: tests/meta/.expect/archVast.oast.x86.txt
===================================================================
--- tests/meta/.expect/archVast.oast.x86.txt	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
+++ tests/meta/.expect/archVast.oast.x86.txt	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -0,0 +1,36 @@
+meta/archVast.cfa:28:1 error: Cannot choose between 3 alternatives for expression
+Explicit Cast of:
+  Name: FX86
+... to:
+  char Alternatives are:
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FX86: function
+        accepting unspecified arguments
+      ... returning nothing
+
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FX86: double
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FX86: signed int
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+
Index: tests/meta/archVast.cfa
===================================================================
--- tests/meta/archVast.cfa	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
+++ tests/meta/archVast.cfa	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -0,0 +1,32 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// archVast.cfa -- Check that all combinations of ast/arch are properly distinguished
+//
+// Author           : Thierry Delisle
+// Created On       : Tue Nov 03 15:04:53 2020
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#if defined( __i386 )
+#define NAME FX86
+#elif defined( __x86_64 )
+#define NAME FX64
+#elif defined( __aarch64__ )
+#define NAME FA64
+#endif
+
+int NAME;
+void NAME() {
+	int NAME;
+	double NAME;
+	(char)NAME;
+	(int(*)())NAME;
+}
+
+int main() {}
Index: tests/pybin/settings.py
===================================================================
--- tests/pybin/settings.py	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ tests/pybin/settings.py	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -85,5 +85,4 @@
 	def filter(self, tests):
 		return [test for test in tests if not test.arch or self.target == test.arch]
-		return True if not arch else self.target == arch
 
 	@staticmethod
@@ -98,4 +97,26 @@
 		self.path   = "debug" if value else "nodebug"
 
+class AST:  # one AST configuration of the test run: "new", "old", or None for the build default
+	def __init__(self, ast):
+		if ast == "new":
+			self.target = ast
+			self.string = "New AST"
+			self.flags  = """AST_FLAGS=-XCFA,--new-ast"""
+		elif ast == "old":
+			self.target = ast
+			self.string = "Old AST"
+			self.flags  = """AST_FLAGS=-XCFA,--old-ast"""
+		elif ast is None:  # nothing requested: target follows config.NEWAST and no flag is forced
+			self.target = "new" if config.NEWAST else "old"
+			self.string = "Default AST (%s)" % self.target
+			self.flags  = """AST_FLAGS="""
+		else:  # report the offending argument itself (was an undefined name, which raised NameError)
+			print("""ERROR: Invalid ast configuration, must be "old", "new" or left unspecified, was %s""" % (ast), file=sys.stderr)
+			sys.exit(1)
+
+	def filter(self, tests):
+		# keep tests whose astv is unset (empty/None) or equal to this configuration's target
+		return [test for test in tests if not test.astv or self.target == test.astv]
+
 class Install:
 	def __init__(self, value):
@@ -120,14 +141,17 @@
 
 def init( options ):
+	global all_ast
 	global all_arch
 	global all_debug
 	global all_install
+	global ast
 	global arch
+	global debug
 	global archive
+	global install
+
 	global continue_
-	global debug
 	global dry_run
 	global generating
-	global install
 	global make
 	global output_width
@@ -135,4 +159,5 @@
 	global timeout2gdb
 
+	all_ast      = [AST(o)          for o in list(dict.fromkeys(options.ast    ))] if options.ast  else [AST(None)]
 	all_arch     = [Architecture(o) for o in list(dict.fromkeys(options.arch   ))] if options.arch else [Architecture(None)]
 	all_debug    = [Debug(o)        for o in list(dict.fromkeys(options.debug  ))]
Index: tests/pybin/test_run.py
===================================================================
--- tests/pybin/test_run.py	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ tests/pybin/test_run.py	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -11,7 +11,8 @@
 		self.path = ''
 		self.arch = ''
+		self.astv = ''
 
 	def toString(self):
-		return "{:25s} ({:5s} {:s})".format( self.name, self.arch if self.arch else "Any", self.target() )
+		return "{:25s} ({:5s} arch, {:s} ast: {:s})".format( self.name, self.arch if self.arch else "Any", self.astv if self.astv else "Any", self.target() )
 
 	def prepare(self):
@@ -20,5 +21,7 @@
 
 	def expect(self):
-		return os.path.normpath( os.path.join(settings.SRCDIR  , self.path, ".expect", "%s%s.txt" % (self.name,'' if not self.arch else ".%s" % self.arch)) )
+		arch = '' if not self.arch else ".%s" % self.arch
+		astv = '' if not self.astv else ".nast" if self.astv == "new" else ".oast"
+		return os.path.normpath( os.path.join(settings.SRCDIR  , self.path, ".expect", "%s%s%s.txt" % (self.name,astv,arch)) )
 
 	def error_log(self):
@@ -45,9 +48,10 @@
 
 	@staticmethod
-	def new_target(target, arch):
+	def new_target(target, arch, astv):
 		test = Test()
 		test.name = os.path.basename(target)
 		test.path = os.path.relpath (os.path.dirname(target), settings.SRCDIR)
 		test.arch = arch.target if arch else ''
+		test.astv = astv.target if astv else ''
 		return test
 
Index: tests/pybin/tools.py
===================================================================
--- tests/pybin/tools.py	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ tests/pybin/tools.py	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -181,4 +181,5 @@
 		'-s' if silent else None,
 		test_param,
+		settings.ast.flags,
 		settings.arch.flags,
 		settings.debug.flags,
Index: tests/test.py
===================================================================
--- tests/test.py	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ tests/test.py	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -24,10 +24,20 @@
 
 	def match_test(path):
-		match = re.search("^%s\/([\w\/\-_]*).expect\/([\w\-_]+)(\.[\w\-_]+)?\.txt$" % settings.SRCDIR, path)
+		match = re.search(r"^%s\/([\w\/\-_]*).expect\/([\w\-_]+)(\.nast|\.oast)?(\.[\w\-_]+)?\.txt$" % settings.SRCDIR, path)
 		if match :
 			test = Test()
 			test.name = match.group(2)
 			test.path = match.group(1)
-			test.arch = match.group(3)[1:] if match.group(3) else None
+			test.arch = match.group(4)[1:] if match.group(4) else None
+
+			astv = match.group(3)[1:] if match.group(3) else None
+			if astv == 'oast':
+				test.astv = 'old'
+			elif astv == 'nast':
+				test.astv = 'new'
+			elif astv:
+				print('ERROR: "%s", expect file has astv but it is not "nast" or "oast"' % path, file=sys.stderr)
+				sys.exit(1)
+
 			expected.append(test)
 
@@ -66,30 +76,34 @@
 	if options.regenerate_expected :
 		for testname in options.tests :
-			testname = canonical_path( testname )
+			testname = os.path.normpath( os.path.join(settings.SRCDIR, testname) )
+
 			# first check if this is a valid name to regenerate
 			if Test.valid_name(testname):
 				# this is a valid name, let's check if it already exists
 				found = [test for test in all_tests if canonical_path( test.target() ) == testname]
+				setup = itertools.product(settings.all_arch if options.arch else [None], settings.all_ast if options.ast else [None])
 				if not found:
-					# it's a new name, create it according to the name and specified architecture
-					if options.arch:
-						# user specified one or multiple architectures, assume the tests will have architecture specific results
-						tests.extend( [Test.new_target(testname, arch) for arch in settings.all_arch] )
-					else:
-						# user didn't specify an architecture, just create a cross platform test
-						tests.append( Test.new_target( testname, None ) )
+					# it's a new name, create it according to the name and specified architecture/ast version
+					tests.extend( [Test.new_target(testname, arch, ast) for arch, ast in setup] )
 				elif len(found) == 1 and not found[0].arch:
 					# we found a single test, the user better be wanting to create a cross platform test
 					if options.arch:
 						print('ERROR: "%s", test has no specified architecture but --arch was specified, ignoring it' % testname, file=sys.stderr)
+					elif options.ast:
+						print('ERROR: "%s", test has no specified ast version but --ast was specified, ignoring it' % testname, file=sys.stderr)
 					else:
 						tests.append( found[0] )
 				else:
 					# this test is already cross platform, just add a test for each platform the user asked
-					tests.extend( [Test.new_target(testname, arch) for arch in settings.all_arch] )
+					tests.extend( [Test.new_target(testname, arch, ast) for arch, ast in setup] )
 
 					# print a warning if it users didn't ask for a specific architecture
 					if not options.arch:
 						print('WARNING: "%s", test has architecture specific expected files but --arch was not specified, regenerating only for current host' % testname, file=sys.stderr)
+
+
+					# print a warning if it users didn't ask for a specific ast version
+					if not options.ast:
+						print('WARNING: "%s", test has ast version specific expected files but --ast was not specified, regenerating only for current ast' % testname, file=sys.stderr)
 
 			else :
@@ -112,7 +126,8 @@
 	# create a parser with the arguments for the tests script
 	parser = argparse.ArgumentParser(description='Script which runs cforall tests')
+	parser.add_argument('--ast', help='Test for specific ast', type=comma_separated(str), default=None)
+	parser.add_argument('--arch', help='Test for specific architecture', type=comma_separated(str), default=None)
 	parser.add_argument('--debug', help='Run all tests in debug or release', type=comma_separated(yes_no), default='yes')
 	parser.add_argument('--install', help='Run all tests based on installed binaries or tree binaries', type=comma_separated(yes_no), default='no')
-	parser.add_argument('--arch', help='Test for specific architecture', type=comma_separated(str), default=None)
 	parser.add_argument('--continue', help='When multiple specifications are passed (debug/install/arch), sets whether or not to continue if the last specification failed', type=yes_no, default='yes', dest='continue_')
 	parser.add_argument('--timeout', help='Maximum duration in seconds after a single test is considered to have timed out', type=int, default=120)
@@ -251,8 +266,8 @@
 	except KeyboardInterrupt:
 		return False, ""
-	except Exception as ex:
-		print("Unexpected error in worker thread running {}: {}".format(t.target(), ex), file=sys.stderr)
-		sys.stderr.flush()
-		return False, ""
+	# except Exception as ex:
+	# 	print("Unexpected error in worker thread running {}: {}".format(t.target(), ex), file=sys.stderr)
+	# 	sys.stderr.flush()
+	# 	return False, ""
 
 
@@ -362,5 +377,6 @@
 		# for each build configurations, run the test
 		with Timed() as total_dur:
-			for arch, debug, install in itertools.product(settings.all_arch, settings.all_debug, settings.all_install):
+			for ast, arch, debug, install in itertools.product(settings.all_ast, settings.all_arch, settings.all_debug, settings.all_install):
+				settings.ast     = ast
 				settings.arch    = arch
 				settings.debug   = debug
@@ -369,5 +385,6 @@
 				# filter out the tests for a different architecture
 				# tests are the same across debug/install
-				local_tests = settings.arch.filter( tests )
+				local_tests = settings.ast.filter( tests )
+				local_tests = settings.arch.filter( local_tests )
 				options.jobs, forceJobs = job_count( options, local_tests )
 				settings.update_make_cmd(forceJobs, options.jobs)
@@ -377,11 +394,15 @@
 
 				# print configuration
-				print('%s %i tests on %i cores (%s:%s)' % (
+				print('%s %i tests on %i cores (%s:%s - %s)' % (
 					'Regenerating' if settings.generating else 'Running',
 					len(local_tests),
 					options.jobs,
+					settings.ast.string,
 					settings.arch.string,
 					settings.debug.string
 				))
+				if not local_tests :
+					print('WARNING: No tests for this configuration')
+					continue
 
 				# otherwise run all tests and make sure to return the correct error code
Index: tools/stat.py
===================================================================
--- tools/stat.py	(revision 55acc3a4fa45a500fa88a39b850dd11a68275702)
+++ tools/stat.py	(revision 139775ec573763a1fd42cb0e38c3be4185217464)
@@ -1,3 +1,3 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 
 import sys
@@ -17,5 +17,5 @@
 		avg = numpy.mean  (content)
 		std = numpy.std   (content)
-		print "median {0:.1f} avg {1:.1f} stddev {2:.1f}".format( med, avg, std )
+		print("median {0:.1f} avg {1:.1f} stddev {2:.1f}".format( med, avg, std ))
 
 
