Index: benchmark/Makefile.in
===================================================================
--- benchmark/Makefile.in	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ benchmark/Makefile.in	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -349,6 +349,10 @@
 AUTOMAKE_OPTIONS = foreign    # do not require all the GNU file names
 ACLOCAL_AMFLAGS = -I automake
-CFACOMPILE = $(CFACC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CFAFLAGS) $(CFAFLAGS) $(AM_CFLAGS) $(CFLAGS)
-LTCFACOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+AM_T_CFA = $(am__t_CFA_@AM_T@)
+am__t_CFA_ = 
+am__t_CFA_0 = 
+am__t_CFA_1 = /usr/bin/time --quiet -f "$@ %E" # trailing space is necessary
+CFACOMPILE = $(AM_T_CFA)$(CFACC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CFAFLAGS) $(CFAFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCFACOMPILE = $(AM_T_CFA)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=compile $(CFACC) $(DEFS) \
 	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CFAFLAGS) $(AM_CFLAGS) $(CFAFLAGS) $(CFLAGS)
Index: benchmark/io/readv.cfa
===================================================================
--- benchmark/io/readv.cfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ benchmark/io/readv.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -12,7 +12,9 @@
 }
 
+#include <errno.h>
 #include <unistd.h>
 
 #include <clock.hfa>
+#include <iofwd.hfa>
 #include <kernel.hfa>
 #include <thread.hfa>
@@ -23,7 +25,4 @@
 
 extern bool traceHeapOn();
-extern ssize_t cfa_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
-extern ssize_t cfa_preadv2_fixed(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
-extern void register_fixed_files( cluster &, int *, unsigned count );
 
 int fd;
@@ -31,5 +30,5 @@
 volatile size_t count = 0;
 
-unsigned long int buflen = 50;
+unsigned long int buflen = 512;
 bool fixed_file = false;
 
@@ -40,10 +39,10 @@
 
 int do_read(int fd, struct iovec * iov) {
+	// Issues one single-iovec read at offset 0. Callee prototype, kept for reference: extern ssize_t cfa_preadv2(int, const struct iovec *, int, off_t, int, int = 0, Duration = -1`s, io_cancellation * = 0p, io_context * = 0p);
+	int sflags = 0;	// submit flags, passed as the 6th argument of cfa_preadv2
 	if(fixed_file) {
-		return cfa_preadv2_fixed(fd, iov, 1, 0, 0);
+		sflags |= CFA_IO_FIXED_FD1;	// replaces the separate cfa_preadv2_fixed entry point; presumably marks fd as a registered-file index (TODO confirm in iofwd.hfa)
 	}
-	else {
-		return cfa_preadv2(fd, iov, 1, 0, 0);
-	}
+	return cfa_preadv2(fd, iov, 1, 0, 0, sflags, -1`s, 0p, 0p);	// iovcnt = 1, offset = 0, flags = 0; the last three arguments repeat the defaults from the prototype above
 }
 
@@ -52,10 +51,10 @@
 	/* paranoid */ assert( true == __atomic_load_n(&run, __ATOMIC_RELAXED) );
 
-	char data[buflen];
+	__attribute__((aligned(512)))  char data[buflen];
 	struct iovec iov = { data, buflen };
 
 	while(__atomic_load_n(&run, __ATOMIC_RELAXED)) {
 		int r = do_read(fd, &iov);
-		if(r < 0) abort("%s\n", strerror(-r));
+		if(r < 0) abort("%s\n", strerror(errno));
 
 		__atomic_fetch_add( &count, 1, __ATOMIC_SEQ_CST );
@@ -65,5 +64,6 @@
 int main(int argc, char * argv[]) {
 	BENCH_DECL
-	unsigned flags = 0;
+	unsigned num_io = 1;
+	io_context_params params;
 	int file_flags = 0;
 	unsigned sublen = 16;
@@ -74,9 +74,10 @@
 			BENCH_OPT_LONG
 			{"bufsize",       required_argument, 0, 'b'},
-			{"userthread",    no_argument      , 0, 'u'},
 			{"submitthread",  no_argument      , 0, 's'},
 			{"eagersubmit",   no_argument      , 0, 'e'},
 			{"kpollsubmit",   no_argument      , 0, 'k'},
 			{"kpollcomplete", no_argument      , 0, 'i'},
+			{"fixed-files",   no_argument      , 0, 'f'},
+			{"open-direct",   no_argument      , 0, 'o'},
 			{"submitlength",  required_argument, 0, 'l'},
 			{0, 0, 0, 0}
@@ -84,5 +85,5 @@
 
 		int idx = 0;
-		int opt = getopt_long(argc, argv, BENCH_OPT_SHORT "b:usekil:", options, &idx);
+		int opt = getopt_long(argc, argv, BENCH_OPT_SHORT "b:sekil:", options, &idx);
 
 		const char * arg = optarg ? optarg : "";
@@ -100,19 +101,18 @@
 				}
 				break;
-			case 'u':
-				flags |= CFA_CLUSTER_IO_POLLER_USER_THREAD;
-				break;
 			case 's':
-				flags |= CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS;
+				params.poller_submits = true;
 				break;
 			case 'e':
-				flags |= CFA_CLUSTER_IO_EAGER_SUBMITS;
+				params.eager_submits = true;
 				break;
 			case 'k':
-				flags |= CFA_CLUSTER_IO_KERNEL_POLL_SUBMITS;
+				params.poll_submit = true;	// intentional fallthrough: like the 'k' case it replaces, kernel-polled submission also enables fixed files
+			case 'f':	// --fixed-files; NOTE(review): 'f' is not in the short-option string given to getopt_long, so this looks long-option only unless BENCH_OPT_SHORT adds it
 				fixed_file = true;
 				break;
 			case 'i':
-				flags |= CFA_CLUSTER_IO_KERNEL_POLL_COMPLETES;
+				params.poll_complete = true;	// intentional fallthrough: like the 'i' case it replaces, kernel-polled completion also opens the file with O_DIRECT
+			case 'o':	// --open-direct; NOTE(review): 'o' is likewise absent from the short-option string
 				file_flags |= O_DIRECT;
 				break;
@@ -123,5 +123,5 @@
 					goto usage;
 				}
-				flags |= (sublen << CFA_CLUSTER_IO_BUFFLEN_OFFSET);
+				// flags |= (sublen << CFA_CLUSTER_IO_BUFFLEN_OFFSET);
 				break;
 			default: /* ? */
@@ -150,5 +150,5 @@
 	{
 		Time start, end;
-		BenchCluster cl = { flags, CFA_STATS_READY_Q | CFA_STATS_IO };
+		BenchCluster cl = { num_io, params, CFA_STATS_READY_Q | CFA_STATS_IO };
 
 		if(fixed_file) {
@@ -179,4 +179,5 @@
 				printf("\nDone\n");
 			}
+			printf("Readers closed\n");
 		}
 		printf("Took %'ld ms\n", (end - start)`ms);
Index: libcfa/Makefile.in
===================================================================
--- libcfa/Makefile.in	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/Makefile.in	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -218,4 +218,5 @@
 AMTAR = @AMTAR@
 AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
+AM_T = @AM_T@
 AR = @AR@
 ARCHITECTURE = @ARCHITECTURE@
Index: libcfa/configure
===================================================================
--- libcfa/configure	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/configure	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -701,4 +701,5 @@
 CFA_PREFIX
 CFA_NAME
+AM_T
 BUILDLIB_FALSE
 BUILDLIB_TRUE
@@ -3187,4 +3188,7 @@
   BUILDLIB_FALSE=
 fi
+
+
+AM_T='$(T)'
 
 
@@ -17017,4 +17021,5 @@
 
 
+
 for ac_header in linux/io_uring.h
 do :
@@ -19295,4 +19300,55 @@
 
 
+
+fi
+
+
+
+	# check support for various io_uring flags
+
+		ac_fn_c_check_decl "$LINENO" "IOSQE_FIXED_FILE" "ac_cv_have_decl_IOSQE_FIXED_FILE" "#include <linux/io_uring.h>
+"
+if test "x$ac_cv_have_decl_IOSQE_FIXED_FILE" = xyes; then :
+  $as_echo "#define CFA_HAVE_IOSQE_FIXED_FILE 1" >>confdefs.h
+
+fi
+
+
+		ac_fn_c_check_decl "$LINENO" "IOSQE_IO_DRAIN" "ac_cv_have_decl_IOSQE_IO_DRAIN" "#include <linux/io_uring.h>
+"
+if test "x$ac_cv_have_decl_IOSQE_IO_DRAIN" = xyes; then :
+  $as_echo "#define CFA_HAVE_IOSQE_IO_DRAIN 1" >>confdefs.h
+
+fi
+
+
+		ac_fn_c_check_decl "$LINENO" "IOSQE_ASYNC" "ac_cv_have_decl_IOSQE_ASYNC" "#include <linux/io_uring.h>
+"
+if test "x$ac_cv_have_decl_IOSQE_ASYNC" = xyes; then :
+  $as_echo "#define CFA_HAVE_IOSQE_ASYNC 1" >>confdefs.h
+
+fi
+
+
+		ac_fn_c_check_decl "$LINENO" "IOSQE_IO_LINK" "ac_cv_have_decl_IOSQE_IO_LINK" "#include <linux/io_uring.h>
+"
+if test "x$ac_cv_have_decl_IOSQE_IO_LINK" = xyes; then :
+  $as_echo "#define CFA_HAVE_IOSQE_IO_LINK 1" >>confdefs.h
+
+fi
+
+
+		ac_fn_c_check_decl "$LINENO" "IOSQE_IO_HARDLINK" "ac_cv_have_decl_IOSQE_IO_HARDLINK" "#include <linux/io_uring.h>
+"
+if test "x$ac_cv_have_decl_IOSQE_IO_HARDLINK" = xyes; then :
+  $as_echo "#define CFA_HAVE_IOSQE_IO_HARDLINK 1" >>confdefs.h
+
+fi
+
+
+		ac_fn_c_check_decl "$LINENO" "SPLICE_F_FD_IN_FIXED" "ac_cv_have_decl_SPLICE_F_FD_IN_FIXED" "#include <linux/io_uring.h>
+"
+if test "x$ac_cv_have_decl_SPLICE_F_FD_IN_FIXED" = xyes; then :
+  $as_echo "#define CFA_HAVE_SPLICE_F_FD_IN_FIXED 1" >>confdefs.h
 
 fi
Index: libcfa/configure.ac
===================================================================
--- libcfa/configure.ac	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/configure.ac	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -105,4 +105,7 @@
 AM_CONDITIONAL([BUILDLIB], [test "x${CONFIG_BUILDLIB}" = "xyes"])
 
+AM_T='$(T)'
+AC_SUBST(AM_T)
+
 #==============================================================================
 #Trasforming cc1 will break compilation
@@ -129,4 +132,5 @@
 #io_uring 5.6 and later uses probes
 define(ioring_ops, [IORING_OP_NOP,IORING_OP_READV,IORING_OP_WRITEV,IORING_OP_FSYNC,IORING_OP_READ_FIXED,IORING_OP_WRITE_FIXED,IORING_OP_POLL_ADD,IORING_OP_POLL_REMOVE,IORING_OP_SYNC_FILE_RANGE,IORING_OP_SENDMSG,IORING_OP_RECVMSG,IORING_OP_TIMEOUT,IORING_OP_TIMEOUT_REMOVE,IORING_OP_ACCEPT,IORING_OP_ASYNC_CANCEL,IORING_OP_LINK_TIMEOUT,IORING_OP_CONNECT,IORING_OP_FALLOCATE,IORING_OP_OPENAT,IORING_OP_CLOSE,IORING_OP_FILES_UPDATE,IORING_OP_STATX,IORING_OP_READ,IORING_OP_WRITE,IORING_OP_FADVISE,IORING_OP_MADVISE,IORING_OP_SEND,IORING_OP_RECV,IORING_OP_OPENAT2,IORING_OP_EPOLL_CTL,IORING_OP_SPLICE,IORING_OP_PROVIDE_BUFFERS,IORING_OP_REMOVE_BUFFER])
+define(ioring_flags, [IOSQE_FIXED_FILE,IOSQE_IO_DRAIN,IOSQE_ASYNC,IOSQE_IO_LINK,IOSQE_IO_HARDLINK,SPLICE_F_FD_IN_FIXED])
 
 define(ioring_from_decls, [
@@ -166,4 +170,9 @@
 		ioring_from_decls
 	])
+
+	# check support for various io_uring flags
+	m4_foreach([op], [ioring_flags], [
+		AC_CHECK_DECL(op, [AC_DEFINE([CFA_HAVE_]op)], [], [[#include <linux/io_uring.h>]])
+	])
 ])
 AC_CHECK_FUNCS([preadv2 pwritev2])
Index: libcfa/prelude/Makefile.in
===================================================================
--- libcfa/prelude/Makefile.in	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/prelude/Makefile.in	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -180,4 +180,5 @@
 AMTAR = @AMTAR@
 AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
+AM_T = @AM_T@
 AR = @AR@
 ARCHITECTURE = @ARCHITECTURE@
Index: libcfa/prelude/defines.hfa.in
===================================================================
--- libcfa/prelude/defines.hfa.in	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/prelude/defines.hfa.in	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -50,4 +50,11 @@
 #undef CFA_HAVE_IORING_OP_REMOVE_BUFFER
 
+#undef CFA_HAVE_IOSQE_FIXED_FILE
+#undef CFA_HAVE_IOSQE_IO_DRAIN
+#undef CFA_HAVE_IOSQE_ASYNC
+#undef CFA_HAVE_IOSQE_IO_LINK
+#undef CFA_HAVE_IOSQE_IO_HARDLINK
+#undef CFA_HAVE_SPLICE_F_FD_IN_FIXED
+
 #undef HAVE_PREADV2
 #undef HAVE_PWRITEV2
Index: libcfa/src/Makefile.am
===================================================================
--- libcfa/src/Makefile.am	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/Makefile.am	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -50,7 +50,15 @@
 
 # not all platforms support concurrency, add option do disable it
-thread_headers_nosrc = concurrency/invoke.h
-thread_headers = concurrency/coroutine.hfa concurrency/thread.hfa concurrency/kernel.hfa concurrency/monitor.hfa concurrency/mutex.hfa
-thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/io.cfa concurrency/iocall.cfa concurrency/preemption.cfa concurrency/ready_queue.cfa concurrency/stats.cfa ${thread_headers:.hfa=.cfa}
+thread_headers_nosrc = bits/random.hfa concurrency/invoke.h concurrency/kernel/fwd.hfa
+
+thread_headers = concurrency/coroutine.hfa concurrency/thread.hfa concurrency/kernel.hfa \
+		concurrency/monitor.hfa concurrency/mutex.hfa
+
+thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa \
+		concurrency/invoke.c concurrency/io.cfa concurrency/iocall.cfa \
+		concurrency/io/setup.cfa \
+		concurrency/kernel/startup.cfa concurrency/preemption.cfa \
+		concurrency/ready_queue.cfa concurrency/stats.cfa \
+		${thread_headers:.hfa=.cfa}
 else
 headers =
Index: libcfa/src/Makefile.in
===================================================================
--- libcfa/src/Makefile.in	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/Makefile.in	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -166,4 +166,5 @@
 	concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa \
 	concurrency/invoke.c concurrency/io.cfa concurrency/iocall.cfa \
+	concurrency/io/setup.cfa concurrency/kernel/startup.cfa \
 	concurrency/preemption.cfa concurrency/ready_queue.cfa \
 	concurrency/stats.cfa concurrency/coroutine.cfa \
@@ -177,4 +178,6 @@
 @BUILDLIB_TRUE@	concurrency/alarm.lo concurrency/invoke.lo \
 @BUILDLIB_TRUE@	concurrency/io.lo concurrency/iocall.lo \
+@BUILDLIB_TRUE@	concurrency/io/setup.lo \
+@BUILDLIB_TRUE@	concurrency/kernel/startup.lo \
 @BUILDLIB_TRUE@	concurrency/preemption.lo \
 @BUILDLIB_TRUE@	concurrency/ready_queue.lo concurrency/stats.lo \
@@ -249,5 +252,6 @@
 	concurrency/coroutine.hfa concurrency/thread.hfa \
 	concurrency/kernel.hfa concurrency/monitor.hfa \
-	concurrency/mutex.hfa concurrency/invoke.h
+	concurrency/mutex.hfa bits/random.hfa concurrency/invoke.h \
+	concurrency/kernel/fwd.hfa
 HEADERS = $(nobase_cfa_include_HEADERS)
 am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
@@ -277,4 +281,5 @@
 AMTAR = @AMTAR@
 AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
+AM_T = @AM_T@
 AR = @AR@
 ARCHITECTURE = @ARCHITECTURE@
@@ -421,6 +426,10 @@
 AUTOMAKE_OPTIONS = foreign subdir-objects
 ACLOCAL_AMFLAGS = -I automake
-CFACOMPILE = $(CFACC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CFAFLAGS) $(CFAFLAGS) $(AM_CFLAGS) $(CFLAGS)
-LTCFACOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+AM_T_CFA = $(am__t_CFA_@AM_T@)
+am__t_CFA_ = 
+am__t_CFA_0 = 
+am__t_CFA_1 = /usr/bin/time --quiet -f "$@ %E" # trailing space is necessary
+CFACOMPILE = $(AM_T_CFA)$(CFACC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CFAFLAGS) $(CFAFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCFACOMPILE = $(AM_T_CFA)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=compile $(CFACC) $(DEFS) \
 	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CFAFLAGS) $(AM_CFLAGS) $(CFAFLAGS) $(CFLAGS)
@@ -483,8 +492,16 @@
 
 # not all platforms support concurrency, add option do disable it
-@BUILDLIB_TRUE@thread_headers_nosrc = concurrency/invoke.h
+@BUILDLIB_TRUE@thread_headers_nosrc = bits/random.hfa concurrency/invoke.h concurrency/kernel/fwd.hfa
 @BUILDLIB_FALSE@thread_headers = 
-@BUILDLIB_TRUE@thread_headers = concurrency/coroutine.hfa concurrency/thread.hfa concurrency/kernel.hfa concurrency/monitor.hfa concurrency/mutex.hfa
-@BUILDLIB_TRUE@thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/io.cfa concurrency/iocall.cfa concurrency/preemption.cfa concurrency/ready_queue.cfa concurrency/stats.cfa ${thread_headers:.hfa=.cfa}
+@BUILDLIB_TRUE@thread_headers = concurrency/coroutine.hfa concurrency/thread.hfa concurrency/kernel.hfa \
+@BUILDLIB_TRUE@		concurrency/monitor.hfa concurrency/mutex.hfa
+
+@BUILDLIB_TRUE@thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa \
+@BUILDLIB_TRUE@		concurrency/invoke.c concurrency/io.cfa concurrency/iocall.cfa \
+@BUILDLIB_TRUE@		concurrency/io/setup.cfa \
+@BUILDLIB_TRUE@		concurrency/kernel/startup.cfa concurrency/preemption.cfa \
+@BUILDLIB_TRUE@		concurrency/ready_queue.cfa concurrency/stats.cfa \
+@BUILDLIB_TRUE@		${thread_headers:.hfa=.cfa}
+
 
 #----------------------------------------------------------------------------------------------------------------
@@ -625,4 +642,20 @@
 concurrency/iocall.lo: concurrency/$(am__dirstamp) \
 	concurrency/$(DEPDIR)/$(am__dirstamp)
+concurrency/io/$(am__dirstamp):
+	@$(MKDIR_P) concurrency/io
+	@: > concurrency/io/$(am__dirstamp)
+concurrency/io/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) concurrency/io/$(DEPDIR)
+	@: > concurrency/io/$(DEPDIR)/$(am__dirstamp)
+concurrency/io/setup.lo: concurrency/io/$(am__dirstamp) \
+	concurrency/io/$(DEPDIR)/$(am__dirstamp)
+concurrency/kernel/$(am__dirstamp):
+	@$(MKDIR_P) concurrency/kernel
+	@: > concurrency/kernel/$(am__dirstamp)
+concurrency/kernel/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) concurrency/kernel/$(DEPDIR)
+	@: > concurrency/kernel/$(DEPDIR)/$(am__dirstamp)
+concurrency/kernel/startup.lo: concurrency/kernel/$(am__dirstamp) \
+	concurrency/kernel/$(DEPDIR)/$(am__dirstamp)
 concurrency/preemption.lo: concurrency/$(am__dirstamp) \
 	concurrency/$(DEPDIR)/$(am__dirstamp)
@@ -651,4 +684,8 @@
 	-rm -f concurrency/*.$(OBJEXT)
 	-rm -f concurrency/*.lo
+	-rm -f concurrency/io/*.$(OBJEXT)
+	-rm -f concurrency/io/*.lo
+	-rm -f concurrency/kernel/*.$(OBJEXT)
+	-rm -f concurrency/kernel/*.lo
 	-rm -f containers/*.$(OBJEXT)
 	-rm -f containers/*.lo
@@ -717,4 +754,6 @@
 	-rm -rf bits/.libs bits/_libs
 	-rm -rf concurrency/.libs concurrency/_libs
+	-rm -rf concurrency/io/.libs concurrency/io/_libs
+	-rm -rf concurrency/kernel/.libs concurrency/kernel/_libs
 	-rm -rf containers/.libs containers/_libs
 install-nobase_cfa_includeHEADERS: $(nobase_cfa_include_HEADERS)
@@ -862,4 +901,8 @@
 	-rm -f concurrency/$(DEPDIR)/$(am__dirstamp)
 	-rm -f concurrency/$(am__dirstamp)
+	-rm -f concurrency/io/$(DEPDIR)/$(am__dirstamp)
+	-rm -f concurrency/io/$(am__dirstamp)
+	-rm -f concurrency/kernel/$(DEPDIR)/$(am__dirstamp)
+	-rm -f concurrency/kernel/$(am__dirstamp)
 	-rm -f containers/$(DEPDIR)/$(am__dirstamp)
 	-rm -f containers/$(am__dirstamp)
Index: libcfa/src/bits/debug.hfa
===================================================================
--- libcfa/src/bits/debug.hfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/bits/debug.hfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -15,4 +15,6 @@
 
 #pragma once
+
+#include <assert.h>
 
 #ifdef __CFA_DEBUG__
Index: libcfa/src/bits/defs.hfa
===================================================================
--- libcfa/src/bits/defs.hfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/bits/defs.hfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -16,6 +16,4 @@
 #pragma once
 
-#include <stdbool.h>
-#include <stddef.h>
 #include <stdint.h>
 
@@ -54,74 +52,2 @@
     return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
 }
-
-// #define __CFA_NO_BIT_TEST_AND_SET__
-
-#if defined( __i386 )
-static inline bool __atomic_bts(volatile unsigned long int * target, unsigned long int bit ) {
-	#if defined(__CFA_NO_BIT_TEST_AND_SET__)
-        unsigned long int mask = 1ul << bit;
-        unsigned long int ret = __atomic_fetch_or(target, mask, (int)__ATOMIC_RELAXED);
-        return (ret & mask) != 0;
-    #else
-        int result = 0;
-        asm volatile(
-            "LOCK btsl %[bit], %[target]\n\t"
-            : "=@ccc" (result)
-            : [target] "m" (*target), [bit] "r" (bit)
-        );
-        return result != 0;
-    #endif
-}
-
-static inline bool __atomic_btr(volatile unsigned long int * target, unsigned long int bit ) {
-	#if defined(__CFA_NO_BIT_TEST_AND_SET__)
-        unsigned long int mask = 1ul << bit;
-        unsigned long int ret = __atomic_fetch_and(target, ~mask, (int)__ATOMIC_RELAXED);
-        return (ret & mask) != 0;
-	#else
-        int result = 0;
-        asm volatile(
-            "LOCK btrl %[bit], %[target]\n\t"
-            :"=@ccc" (result)
-            : [target] "m" (*target), [bit] "r" (bit)
-        );
-        return result != 0;
-    #endif
-}
-#elif defined( __x86_64 )
-static inline bool __atomic_bts(volatile unsigned long long int * target, unsigned long long int bit ) {
-	#if defined(__CFA_NO_BIT_TEST_AND_SET__)
-        unsigned long long int mask = 1ul << bit;
-        unsigned long long int ret = __atomic_fetch_or(target, mask, (int)__ATOMIC_RELAXED);
-        return (ret & mask) != 0;
-    #else
-        int result = 0;
-        asm volatile(
-            "LOCK btsq %[bit], %[target]\n\t"
-            : "=@ccc" (result)
-            : [target] "m" (*target), [bit] "r" (bit)
-        );
-        return result != 0;
-    #endif
-}
-
-static inline bool __atomic_btr(volatile unsigned long long int * target, unsigned long long int bit ) {
-	#if defined(__CFA_NO_BIT_TEST_AND_SET__)
-        unsigned long long int mask = 1ul << bit;
-        unsigned long long int ret = __atomic_fetch_and(target, ~mask, (int)__ATOMIC_RELAXED);
-        return (ret & mask) != 0;
-	#else
-        int result = 0;
-        asm volatile(
-            "LOCK btrq %[bit], %[target]\n\t"
-            :"=@ccc" (result)
-            : [target] "m" (*target), [bit] "r" (bit)
-        );
-        return result != 0;
-    #endif
-}
-#elif defined( __ARM_ARCH )
-    #error __atomic_bts and __atomic_btr not implemented for arm
-#else
-	#error uknown hardware architecture
-#endif
Index: libcfa/src/bits/locks.hfa
===================================================================
--- libcfa/src/bits/locks.hfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/bits/locks.hfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -130,5 +130,5 @@
 		pthread_mutex_init(&lock, &mattr);
 
-		pthread_cond_init (&cond, 0p);
+		pthread_cond_init (&cond, (const pthread_condattr_t *)0p);  // workaround trac#208: cast should not be required
 		val = 0;
 	}
@@ -164,3 +164,58 @@
 
 	#undef CHECKED
+
+	struct $thread;
+	extern void park( __cfaabi_dbg_ctx_param );
+	extern void unpark( struct $thread * this __cfaabi_dbg_ctx_param2 );
+	static inline struct $thread * active_thread ();
+
+	// Semaphore which only supports a single waiting thread. ptr encodes the state: 0p = idle, 1p = a post is pending, any other value = the thread registered by wait()
+	struct single_sem {
+		struct $thread * volatile ptr;	// written by the constructor, afterwards only changed by compare-and-swap in wait()/post()
+	};
+
+	static inline {
+		void  ?{}(single_sem & this) {	// constructor: start in the idle state
+			this.ptr = 0p;
+		}
+
+		void ^?{}(single_sem & this) {}	// destructor: nothing to release
+
+		bool wait(single_sem & this) {	// returns true if the caller parked, false if it consumed an already-pending post
+			for() {	// retry until one of the compare-and-swaps below succeeds
+				struct $thread * expected = this.ptr;
+				if(expected == 1p) {	// a post is already pending: try to consume it without parking
+					if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+						return false;
+					}
+				}
+				else {
+					/* paranoid */ verify( expected == 0p );	// any other value would mean a thread is already registered as the waiter
+					if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {	// register the calling thread as the waiter, then park
+						park( __cfaabi_dbg_ctx );
+						return true;
+					}
+				}
+
+			}
+		}
+
+		bool post(single_sem & this) {	// returns true only when a registered waiter was passed to unpark()
+			for() {	// retry until a compare-and-swap succeeds or a post is found already pending
+				struct $thread * expected = this.ptr;
+				if(expected == 1p) return false;	// a post is already pending: nothing more to record
+				if(expected == 0p) {	// nobody is waiting: leave a pending post for a later wait()
+					if(__atomic_compare_exchange_n(&this.ptr, &expected, 1p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+						return false;
+					}
+				}
+				else {	// a thread is registered: reset to idle first, then wake it
+					if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+						unpark( expected __cfaabi_dbg_ctx2 );
+						return true;
+					}
+				}
+			}
+		}
+	}
 #endif
Index: libcfa/src/concurrency/alarm.cfa
===================================================================
--- libcfa/src/concurrency/alarm.cfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/concurrency/alarm.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -23,5 +23,5 @@
 
 #include "alarm.hfa"
-#include "kernel_private.hfa"
+#include "kernel/fwd.hfa"
 #include "preemption.hfa"
 
Index: libcfa/src/concurrency/invoke.h
===================================================================
--- libcfa/src/concurrency/invoke.h	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/concurrency/invoke.h	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -17,4 +17,5 @@
 #include "bits/defs.hfa"
 #include "bits/locks.hfa"
+#include "kernel/fwd.hfa"
 
 #ifdef __cforall
@@ -25,44 +26,4 @@
 #ifndef _INVOKE_H_
 #define _INVOKE_H_
-
-#ifdef __ARM_ARCH
-	// function prototypes are only really used by these macros on ARM
-	void disable_global_interrupts();
-	void enable_global_interrupts();
-
-	#define TL_GET( member ) ( { __typeof__( kernelTLS.member ) target; \
-                disable_global_interrupts(); \
-                target = kernelTLS.member; \
-                enable_global_interrupts(); \
-                target; } )
-	#define TL_SET( member, value ) disable_global_interrupts(); \
-		kernelTLS.member = value; \
-		enable_global_interrupts();
-#else
-	#define TL_GET( member ) kernelTLS.member
-	#define TL_SET( member, value ) kernelTLS.member = value;
-#endif
-
-	#ifdef __cforall
-	extern "Cforall" {
-		extern __attribute__((aligned(128))) thread_local struct KernelThreadData {
-			struct $thread    * volatile this_thread;
-			struct processor  * volatile this_processor;
-			struct __stats_t  * volatile this_stats;
-
-			struct {
-				volatile unsigned short disable_count;
-				volatile bool enabled;
-				volatile bool in_progress;
-			} preemption_state;
-
-			#if defined(__SIZEOF_INT128__)
-				__uint128_t rand_seed;
-			#else
-				uint64_t rand_seed;
-			#endif
-		} kernelTLS __attribute__ ((tls_model ( "initial-exec" )));
-	}
-	#endif
 
 	struct __stack_context_t {
@@ -98,5 +59,4 @@
 
 	enum __Coroutine_State { Halted, Start, Primed, Blocked, Ready, Active };
-	enum __Preemption_Reason { __NO_PREEMPTION, __ALARM_PREEMPTION, __POLL_PREEMPTION, __MANUAL_PREEMPTION };
 
 	struct $coroutine {
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/concurrency/io.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -14,4 +14,6 @@
 //
 
+#define __cforall_thread__
+
 #if defined(__CFA_DEBUG__)
 	// #define __CFA_DEBUG_PRINT_IO__
@@ -19,33 +21,15 @@
 #endif
 
-#include "kernel.hfa"
-#include "bitmanip.hfa"
-
-#if !defined(CFA_HAVE_LINUX_IO_URING_H)
-	void __kernel_io_startup( cluster &, unsigned, bool ) {
-		// Nothing to do without io_uring
-	}
-
-	void __kernel_io_finish_start( cluster & ) {
-		// Nothing to do without io_uring
-	}
-
-	void __kernel_io_prepare_stop( cluster & ) {
-		// Nothing to do without io_uring
-	}
-
-	void __kernel_io_shutdown( cluster &, bool ) {
-		// Nothing to do without io_uring
-	}
-
-#else
+
+#if defined(CFA_HAVE_LINUX_IO_URING_H)
 	#define _GNU_SOURCE         /* See feature_test_macros(7) */
 	#include <errno.h>
+	#include <signal.h>
 	#include <stdint.h>
 	#include <string.h>
 	#include <unistd.h>
-	#include <sys/mman.h>
 
 	extern "C" {
+		#include <sys/epoll.h>
 		#include <sys/syscall.h>
 
@@ -53,389 +37,16 @@
 	}
 
-	#include "bits/signal.hfa"
-	#include "kernel_private.hfa"
-	#include "thread.hfa"
-
-	uint32_t entries_per_cluster() {
-		return 256;
-	}
-
-	static void * __io_poller_slow( void * arg );
-
-	// Weirdly, some systems that do support io_uring don't actually define these
-	#ifdef __alpha__
-		/*
-		* alpha is the only exception, all other architectures
-		* have common numbers for new system calls.
-		*/
-		#ifndef __NR_io_uring_setup
-			#define __NR_io_uring_setup           535
-		#endif
-		#ifndef __NR_io_uring_enter
-			#define __NR_io_uring_enter           536
-		#endif
-		#ifndef __NR_io_uring_register
-			#define __NR_io_uring_register        537
-		#endif
-	#else /* !__alpha__ */
-		#ifndef __NR_io_uring_setup
-			#define __NR_io_uring_setup           425
-		#endif
-		#ifndef __NR_io_uring_enter
-			#define __NR_io_uring_enter           426
-		#endif
-		#ifndef __NR_io_uring_register
-			#define __NR_io_uring_register        427
-		#endif
-	#endif
-
-	// Fast poller user-thread
-	// Not using the "thread" keyword because we want to control
-	// more carefully when to start/stop it
-	struct __io_poller_fast {
-		struct __io_data * ring;
-		$thread thrd;
-	};
-
-	void ?{}( __io_poller_fast & this, struct cluster & cltr ) {
-		this.ring = cltr.io;
-		(this.thrd){ "Fast I/O Poller", cltr };
-	}
-	void ^?{}( __io_poller_fast & mutex this );
-	void main( __io_poller_fast & this );
-	static inline $thread * get_thread( __io_poller_fast & this ) { return &this.thrd; }
-	void ^?{}( __io_poller_fast & mutex this ) {}
-
-	struct __submition_data {
-		// Head and tail of the ring (associated with array)
-		volatile uint32_t * head;
-		volatile uint32_t * tail;
-		volatile uint32_t prev_head;
-
-		// The actual kernel ring which uses head/tail
-		// indexes into the sqes arrays
-		uint32_t * array;
-
-		// number of entries and mask to go with it
-		const uint32_t * num;
-		const uint32_t * mask;
-
-		// Submission flags (Not sure what for)
-		uint32_t * flags;
-
-		// number of sqes not submitted (whatever that means)
-		uint32_t * dropped;
-
-		// Like head/tail but not seen by the kernel
-		volatile uint32_t * ready;
-		uint32_t ready_cnt;
-
-		__spinlock_t lock;
-		__spinlock_t release_lock;
-
-		// A buffer of sqes (not the actual ring)
-		struct io_uring_sqe * sqes;
-
-		// The location and size of the mmaped area
-		void * ring_ptr;
-		size_t ring_sz;
-	};
-
-	struct __completion_data {
-		// Head and tail of the ring
-		volatile uint32_t * head;
-		volatile uint32_t * tail;
-
-		// number of entries and mask to go with it
-		const uint32_t * mask;
-		const uint32_t * num;
-
-		// number of cqes not submitted (whatever that means)
-		uint32_t * overflow;
-
-		// the kernel ring
-		struct io_uring_cqe * cqes;
-
-		// The location and size of the mmaped area
-		void * ring_ptr;
-		size_t ring_sz;
-	};
-
-	struct __io_data {
-		struct __submition_data submit_q;
-		struct __completion_data completion_q;
-		uint32_t ring_flags;
-		int cltr_flags;
-		int fd;
-		semaphore submit;
-		volatile bool done;
-		struct {
-			struct {
-				__processor_id_t id;
-				void * stack;
-				pthread_t kthrd;
-				volatile bool blocked;
-			} slow;
-			__io_poller_fast fast;
-			__bin_sem_t sem;
-		} poller;
-	};
-
-//=============================================================================================
-// I/O Startup / Shutdown logic
-//=============================================================================================
-	void __kernel_io_startup( cluster & this, unsigned io_flags, bool main_cluster ) {
-		if( (io_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS) && (io_flags & CFA_CLUSTER_IO_EAGER_SUBMITS) ) {
-			abort("CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS and CFA_CLUSTER_IO_EAGER_SUBMITS cannot be mixed\n");
-		}
-
-		this.io = malloc();
-
-		// Step 1 : call to setup
-		struct io_uring_params params;
-		memset(&params, 0, sizeof(params));
-		if( io_flags & CFA_CLUSTER_IO_KERNEL_POLL_SUBMITS   ) params.flags |= IORING_SETUP_SQPOLL;
-		if( io_flags & CFA_CLUSTER_IO_KERNEL_POLL_COMPLETES ) params.flags |= IORING_SETUP_IOPOLL;
-
-		uint32_t nentries = entries_per_cluster();
-
-		int fd = syscall(__NR_io_uring_setup, nentries, &params );
-		if(fd < 0) {
-			abort("KERNEL ERROR: IO_URING SETUP - %s\n", strerror(errno));
-		}
-
-		// Step 2 : mmap result
-		memset( this.io, 0, sizeof(struct __io_data) );
-		struct __submition_data  & sq = this.io->submit_q;
-		struct __completion_data & cq = this.io->completion_q;
-
-		// calculate the right ring size
-		sq.ring_sz = params.sq_off.array + (params.sq_entries * sizeof(unsigned)           );
-		cq.ring_sz = params.cq_off.cqes  + (params.cq_entries * sizeof(struct io_uring_cqe));
-
-		// Requires features
-		#if defined(IORING_FEAT_SINGLE_MMAP)
-			// adjust the size according to the parameters
-			if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
-				cq.ring_sz = sq.ring_sz = max(cq.ring_sz, sq.ring_sz);
-			}
-		#endif
-
-		// mmap the Submit Queue into existence
-		sq.ring_ptr = mmap(0, sq.ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
-		if (sq.ring_ptr == (void*)MAP_FAILED) {
-			abort("KERNEL ERROR: IO_URING MMAP1 - %s\n", strerror(errno));
-		}
-
-		// Requires features
-		#if defined(IORING_FEAT_SINGLE_MMAP)
-			// mmap the Completion Queue into existence (may or may not be needed)
-			if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
-				cq.ring_ptr = sq.ring_ptr;
-			}
-			else
-		#endif
-		{
-			// We need multiple call to MMAP
-			cq.ring_ptr = mmap(0, cq.ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
-			if (cq.ring_ptr == (void*)MAP_FAILED) {
-				munmap(sq.ring_ptr, sq.ring_sz);
-				abort("KERNEL ERROR: IO_URING MMAP2 - %s\n", strerror(errno));
-			}
-		}
-
-		// mmap the submit queue entries
-		size_t size = params.sq_entries * sizeof(struct io_uring_sqe);
-		sq.sqes = (struct io_uring_sqe *)mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
-		if (sq.sqes == (struct io_uring_sqe *)MAP_FAILED) {
-			munmap(sq.ring_ptr, sq.ring_sz);
-			if (cq.ring_ptr != sq.ring_ptr) munmap(cq.ring_ptr, cq.ring_sz);
-			abort("KERNEL ERROR: IO_URING MMAP3 - %s\n", strerror(errno));
-		}
-
-		// Get the pointers from the kernel to fill the structure
-		// submit queue
-		sq.head    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.head);
-		sq.tail    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.tail);
-		sq.mask    = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_mask);
-		sq.num     = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_entries);
-		sq.flags   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.flags);
-		sq.dropped = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.dropped);
-		sq.array   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);
-		sq.prev_head = *sq.head;
-
-		{
-			const uint32_t num = *sq.num;
-			for( i; num ) {
-				sq.sqes[i].user_data = 0ul64;
-			}
-		}
-
-		(sq.lock){};
-		(sq.release_lock){};
-
-		if( io_flags & ( CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS | CFA_CLUSTER_IO_EAGER_SUBMITS ) ) {
-			/* paranoid */ verify( is_pow2( io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET ) || ((io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET) < 8)  );
-			sq.ready_cnt = max(io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET, 8);
-			sq.ready = alloc_align( 64, sq.ready_cnt );
-			for(i; sq.ready_cnt) {
-				sq.ready[i] = -1ul32;
-			}
-		}
-		else {
-			sq.ready_cnt = 0;
-			sq.ready = 0p;
-		}
-
-		// completion queue
-		cq.head     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.head);
-		cq.tail     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.tail);
-		cq.mask     = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_mask);
-		cq.num      = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_entries);
-		cq.overflow = (         uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.overflow);
-		cq.cqes   = (struct io_uring_cqe *)(((intptr_t)cq.ring_ptr) + params.cq_off.cqes);
-
-		// some paranoid checks
-		/* paranoid */ verifyf( (*cq.mask) == ((*cq.num) - 1ul32), "IO_URING Expected mask to be %u (%u entries), was %u", (*cq.num) - 1ul32, *cq.num, *cq.mask  );
-		/* paranoid */ verifyf( (*cq.num)  >= nentries, "IO_URING Expected %u entries, got %u", nentries, *cq.num );
-		/* paranoid */ verifyf( (*cq.head) == 0, "IO_URING Expected head to be 0, got %u", *cq.head );
-		/* paranoid */ verifyf( (*cq.tail) == 0, "IO_URING Expected tail to be 0, got %u", *cq.tail );
-
-		/* paranoid */ verifyf( (*sq.mask) == ((*sq.num) - 1ul32), "IO_URING Expected mask to be %u (%u entries), was %u", (*sq.num) - 1ul32, *sq.num, *sq.mask );
-		/* paranoid */ verifyf( (*sq.num) >= nentries, "IO_URING Expected %u entries, got %u", nentries, *sq.num );
-		/* paranoid */ verifyf( (*sq.head) == 0, "IO_URING Expected head to be 0, got %u", *sq.head );
-		/* paranoid */ verifyf( (*sq.tail) == 0, "IO_URING Expected tail to be 0, got %u", *sq.tail );
-
-		// Update the global ring info
-		this.io->ring_flags = params.flags;
-		this.io->cltr_flags = io_flags;
-		this.io->fd         = fd;
-		this.io->done       = false;
-		(this.io->submit){ min(*sq.num, *cq.num) };
-
-		if(!main_cluster) {
-			__kernel_io_finish_start( this );
-		}
-	}
-
-	void __kernel_io_finish_start( cluster & this ) {
-		if( this.io->cltr_flags & CFA_CLUSTER_IO_POLLER_USER_THREAD ) {
-			__cfadbg_print_safe(io_core, "Kernel I/O : Creating fast poller for cluter %p\n", &this);
-			(this.io->poller.fast){ this };
-			__thrd_start( this.io->poller.fast, main );
-		}
-
-		// Create the poller thread
-		__cfadbg_print_safe(io_core, "Kernel I/O : Creating slow poller for cluster %p\n", &this);
-		this.io->poller.slow.blocked = false;
-		this.io->poller.slow.stack = __create_pthread( &this.io->poller.slow.kthrd, __io_poller_slow, &this );
-	}
-
-	void __kernel_io_prepare_stop( cluster & this ) {
-		__cfadbg_print_safe(io_core, "Kernel I/O : Stopping pollers for cluster\n", &this);
-		// Notify the poller thread of the shutdown
-		__atomic_store_n(&this.io->done, true, __ATOMIC_SEQ_CST);
-
-		// Stop the IO Poller
-		sigval val = { 1 };
-		pthread_sigqueue( this.io->poller.slow.kthrd, SIGUSR1, val );
-		post( this.io->poller.sem );
-
-		// Wait for the poller thread to finish
-		pthread_join( this.io->poller.slow.kthrd, 0p );
-		free( this.io->poller.slow.stack );
-
-		__cfadbg_print_safe(io_core, "Kernel I/O : Slow poller stopped for cluster\n", &this);
-
-		if( this.io->cltr_flags & CFA_CLUSTER_IO_POLLER_USER_THREAD ) {
-			with( this.io->poller.fast ) {
-				/* paranoid */ verify( this.nprocessors == 0 || &this == mainCluster );
-				/* paranoid */ verify( !ready_mutate_islocked() );
-
-				// We need to adjust the clean-up based on where the thread is
-				if( thrd.state == Ready || thrd.preempted != __NO_PREEMPTION ) {
-
-					ready_schedule_lock( (struct __processor_id_t *)active_processor() );
-
-						// This is the tricky case
-						// The thread was preempted and now it is on the ready queue
-						// The thread should be the last on the list
-						/* paranoid */ verify( thrd.link.next != 0p );
-
-						// Remove the thread from the ready queue of this cluster
-						__attribute__((unused)) bool removed = remove_head( &this, &thrd );
-						/* paranoid */ verify( removed );
-						thrd.link.next = 0p;
-						thrd.link.prev = 0p;
-						__cfaabi_dbg_debug_do( thrd.unpark_stale = true );
-
-						// Fixup the thread state
-						thrd.state = Blocked;
-						thrd.ticket = 0;
-						thrd.preempted = __NO_PREEMPTION;
-
-					ready_schedule_unlock( (struct __processor_id_t *)active_processor() );
-
-					// Pretend like the thread was blocked all along
-				}
-				// !!! This is not an else if !!!
-				if( thrd.state == Blocked ) {
-
-					// This is the "easy case"
-					// The thread is parked and can easily be moved to active cluster
-					verify( thrd.curr_cluster != active_cluster() || thrd.curr_cluster == mainCluster );
-					thrd.curr_cluster = active_cluster();
-
-					// unpark the fast io_poller
-					unpark( &thrd __cfaabi_dbg_ctx2 );
-				}
-				else {
-
-					// The thread is in a weird state
-					// I don't know what to do here
-					abort("Fast poller thread is in unexpected state, cannot clean-up correctly\n");
-				}
-
-			}
-
-			^(this.io->poller.fast){};
-
-			__cfadbg_print_safe(io_core, "Kernel I/O : Fast poller stopped for cluster\n", &this);
-		}
-	}
-
-	void __kernel_io_shutdown( cluster & this, bool main_cluster ) {
-		if(!main_cluster) {
-			__kernel_io_prepare_stop( this );
-		}
-
-		// Shutdown the io rings
-		struct __submition_data  & sq = this.io->submit_q;
-		struct __completion_data & cq = this.io->completion_q;
-
-		// unmap the submit queue entries
-		munmap(sq.sqes, (*sq.num) * sizeof(struct io_uring_sqe));
-
-		// unmap the Submit Queue ring
-		munmap(sq.ring_ptr, sq.ring_sz);
-
-		// unmap the Completion Queue ring, if it is different
-		if (cq.ring_ptr != sq.ring_ptr) {
-			munmap(cq.ring_ptr, cq.ring_sz);
-		}
-
-		// close the file descriptor
-		close(this.io->fd);
-
-		free( this.io->submit_q.ready ); // Maybe null, doesn't matter
-		free( this.io );
-	}
-
-	int __io_uring_enter( struct __io_data & ring, unsigned to_submit, bool get, sigset_t * mask ) {
+	#include "stats.hfa"
+	#include "kernel.hfa"
+	#include "kernel/fwd.hfa"
+	#include "io/types.hfa"
+
+//=============================================================================================
+// I/O Syscall
+//=============================================================================================
+	static int __io_uring_enter( struct __io_data & ring, unsigned to_submit, bool get ) {
 		bool need_sys_to_submit = false;
 		bool need_sys_to_complete = false;
-		unsigned min_complete = 0;
 		unsigned flags = 0;
-
 
 		TO_SUBMIT:
@@ -451,12 +62,6 @@
 		}
 
-		TO_COMPLETE:
 		if( get && !(ring.ring_flags & IORING_SETUP_SQPOLL) ) {
 			flags |= IORING_ENTER_GETEVENTS;
-			if( mask ) {
-				need_sys_to_complete = true;
-				min_complete = 1;
-				break TO_COMPLETE;
-			}
 			if( (ring.ring_flags & IORING_SETUP_IOPOLL) ) {
 				need_sys_to_complete = true;
@@ -466,5 +71,5 @@
 		int ret = 0;
 		if( need_sys_to_submit || need_sys_to_complete ) {
-			ret = syscall( __NR_io_uring_enter, ring.fd, to_submit, min_complete, flags, mask, _NSIG / 8);
+			ret = syscall( __NR_io_uring_enter, ring.fd, to_submit, 0, flags, 0p, _NSIG / 8);
 			if( ret < 0 ) {
 				switch((int)errno) {
@@ -490,25 +95,24 @@
 	static uint32_t __release_consumed_submission( struct __io_data & ring );
 
-	static inline void process(struct io_uring_cqe & cqe, struct __processor_id_t * id ) {
+	static inline void process(struct io_uring_cqe & cqe ) {
 		struct __io_user_data_t * data = (struct __io_user_data_t *)(uintptr_t)cqe.user_data;
 		__cfadbg_print_safe( io, "Kernel I/O : Syscall completed : cqe %p, result %d for %p\n", data, cqe.res, data->thrd );
 
 		data->result = cqe.res;
-		if(!id) { unpark(     data->thrd __cfaabi_dbg_ctx2 ); }
-		else  { __unpark( id, data->thrd __cfaabi_dbg_ctx2 ); }
+		unpark( data->thrd __cfaabi_dbg_ctx2 );
 	}
 
 	// Process a single completion message from the io_uring
 	// This is NOT thread-safe
-	static [int, bool] __drain_io( & struct __io_data ring, * sigset_t mask ) {
+	static [int, bool] __drain_io( & struct __io_data ring ) {
 		/* paranoid */ verify( !kernelTLS.preemption_state.enabled );
 
 		unsigned to_submit = 0;
-		if( ring.cltr_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
+		if( ring.poller_submits ) {
 			// If the poller thread also submits, then we need to aggregate the submissions which are ready
 			to_submit = __collect_submitions( ring );
 		}
 
-		int ret = __io_uring_enter(ring, to_submit, true, mask);
+		int ret = __io_uring_enter(ring, to_submit, true);
 		if( ret < 0 ) {
 			return [0, true];
@@ -547,9 +151,6 @@
 			/* paranoid */ verify(&cqe);
 
-			process( cqe, !mask ? (struct __processor_id_t *)0p : &ring.poller.slow.id );
-		}
-
-		// Allow new submissions to happen
-		// V(ring.submit, count);
+			process( cqe );
+		}
 
 		// Mark to the kernel that the cqe has been seen
@@ -561,99 +162,18 @@
 	}
 
-	static void * __io_poller_slow( void * arg ) {
-		#if !defined( __CFA_NO_STATISTICS__ )
-			__stats_t local_stats;
-			__init_stats( &local_stats );
-			kernelTLS.this_stats = &local_stats;
-		#endif
-
-		cluster * cltr = (cluster *)arg;
-		struct __io_data & ring = *cltr->io;
-
-		ring.poller.slow.id.id = doregister( &ring.poller.slow.id );
-
-		sigset_t mask;
-		sigfillset(&mask);
-		if ( pthread_sigmask( SIG_BLOCK, &mask, 0p ) == -1 ) {
-			abort( "KERNEL ERROR: IO_URING - pthread_sigmask" );
-		}
-
-		sigdelset( &mask, SIGUSR1 );
-
-		verify( (*ring.submit_q.head) == (*ring.submit_q.tail) );
-		verify( (*ring.completion_q.head) == (*ring.completion_q.tail) );
-
-		__cfadbg_print_safe(io_core, "Kernel I/O : Slow poller for ring %p ready\n", &ring);
-
-		if( ring.cltr_flags & CFA_CLUSTER_IO_POLLER_USER_THREAD ) {
-			while(!__atomic_load_n(&ring.done, __ATOMIC_SEQ_CST)) {
-
-				__atomic_store_n( &ring.poller.slow.blocked, true, __ATOMIC_SEQ_CST );
-
-				// In the user-thread approach drain and if anything was drained,
-				// batton pass to the user-thread
-				int count;
-				bool again;
-				[count, again] = __drain_io( ring, &mask );
-
-				__atomic_store_n( &ring.poller.slow.blocked, false, __ATOMIC_SEQ_CST );
-
-				// Update statistics
-				__STATS__( true,
-					io.complete_q.completed_avg.val += count;
-					io.complete_q.completed_avg.slow_cnt += 1;
-				)
-
-				if(again) {
-					__cfadbg_print_safe(io_core, "Kernel I/O : Moving to ring %p to fast poller\n", &ring);
-					__unpark( &ring.poller.slow.id, &ring.poller.fast.thrd __cfaabi_dbg_ctx2 );
-					wait( ring.poller.sem );
-				}
-			}
-		}
-		else {
-			while(!__atomic_load_n(&ring.done, __ATOMIC_SEQ_CST)) {
-				//In the naive approach, just poll the io completion queue directly
-				int count;
-				bool again;
-				[count, again] = __drain_io( ring, &mask );
-
-				// Update statistics
-				__STATS__( true,
-					io.complete_q.completed_avg.val += count;
-					io.complete_q.completed_avg.slow_cnt += 1;
-				)
-			}
-		}
-
-		__cfadbg_print_safe(io_core, "Kernel I/O : Slow poller for ring %p stopping\n", &ring);
-
-		unregister( &ring.poller.slow.id );
-
-		#if !defined(__CFA_NO_STATISTICS__)
-			__tally_stats(cltr->stats, &local_stats);
-		#endif
-
-		return 0p;
-	}
-
-	void main( __io_poller_fast & this ) {
-		verify( this.ring->cltr_flags & CFA_CLUSTER_IO_POLLER_USER_THREAD );
-
-		// Start parked
-		park( __cfaabi_dbg_ctx );
-
-		__cfadbg_print_safe(io_core, "Kernel I/O : Fast poller for ring %p ready\n", &this.ring);
+	void main( $io_ctx_thread & this ) {
+		epoll_event ev;
+		__ioctx_register( this, ev );
+
+		__cfadbg_print_safe(io_core, "Kernel I/O : IO poller %p for ring %p ready\n", &this, &this.ring);
 
 		int reset = 0;
-
 		// Then loop until we need to start
-		while(!__atomic_load_n(&this.ring->done, __ATOMIC_SEQ_CST)) {
-
+		while(!__atomic_load_n(&this.done, __ATOMIC_SEQ_CST)) {
 			// Drain the io
 			int count;
 			bool again;
 			disable_interrupts();
-				[count, again] = __drain_io( *this.ring, 0p );
+				[count, again] = __drain_io( *this.ring );
 
 				if(!again) reset++;
@@ -672,24 +192,14 @@
 			// We didn't get anything baton pass to the slow poller
 			else {
-				__cfadbg_print_safe(io_core, "Kernel I/O : Moving to ring %p to slow poller\n", &this.ring);
+				__cfadbg_print_safe(io_core, "Kernel I/O : Parking io poller %p\n", &this.self);
 				reset = 0;
 
-				// wake up the slow poller
-				post( this.ring->poller.sem );
-
-				// park this thread
-				park( __cfaabi_dbg_ctx );
+				// block this thread
+				__ioctx_prepare_block( this, ev );
+				wait( this.sem );
 			}
 		}
 
 		__cfadbg_print_safe(io_core, "Kernel I/O : Fast poller for ring %p stopping\n", &this.ring);
-	}
-
-	static inline void __wake_poller( struct __io_data & ring ) __attribute__((artificial));
-	static inline void __wake_poller( struct __io_data & ring ) {
-		if(!__atomic_load_n( &ring.poller.slow.blocked, __ATOMIC_SEQ_CST)) return;
-
-		sigval val = { 1 };
-		pthread_sigqueue( ring.poller.slow.kthrd, SIGUSR1, val );
 	}
 
@@ -806,19 +316,20 @@
 	}
 
-	void __submit( struct __io_data & ring, uint32_t idx ) {
+	void __submit( struct io_context * ctx, uint32_t idx ) __attribute__((nonnull (1))) {
+		__io_data & ring = *ctx->thrd.ring;
 		// Get now the data we definetely need
 		uint32_t * const tail = ring.submit_q.tail;
-		const uint32_t mask = *ring.submit_q.mask;
+		const uint32_t mask  = *ring.submit_q.mask;
 
 		// There are 2 submission schemes, check which one we are using
-		if( ring.cltr_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
+		if( ring.poller_submits ) {
 			// If the poller thread submits, then we just need to add this to the ready array
 			__submit_to_ready_array( ring, idx, mask );
 
-			__wake_poller( ring );
+			post( ctx->thrd.sem );
 
 			__cfadbg_print_safe( io, "Kernel I/O : Added %u to ready for %p\n", idx, active_thread() );
 		}
-		else if( ring.cltr_flags & CFA_CLUSTER_IO_EAGER_SUBMITS ) {
+		else if( ring.eager_submits ) {
 			uint32_t picked = __submit_to_ready_array( ring, idx, mask );
 
@@ -849,5 +360,5 @@
 			// We got the lock
 			unsigned to_submit = __collect_submitions( ring );
-			int ret = __io_uring_enter( ring, to_submit, false, 0p );
+			int ret = __io_uring_enter( ring, to_submit, false );
 			if( ret < 0 ) {
 				unlock(ring.submit_q.lock);
@@ -892,5 +403,5 @@
 
 			// Submit however, many entries need to be submitted
-			int ret = __io_uring_enter( ring, 1, false, 0p );
+			int ret = __io_uring_enter( ring, 1, false );
 			if( ret < 0 ) {
 				switch((int)errno) {
@@ -958,16 +469,3 @@
 		return count;
 	}
-
-//=============================================================================================
-// I/O Submissions
-//=============================================================================================
-
-	void register_fixed_files( cluster & cl, int * files, unsigned count ) {
-		int ret = syscall( __NR_io_uring_register, cl.io->fd, IORING_REGISTER_FILES, files, count );
-		if( ret < 0 ) {
-			abort( "KERNEL ERROR: IO_URING SYSCALL - (%d) %s\n", (int)errno, strerror(errno) );
-		}
-
-		__cfadbg_print_safe( io_core, "Kernel I/O : Performed io_register for %p, returned %d\n", active_thread(), ret );
-	}
 #endif
Index: libcfa/src/concurrency/io/setup.cfa
===================================================================
--- libcfa/src/concurrency/io/setup.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
+++ libcfa/src/concurrency/io/setup.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -0,0 +1,475 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2020 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// io/setup.cfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Fri Jul 31 16:25:51 2020
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#define __cforall_thread__
+#define _GNU_SOURCE         /* See feature_test_macros(7) */
+
+#include "io/types.hfa"
+#include "kernel.hfa"
+
+#if !defined(CFA_HAVE_LINUX_IO_URING_H)
+	void __kernel_io_startup() {
+		// Nothing to do without io_uring
+	}
+
+	void __kernel_io_shutdown() {
+		// Nothing to do without io_uring
+	}
+
+	void ?{}(io_context_params & this) {}
+
+	void ?{}(io_context & this, struct cluster & cl) {}
+	void ?{}(io_context & this, struct cluster & cl, const io_context_params & params) {}
+
+	void ^?{}(io_context & this) {}
+	void ^?{}(io_context & this, bool cluster_context) {}
+
+#else
+	#include <errno.h>
+	#include <stdint.h>
+	#include <string.h>
+	#include <signal.h>
+	#include <unistd.h>
+
+	extern "C" {
+		#include <pthread.h>
+		#include <sys/epoll.h>
+		#include <sys/mman.h>
+		#include <sys/syscall.h>
+
+		#include <linux/io_uring.h>
+	}
+
+	#include "bitmanip.hfa"
+	#include "kernel_private.hfa"
+	#include "thread.hfa"
+
+	void ?{}(io_context_params & this) {
+		this.num_entries = 256;
+		this.num_ready = 256;
+		this.submit_aff = -1;
+		this.eager_submits = false;
+		this.poller_submits = false;
+		this.poll_submit = false;
+		this.poll_complete = false;
+	}
+
+	// (forward declaration of __io_poller_slow dropped: this file never defines or calls it)
+
+	// Weirdly, some systems that do support io_uring don't actually define these
+	#ifdef __alpha__
+		/*
+		* alpha is the only exception, all other architectures
+		* have common numbers for new system calls.
+		*/
+		#ifndef __NR_io_uring_setup
+			#define __NR_io_uring_setup           535
+		#endif
+		#ifndef __NR_io_uring_enter
+			#define __NR_io_uring_enter           536
+		#endif
+		#ifndef __NR_io_uring_register
+			#define __NR_io_uring_register        537
+		#endif
+	#else /* !__alpha__ */
+		#ifndef __NR_io_uring_setup
+			#define __NR_io_uring_setup           425
+		#endif
+		#ifndef __NR_io_uring_enter
+			#define __NR_io_uring_enter           426
+		#endif
+		#ifndef __NR_io_uring_register
+			#define __NR_io_uring_register        427
+		#endif
+	#endif
+
+//=============================================================================================
+// I/O Startup / Shutdown logic + Master Poller
+//=============================================================================================
+
+	// IO Master poller loop forward
+	static void * iopoll_loop( __attribute__((unused)) void * args );
+
+	static struct {
+		pthread_t     thrd;    // pthread handle to io poller thread
+		void *        stack;   // pthread stack for io poller thread
+		int           epollfd; // file descriptor to the epoll instance
+		volatile bool run;     // Whether or not to continue
+	} iopoll;
+
+	void __kernel_io_startup(void) { // creates the shared epoll instance, then launches the master poller pthread
+		__cfaabi_dbg_print_safe( "Kernel : Creating EPOLL instance\n" );
+
+		iopoll.epollfd = epoll_create1(0);
+		if (iopoll.epollfd == -1) {
+			abort( "internal error, epoll_create1\n");
+		}
+
+		__cfaabi_dbg_print_safe( "Kernel : Starting io poller thread\n" );
+
+		iopoll.run = true; // set before the thread exists: iopoll_loop exits once it reads false
+		iopoll.stack = __create_pthread( &iopoll.thrd, iopoll_loop, 0p ); // returned stack is freed in __kernel_io_shutdown
+	}
+
+	void __kernel_io_shutdown(void) {
+		// Notify the io poller thread of the shutdown: clear the run flag, then signal it so its epoll_pwait is interrupted
+		iopoll.run = false;
+		sigval val = { 1 };
+		pthread_sigqueue( iopoll.thrd, SIGUSR1, val );
+
+		// Wait for the io poller thread to finish
+
+		pthread_join( iopoll.thrd, 0p );
+		free( iopoll.stack ); // stack returned by __create_pthread in __kernel_io_startup
+
+		int ret = close(iopoll.epollfd);
+		if (ret == -1) {
+			abort( "internal error, close epoll\n");
+		}
+
+		// Io polling is now fully stopped
+
+		__cfaabi_dbg_print_safe( "Kernel : IO poller stopped\n" );
+	}
+
+	static void * iopoll_loop( __attribute__((unused)) void * args ) { // master poller: posts the semaphore of each io_context whose ring fd reported an event
+		__processor_id_t id;
+		id.id = doregister(&id);
+		__cfaabi_dbg_print_safe( "Kernel : IO poller thread starting\n" );
+
+		// Block every signal on this thread; SIGUSR1 is only let through while inside epoll_pwait
+		sigset_t mask;
+		sigfillset(&mask);
+		if ( pthread_sigmask( SIG_BLOCK, &mask, 0p ) != 0 ) { // reports failure as a non-zero error number, not -1
+			abort( "internal error, pthread_sigmask" );
+		}
+
+		sigdelset( &mask, SIGUSR1 );
+
+		// Room for up to 10 events per wake-up
+		struct epoll_event events[10];
+		// Main loop: runs until __kernel_io_shutdown clears iopoll.run
+		while( iopoll.run ) {
+			// Wait for events
+			int nfds = epoll_pwait( iopoll.epollfd, events, 10, -1, &mask );
+
+			// Check if an error occurred
+			if (nfds == -1) {
+				if( errno == EINTR ) continue; // interrupted by a signal (SIGUSR1 on shutdown): go re-check iopoll.run
+				abort( "internal error, epoll_pwait - (%d) %s\n", (int)errno, strerror(errno) );
+			}
+
+			for(i; nfds) {
+				$io_ctx_thread * io_ctx = ($io_ctx_thread *)(uintptr_t)events[i].data.u64; // pointer stored by __ioctx_register
+				/* paranoid */ verify( io_ctx );
+				__cfadbg_print_safe(io_core, "Kernel I/O : Unparking io poller %p\n", io_ctx);
+				#if !defined( __CFA_NO_STATISTICS__ )
+					kernelTLS.this_stats = io_ctx->self.curr_cluster->stats;
+				#endif
+				__post( io_ctx->sem, &id );
+			}
+		}
+
+		__cfaabi_dbg_print_safe( "Kernel : IO poller thread stopping\n" );
+		unregister(&id);
+		return 0p;
+	}
+
+//=============================================================================================
+// I/O Context Construction/Destruction
+//=============================================================================================
+
+	void ?{}($io_ctx_thread & this, struct cluster & cl) { (this.self){ "IO Poller", cl }; }
+	void main( $io_ctx_thread & this );
+	static inline $thread * get_thread( $io_ctx_thread & this ) { return &this.self; }
+	void ^?{}( $io_ctx_thread & mutex this ) {}
+
+	static void __io_create ( __io_data & this, const io_context_params & params_in );
+	static void __io_destroy( __io_data & this );
+
+	void ?{}(io_context & this, struct cluster & cl, const io_context_params & params) {
+		(this.thrd){ cl };
+		this.thrd.ring = malloc();
+		__cfadbg_print_safe(io_core, "Kernel I/O : Creating ring for io_context %p\n", &this);
+		__io_create( *this.thrd.ring, params );
+
+		__cfadbg_print_safe(io_core, "Kernel I/O : Starting poller thread for io_context %p\n", &this);
+		this.thrd.done = false;
+		__thrd_start( this.thrd, main );
+
+		__cfadbg_print_safe(io_core, "Kernel I/O : io_context %p ready\n", &this);
+	}
+
+	void ?{}(io_context & this, struct cluster & cl) {
+		io_context_params params;
+		(this){ cl, params };
+	}
+
+	void ^?{}(io_context & this, bool cluster_context) {
+		__cfadbg_print_safe(io_core, "Kernel I/O : tearing down io_context %p\n", &this);
+
+		// Notify the thread of the shutdown
+		__atomic_store_n(&this.thrd.done, true, __ATOMIC_SEQ_CST);
+
+		// If this is an io_context within a cluster, things get trickier
+		$thread & thrd = this.thrd.self;
+		if( cluster_context ) {
+			cluster & cltr = *thrd.curr_cluster;
+			/* paranoid */ verify( cltr.nprocessors == 0 || &cltr == mainCluster );
+			/* paranoid */ verify( !ready_mutate_islocked() );
+
+			// We need to adjust the clean-up based on where the thread is
+			if( thrd.state == Ready || thrd.preempted != __NO_PREEMPTION ) {
+
+				ready_schedule_lock( (struct __processor_id_t *)active_processor() );
+
+					// This is the tricky case
+					// The thread was preempted and now it is on the ready queue
+					// The thread should be the last on the list
+					/* paranoid */ verify( thrd.link.next != 0p );
+
+					// Remove the thread from the ready queue of this cluster
+					__attribute__((unused)) bool removed = remove_head( &cltr, &thrd );
+					/* paranoid */ verify( removed );
+					thrd.link.next = 0p;
+					thrd.link.prev = 0p;
+					__cfaabi_dbg_debug_do( thrd.unpark_stale = true );
+
+					// Fixup the thread state
+					thrd.state = Blocked;
+					thrd.ticket = 0;
+					thrd.preempted = __NO_PREEMPTION;
+
+				ready_schedule_unlock( (struct __processor_id_t *)active_processor() );
+
+				// Pretend like the thread was blocked all along
+			}
+			// !!! This is not an else if !!!
+			if( thrd.state == Blocked ) {
+
+				// This is the "easy case"
+				// The thread is parked and can easily be moved to active cluster
+				verify( thrd.curr_cluster != active_cluster() || thrd.curr_cluster == mainCluster );
+				thrd.curr_cluster = active_cluster();
+
+				// unpark the fast io_poller
+				unpark( &thrd __cfaabi_dbg_ctx2 );
+			}
+			else {
+
+				// The thread is in a weird state
+				// I don't know what to do here
+				abort("io_context poller thread is in unexpected state, cannot clean-up correctly\n");
+			}
+		} else {
+			unpark( &thrd __cfaabi_dbg_ctx2 );
+		}
+
+		^(this.thrd){};
+		__cfadbg_print_safe(io_core, "Kernel I/O : Stopped poller thread for io_context %p\n", &this);
+
+		__io_destroy( *this.thrd.ring );
+		__cfadbg_print_safe(io_core, "Kernel I/O : Destroyed ring for io_context %p\n", &this);
+
+		free(this.thrd.ring);
+	}
+
+	void ^?{}(io_context & this) {
+		^(this){ false };
+	}
+
+	static void __io_create( __io_data & this, const io_context_params & params_in ) {
+		// Step 1 : call to setup
+		struct io_uring_params params;
+		memset(&params, 0, sizeof(params));
+		if( params_in.poll_submit   ) params.flags |= IORING_SETUP_SQPOLL;
+		if( params_in.poll_complete ) params.flags |= IORING_SETUP_IOPOLL;
+
+		uint32_t nentries = params_in.num_entries;
+
+		int fd = syscall(__NR_io_uring_setup, nentries, &params );
+		if(fd < 0) {
+			abort("KERNEL ERROR: IO_URING SETUP - %s\n", strerror(errno));
+		}
+
+		// Step 2 : mmap result
+		memset( &this, 0, sizeof(struct __io_data) );
+		struct __submition_data  & sq = this.submit_q;
+		struct __completion_data & cq = this.completion_q;
+
+		// calculate the right ring size
+		sq.ring_sz = params.sq_off.array + (params.sq_entries * sizeof(unsigned)           );
+		cq.ring_sz = params.cq_off.cqes  + (params.cq_entries * sizeof(struct io_uring_cqe));
+
+		// Requires features
+		#if defined(IORING_FEAT_SINGLE_MMAP)
+			// adjust the size according to the parameters
+			if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
+				cq.ring_sz = sq.ring_sz = max(cq.ring_sz, sq.ring_sz);
+			}
+		#endif
+
+		// mmap the Submit Queue into existence
+		sq.ring_ptr = mmap(0, sq.ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
+		if (sq.ring_ptr == (void*)MAP_FAILED) {
+			abort("KERNEL ERROR: IO_URING MMAP1 - %s\n", strerror(errno));
+		}
+
+		// Requires features
+		#if defined(IORING_FEAT_SINGLE_MMAP)
+			// mmap the Completion Queue into existence (may or may not be needed)
+			if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
+				cq.ring_ptr = sq.ring_ptr;
+			}
+			else
+		#endif
+		{
+			// We need multiple call to MMAP
+			cq.ring_ptr = mmap(0, cq.ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
+			if (cq.ring_ptr == (void*)MAP_FAILED) {
+				munmap(sq.ring_ptr, sq.ring_sz);
+				abort("KERNEL ERROR: IO_URING MMAP2 - %s\n", strerror(errno));
+			}
+		}
+
+		// mmap the submit queue entries
+		size_t size = params.sq_entries * sizeof(struct io_uring_sqe);
+		sq.sqes = (struct io_uring_sqe *)mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
+		if (sq.sqes == (struct io_uring_sqe *)MAP_FAILED) {
+			munmap(sq.ring_ptr, sq.ring_sz);
+			if (cq.ring_ptr != sq.ring_ptr) munmap(cq.ring_ptr, cq.ring_sz);
+			abort("KERNEL ERROR: IO_URING MMAP3 - %s\n", strerror(errno));
+		}
+
+		// Get the pointers from the kernel to fill the structure
+		// submit queue
+		sq.head    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.head);
+		sq.tail    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.tail);
+		sq.mask    = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_mask);
+		sq.num     = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_entries);
+		sq.flags   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.flags);
+		sq.dropped = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.dropped);
+		sq.array   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);
+		sq.prev_head = *sq.head;
+
+		{
+			const uint32_t num = *sq.num;
+			for( i; num ) {
+				sq.sqes[i].user_data = 0ul64;
+			}
+		}
+
+		(sq.lock){};
+		(sq.release_lock){};
+
+		if( params_in.poller_submits || params_in.eager_submits ) {
+			/* paranoid */ verify( is_pow2( params_in.num_ready ) || (params_in.num_ready < 8) );
+			sq.ready_cnt = max( params_in.num_ready, 8 );
+			sq.ready = alloc_align( 64, sq.ready_cnt );
+			for(i; sq.ready_cnt) {
+				sq.ready[i] = -1ul32;
+			}
+		}
+		else {
+			sq.ready_cnt = 0;
+			sq.ready = 0p;
+		}
+
+		// completion queue
+		cq.head     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.head);
+		cq.tail     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.tail);
+		cq.mask     = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_mask);
+		cq.num      = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_entries);
+		cq.overflow = (         uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.overflow);
+		cq.cqes   = (struct io_uring_cqe *)(((intptr_t)cq.ring_ptr) + params.cq_off.cqes);
+
+		// some paranoid checks
+		/* paranoid */ verifyf( (*cq.mask) == ((*cq.num) - 1ul32), "IO_URING Expected mask to be %u (%u entries), was %u", (*cq.num) - 1ul32, *cq.num, *cq.mask  );
+		/* paranoid */ verifyf( (*cq.num)  >= nentries, "IO_URING Expected %u entries, got %u", nentries, *cq.num );
+		/* paranoid */ verifyf( (*cq.head) == 0, "IO_URING Expected head to be 0, got %u", *cq.head );
+		/* paranoid */ verifyf( (*cq.tail) == 0, "IO_URING Expected tail to be 0, got %u", *cq.tail );
+
+		/* paranoid */ verifyf( (*sq.mask) == ((*sq.num) - 1ul32), "IO_URING Expected mask to be %u (%u entries), was %u", (*sq.num) - 1ul32, *sq.num, *sq.mask );
+		/* paranoid */ verifyf( (*sq.num) >= nentries, "IO_URING Expected %u entries, got %u", nentries, *sq.num );
+		/* paranoid */ verifyf( (*sq.head) == 0, "IO_URING Expected head to be 0, got %u", *sq.head );
+		/* paranoid */ verifyf( (*sq.tail) == 0, "IO_URING Expected tail to be 0, got %u", *sq.tail );
+
+		// Update the global ring info
+		this.ring_flags = params.flags;
+		this.fd         = fd;
+		this.eager_submits  = params_in.eager_submits;
+		this.poller_submits = params_in.poller_submits;
+	}
+
+	static void __io_destroy( __io_data & this ) { // unmaps the rings, closes the ring fd and frees the ready array; does not free `this` itself
+		// Shutdown the io rings
+		struct __submition_data  & sq = this.submit_q;
+		struct __completion_data & cq = this.completion_q;
+
+		// unmap the submit queue entries (sq.num points into the SQ ring mapping, so read it before that mapping is removed below)
+		munmap(sq.sqes, (*sq.num) * sizeof(struct io_uring_sqe));
+
+		// unmap the Submit Queue ring
+		munmap(sq.ring_ptr, sq.ring_sz);
+
+		// unmap the Completion Queue ring, if it is different (__io_create aliases it to the SQ mapping under IORING_FEAT_SINGLE_MMAP)
+		if (cq.ring_ptr != sq.ring_ptr) {
+			munmap(cq.ring_ptr, cq.ring_sz);
+		}
+
+		// close the file descriptor
+		close(this.fd);
+
+		free( this.submit_q.ready ); // Maybe null, doesn't matter
+	}
+
+//=============================================================================================
+// I/O Context Sleep
+//=============================================================================================
+
+	void __ioctx_register($io_ctx_thread & ctx, struct epoll_event & ev) { // fills ev and adds the context's ring fd to the master epoll instance
+		ev.events = EPOLLIN | EPOLLONESHOT;
+		ev.data.u64 = (uint64_t)(uintptr_t)&ctx; // go through uintptr_t, mirroring the decode in iopoll_loop
+		int ret = epoll_ctl(iopoll.epollfd, EPOLL_CTL_ADD, ctx.ring->fd, &ev);
+		if (ret < 0) {
+			abort( "KERNEL ERROR: EPOLL ADD - (%d) %s\n", (int)errno, strerror(errno) );
+		}
+	}
+
+	void __ioctx_prepare_block($io_ctx_thread & ctx, struct epoll_event & ev) { // called by the poller thread right before it waits on its semaphore
+		int ret = epoll_ctl(iopoll.epollfd, EPOLL_CTL_MOD, ctx.ring->fd, &ev); // presumably re-arms the EPOLLONESHOT entry set up by __ioctx_register — ev is passed unchanged
+		if (ret < 0) {
+			abort( "KERNEL ERROR: EPOLL REARM - (%d) %s\n", (int)errno, strerror(errno) );
+		}
+	}
+
+//=============================================================================================
+// I/O Context Misc Setup
+//=============================================================================================
+	void register_fixed_files( io_context & ctx, int * files, unsigned count ) { // registers `count` descriptors from `files` with this context's ring
+		int ret = syscall( __NR_io_uring_register, ctx.thrd.ring->fd, IORING_REGISTER_FILES, files, count );
+		if( ret < 0 ) { // any failure is fatal: abort with errno details
+			abort( "KERNEL ERROR: IO_URING SYSCALL - (%d) %s\n", (int)errno, strerror(errno) );
+		}
+
+		__cfadbg_print_safe( io_core, "Kernel I/O : Performed io_register for %p, returned %d\n", active_thread(), ret );
+	}
+
+	void register_fixed_files( cluster & cltr, int * files, unsigned count ) { // cluster-wide variant
+		for(i; cltr.io.cnt) { // one registration per io_context in cltr.io.ctxs
+			register_fixed_files( cltr.io.ctxs[i], files, count );
+		}
+	}
+#endif
Index: libcfa/src/concurrency/io/types.hfa
===================================================================
--- libcfa/src/concurrency/io/types.hfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
+++ libcfa/src/concurrency/io/types.hfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -0,0 +1,128 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2020 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// io/types.hfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Fri Jul 31 16:22:47 2020
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#pragma once
+
+#if defined(CFA_HAVE_LINUX_IO_URING_H)
+      #include "bits/locks.hfa"
+
+	//-----------------------------------------------------------------------
+	// Ring Data structure
+      struct __submition_data { // Submission side of one ring: pointers into the ring plus state the kernel does not see
+		// Head and tail of the ring (associated with array)
+		volatile uint32_t * head;
+		volatile uint32_t * tail;
+		volatile uint32_t prev_head; // a value, not a pointer - presumably the last head observed by the library; TODO confirm
+
+		// The actual kernel ring which uses head/tail
+		// indexes into the sqes arrays
+		uint32_t * array;
+
+		// number of entries and mask to go with it
+		const uint32_t * num;
+		const uint32_t * mask;
+
+		// Submission queue flags word (not used in the code visible here; presumably the kernel's sq flags - TODO confirm)
+		uint32_t * flags;
+
+		// Counter of sqes that were dropped instead of submitted (presumably the kernel's count of invalid entries - TODO confirm)
+		uint32_t * dropped;
+
+		// Like head/tail but not seen by the kernel
+		volatile uint32_t * ready; // released with free() on teardown; may be null
+		uint32_t ready_cnt; // presumably the number of slots behind `ready` - TODO confirm
+
+		__spinlock_t lock;
+		__spinlock_t release_lock;
+
+		// A buffer of sqes (not the actual ring)
+		struct io_uring_sqe * sqes;
+
+		// The location and size of the mmaped area
+		void * ring_ptr;
+		size_t ring_sz;
+	};
+
+	struct __completion_data { // Completion side of one ring: pointers into the ring and its mapping
+		// Head and tail of the ring
+		volatile uint32_t * head;
+		volatile uint32_t * tail;
+
+		// number of entries and mask to go with it
+		const uint32_t * mask;
+		const uint32_t * num;
+
+		// Completion overflow counter (presumably cqes the kernel could not post to the ring - TODO confirm)
+		uint32_t * overflow;
+
+		// the kernel ring
+		struct io_uring_cqe * cqes;
+
+		// The location and size of the mmaped area
+		void * ring_ptr;
+		size_t ring_sz;
+	};
+
+	struct __io_data { // One ring instance: both queues, its flags and its file descriptor
+		struct __submition_data submit_q;
+		struct __completion_data completion_q;
+		uint32_t ring_flags;
+		int fd; // the descriptor handed to epoll_ctl and to the io_uring_register syscall
+		bool eager_submits:1; // 1-bit submission-policy flag; semantics are defined by its users, not visible here
+		bool poller_submits:1; // 1-bit submission-policy flag; semantics are defined by its users, not visible here
+	};
+
+
+	//-----------------------------------------------------------------------
+	// IO user data
+	struct __io_user_data_t { // Per-operation record; its address is passed to __submit_alloc as the sqe's user data
+		int32_t result; // read by the submitter after park(); a negative value is treated as -errno
+		$thread * thrd; // initialised to active_thread() by the submitter
+	};
+
+	//-----------------------------------------------------------------------
+	// Misc
+	// Fallback syscall numbers: some systems that do support io_uring don't actually define the __NR_io_uring_* constants
+	#ifdef __alpha__
+		/*
+		* alpha is the only exception, all other architectures
+		* have common numbers for new system calls.
+		*/
+		#ifndef __NR_io_uring_setup
+			#define __NR_io_uring_setup           535
+		#endif
+		#ifndef __NR_io_uring_enter
+			#define __NR_io_uring_enter           536
+		#endif
+		#ifndef __NR_io_uring_register
+			#define __NR_io_uring_register        537
+		#endif
+	#else /* !__alpha__ */
+		#ifndef __NR_io_uring_setup
+			#define __NR_io_uring_setup           425
+		#endif
+		#ifndef __NR_io_uring_enter
+			#define __NR_io_uring_enter           426
+		#endif
+		#ifndef __NR_io_uring_register
+			#define __NR_io_uring_register        427
+		#endif
+	#endif /* __alpha__ */
+
+	struct epoll_event; // forward declaration only; this header does not include <sys/epoll.h>
+	struct $io_ctx_thread; // forward declaration only; the full type is defined elsewhere
+	void __ioctx_register($io_ctx_thread & ctx, struct epoll_event & ev); // adds the context's ring fd to the epoll set (EPOLL_CTL_ADD)
+	void __ioctx_prepare_block($io_ctx_thread & ctx, struct epoll_event & ev); // re-arms the context's ring fd (EPOLL_CTL_MOD)
+#endif
Index: libcfa/src/concurrency/iocall.cfa
===================================================================
--- libcfa/src/concurrency/iocall.cfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/concurrency/iocall.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -14,5 +14,8 @@
 //
 
+#define __cforall_thread__
+
 #include "bits/defs.hfa"
+#include "kernel.hfa"
 
 //=============================================================================================
@@ -21,11 +24,14 @@
 
 #if defined(CFA_HAVE_LINUX_IO_URING_H)
+	#include <assert.h>
 	#include <stdint.h>
+	#include <errno.h>
 	#include <linux/io_uring.h>
 
-	#include "kernel_private.hfa"
+	#include "kernel/fwd.hfa"
+	#include "io/types.hfa"
 
 	extern [* struct io_uring_sqe, uint32_t] __submit_alloc( struct __io_data & ring, uint64_t data );
-	extern void __submit( struct __io_data & ring, uint32_t idx );
+	extern void __submit( struct io_context * ctx, uint32_t idx ) __attribute__((nonnull (1)));
 
 	static inline void ?{}(struct io_uring_sqe & this, uint8_t opcode, int fd) {
@@ -52,16 +58,68 @@
 	}
 
+	static inline io_context * __get_io_context( void ) { // Default context selection: a random io context of the active cluster
+		cluster * cltr = active_cluster();
+		/* paranoid */ verifyf( cltr, "No active cluster for io operation\n");
+		assertf( cltr->io.cnt > 0, "Cluster %p has no default io contexts and no context was specified\n", cltr );
+		/* paranoid */ verifyf( cltr->io.ctxs, "default io contexts for cluster %p are missing\n", cltr);
+		return &cltr->io.ctxs[ __tls_rand() % cltr->io.cnt ]; // cnt > 0 is asserted above, so the modulo has a non-zero divisor
+	}
+
+
+      #if defined(CFA_HAVE_IOSQE_FIXED_FILE) && defined(CFA_HAVE_IOSQE_IO_DRAIN) && defined(CFA_HAVE_IOSQE_ASYNC)
+		#define REGULAR_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_DRAIN | IOSQE_ASYNC)
+	#elif defined(CFA_HAVE_IOSQE_FIXED_FILE) && defined(CFA_HAVE_IOSQE_ASYNC)
+		#define REGULAR_FLAGS (IOSQE_FIXED_FILE | IOSQE_ASYNC)
+      #elif defined(CFA_HAVE_IOSQE_FIXED_FILE) && defined(CFA_HAVE_IOSQE_IO_DRAIN)
+		#define REGULAR_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_DRAIN)
+      #elif defined(CFA_HAVE_IOSQE_IO_DRAIN) && defined(CFA_HAVE_IOSQE_ASYNC)
+		#define REGULAR_FLAGS (IOSQE_IO_DRAIN | IOSQE_ASYNC)
+	#elif defined(CFA_HAVE_IOSQE_FIXED_FILE)
+		#define REGULAR_FLAGS (IOSQE_FIXED_FILE)
+      #elif defined(CFA_HAVE_IOSQE_IO_DRAIN)
+		#define REGULAR_FLAGS (IOSQE_IO_DRAIN)
+      #elif defined(CFA_HAVE_IOSQE_ASYNC)
+		#define REGULAR_FLAGS (IOSQE_ASYNC)
+	#else // none of the three flags is available
+		#define REGULAR_FLAGS (0)
+	#endif // REGULAR_FLAGS: OR of whichever of IOSQE_FIXED_FILE / IOSQE_IO_DRAIN / IOSQE_ASYNC exist; used as the mask for submit_flags bits copied into sqe->flags
+
+	#if defined(CFA_HAVE_IOSQE_IO_LINK) && defined(CFA_HAVE_IOSQE_IO_HARDLINK)
+		#define LINK_FLAGS (IOSQE_IO_LINK | IOSQE_IO_HARDLINK)
+	#elif defined(CFA_HAVE_IOSQE_IO_LINK)
+		#define LINK_FLAGS (IOSQE_IO_LINK)
+	#elif defined(CFA_HAVE_IOSQE_IO_HARDLINK)
+		#define LINK_FLAGS (IOSQE_IO_HARDLINK)
+	#else // neither link flag is available
+		#define LINK_FLAGS (0)
+	#endif // LINK_FLAGS: the link bits; a submit_flags value containing any of them is rejected with ENOTSUP
+
+	#if defined(CFA_HAVE_SPLICE_F_FD_IN_FIXED)
+		#define SPLICE_FLAGS (SPLICE_F_FD_IN_FIXED)
+	#else // flag not available
+		#define SPLICE_FLAGS (0)
+	#endif // SPLICE_FLAGS: mask for the submit_flags bits that cfa_splice / cfa_tee OR into sqe->splice_flags
+
+
 	#define __submit_prelude \
+		if( 0 != (submit_flags & LINK_FLAGS) ) { errno = ENOTSUP; return -1; } /* linked submissions are rejected */ \
+		(void)timeout; (void)cancellation; /* accepted by the interface but not used here */ \
+		if( !context ) context = __get_io_context(); /* null context: fall back to a default context of the active cluster */ \
 		__io_user_data_t data = { 0, active_thread() }; \
-		struct __io_data & ring = *data.thrd->curr_cluster->io; \
+		struct __io_data & ring = *context->thrd.ring; \
 		struct io_uring_sqe * sqe; \
 		uint32_t idx; \
-		[sqe, idx] = __submit_alloc( ring, (uint64_t)(uintptr_t)&data );
+		[sqe, idx] = __submit_alloc( ring, (uint64_t)(uintptr_t)&data ); \
+		sqe->flags = REGULAR_FLAGS & submit_flags; // NOTE(review): assigned before the (*sqe){ ... } constructor that follows each prelude - confirm the constructor does not overwrite flags
 
 	#define __submit_wait \
 		/*__cfaabi_bits_print_safe( STDERR_FILENO, "Preparing user data %p for %p\n", &data, data.thrd );*/ \
 		verify( sqe->user_data == (uint64_t)(uintptr_t)&data ); \
-		__submit( ring, idx ); \
+		__submit( context, idx ); /* submission now goes through the io_context rather than the ring */ \
 		park( __cfaabi_dbg_ctx ); \
+		if( data.result < 0 ) { /* negative result is -errno: convert to the errno / -1 convention */ \
+			errno = -data.result; \
+			return -1; \
+		} \
 		return data.result;
 #endif
@@ -70,4 +128,5 @@
 // I/O Forwards
 //=============================================================================================
+#include <time.hfa>
 
 // Some forward declarations
@@ -121,5 +180,5 @@
 // Asynchronous operations
 #if defined(HAVE_PREADV2)
-	ssize_t cfa_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags) {
+	ssize_t cfa_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 		#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_READV)
 			return preadv2(fd, iov, iovcnt, offset, flags);
@@ -132,13 +191,14 @@
 		#endif
 	}
-
-	ssize_t cfa_preadv2_fixed(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags) {
-		#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_READV)
-			return preadv2(fd, iov, iovcnt, offset, flags);
+#endif
+
+#if defined(HAVE_PWRITEV2)
+	ssize_t cfa_pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
+		#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_WRITEV)
+			return pwritev2(fd, iov, iovcnt, offset, flags);
 		#else
 			__submit_prelude
 
-			(*sqe){ IORING_OP_READV, fd, iov, iovcnt, offset };
-			sqe->flags |= IOSQE_FIXED_FILE;
+			(*sqe){ IORING_OP_WRITEV, fd, iov, iovcnt, offset };
 
 			__submit_wait
@@ -147,19 +207,5 @@
 #endif
 
-#if defined(HAVE_PWRITEV2)
-	ssize_t cfa_pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags) {
-		#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_WRITEV)
-			return pwritev2(fd, iov, iovcnt, offset, flags);
-		#else
-			__submit_prelude
-
-			(*sqe){ IORING_OP_WRITEV, fd, iov, iovcnt, offset };
-
-			__submit_wait
-		#endif
-	}
-#endif
-
-int cfa_fsync(int fd) {
+int cfa_fsync(int fd, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_FSYNC)
 		return fsync(fd);
@@ -173,5 +219,5 @@
 }
 
-int cfa_sync_file_range(int fd, int64_t offset, int64_t nbytes, unsigned int flags) {
+int cfa_sync_file_range(int fd, int64_t offset, int64_t nbytes, unsigned int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_SYNC_FILE_RANGE)
 		return sync_file_range(fd, offset, nbytes, flags);
@@ -189,5 +235,5 @@
 
 
-ssize_t cfa_sendmsg(int sockfd, const struct msghdr *msg, int flags) {
+ssize_t cfa_sendmsg(int sockfd, const struct msghdr *msg, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_SENDMSG)
 		return sendmsg(sockfd, msg, flags);
@@ -202,5 +248,5 @@
 }
 
-ssize_t cfa_recvmsg(int sockfd, struct msghdr *msg, int flags) {
+ssize_t cfa_recvmsg(int sockfd, struct msghdr *msg, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_RECVMSG)
 		return recvmsg(sockfd, msg, flags);
@@ -215,5 +261,5 @@
 }
 
-ssize_t cfa_send(int sockfd, const void *buf, size_t len, int flags) {
+ssize_t cfa_send(int sockfd, const void *buf, size_t len, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_SEND)
 		return send( sockfd, buf, len, flags );
@@ -230,5 +276,5 @@
 }
 
-ssize_t cfa_recv(int sockfd, void *buf, size_t len, int flags) {
+ssize_t cfa_recv(int sockfd, void *buf, size_t len, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_RECV)
 		return recv( sockfd, buf, len, flags );
@@ -245,5 +291,5 @@
 }
 
-int cfa_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags) {
+int cfa_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_ACCEPT)
 		return accept4( sockfd, addr, addrlen, flags );
@@ -260,5 +306,5 @@
 }
 
-int cfa_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen) {
+int cfa_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_CONNECT)
 		return connect( sockfd, addr, addrlen );
@@ -274,5 +320,5 @@
 }
 
-int cfa_fallocate(int fd, int mode, uint64_t offset, uint64_t len) {
+int cfa_fallocate(int fd, int mode, uint64_t offset, uint64_t len, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_FALLOCATE)
 		return fallocate( fd, mode, offset, len );
@@ -291,5 +337,5 @@
 }
 
-int cfa_fadvise(int fd, uint64_t offset, uint64_t len, int advice) {
+int cfa_fadvise(int fd, uint64_t offset, uint64_t len, int advice, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_FADVISE)
 		return posix_fadvise( fd, offset, len, advice );
@@ -306,5 +352,5 @@
 }
 
-int cfa_madvise(void *addr, size_t length, int advice) {
+int cfa_madvise(void *addr, size_t length, int advice, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_MADVISE)
 		return madvise( addr, length, advice );
@@ -321,5 +367,5 @@
 }
 
-int cfa_openat(int dirfd, const char *pathname, int flags, mode_t mode) {
+int cfa_openat(int dirfd, const char *pathname, int flags, mode_t mode, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_OPENAT)
 		return openat( dirfd, pathname, flags, mode );
@@ -336,5 +382,5 @@
 }
 
-int cfa_close(int fd) {
+int cfa_close(int fd, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_CLOSE)
 		return close( fd );
@@ -350,5 +396,5 @@
 // Forward declare in case it is not supported
 struct statx;
-int cfa_statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf) {
+int cfa_statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_STATX)
 		#if defined(__NR_statx)
@@ -362,11 +408,11 @@
 
 		(*sqe){ IORING_OP_STATX, dirfd, pathname, mask, (uint64_t)statxbuf };
-		sqe->flags = flags;
-
-		__submit_wait
-	#endif
-}
-
-ssize_t cfa_read(int fd, void *buf, size_t count) {
+		sqe->statx_flags = flags;
+
+		__submit_wait
+	#endif
+}
+
+ssize_t cfa_read(int fd, void *buf, size_t count, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_READ)
 		return read( fd, buf, count );
@@ -380,5 +426,5 @@
 }
 
-ssize_t cfa_write(int fd, void *buf, size_t count) {
+ssize_t cfa_write(int fd, void *buf, size_t count, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_WRITE)
 		return read( fd, buf, count );
@@ -392,5 +438,5 @@
 }
 
-ssize_t cfa_splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags) {
+ssize_t cfa_splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_SPLICE)
 		return splice( fd_in, off_in, fd_out, off_out, len, flags );
@@ -413,39 +459,11 @@
 			sqe->splice_off_in = (uint64_t)-1;
 		}
-		sqe->splice_flags  = flags;
-
-		__submit_wait
-	#endif
-}
-
-ssize_t cfa_splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags, int in_flags, int out_flags) {
-	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_SPLICE)
-		return splice( fd_in, off_in, fd_out, off_out, len, flags );
-	#else
-		__submit_prelude
-
-		(*sqe){ IORING_OP_SPLICE, fd_out };
-		if( off_out ) {
-			sqe->off = *off_out;
-		}
-		else {
-			sqe->off = (uint64_t)-1;
-		}
-		sqe->len = len;
-		sqe->splice_fd_in  = fd_in;
-		if( off_in ) {
-			sqe->splice_off_in = *off_in;
-		}
-		else {
-			sqe->splice_off_in = (uint64_t)-1;
-		}
-		sqe->splice_flags  = flags | out_flags;
-		sqe->flags = in_flags;
-
-		__submit_wait
-	#endif
-}
-
-ssize_t cfa_tee(int fd_in, int fd_out, size_t len, unsigned int flags) {
+		sqe->splice_flags  = flags | (SPLICE_FLAGS & submit_flags);
+
+		__submit_wait
+	#endif
+}
+
+ssize_t cfa_tee(int fd_in, int fd_out, size_t len, unsigned int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
 	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_TEE)
 		return tee( fd_in, fd_out, len, flags );
@@ -455,5 +473,5 @@
 		(*sqe){ IORING_OP_TEE, fd_out, 0p, len, 0 };
 		sqe->splice_fd_in = fd_in;
-		sqe->splice_flags = flags;
+		sqe->splice_flags  = flags | (SPLICE_FLAGS & submit_flags);
 
 		__submit_wait
@@ -562,6 +580,5 @@
 
 		if( /*func == (fptr_t)splice || */
-			func == (fptr_t)(ssize_t (*)(int, loff_t *, int, loff_t *, size_t, unsigned int))cfa_splice,
-			func == (fptr_t)(ssize_t (*)(int, loff_t *, int, loff_t *, size_t, unsigned int, int, int))cfa_splice )
+			func == (fptr_t)cfa_splice )
 			#define _CFA_IO_FEATURE_CFA_HAVE_IORING_OP_SPLICE ,
 			return IS_DEFINED(CFA_HAVE_IORING_OP_SPLICE);
Index: libcfa/src/concurrency/iofwd.hfa
===================================================================
--- libcfa/src/concurrency/iofwd.hfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/concurrency/iofwd.hfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -19,6 +19,27 @@
 extern "C" {
 	#include <sys/types.h>
+	#if defined(CFA_HAVE_LINUX_IO_URING_H) // defined() form, as in the other feature tests on this macro; also valid if the macro is defined with no value
+		#include <linux/io_uring.h> // kernel header for the IOSQE_* / SPLICE_F_* names the CFA_IO_* macros below expand to
+	#endif
 }
 #include "bits/defs.hfa"
+#include "time.hfa"
+
+#if defined(CFA_HAVE_IOSQE_FIXED_FILE)
+	#define CFA_IO_FIXED_FD1 IOSQE_FIXED_FILE
+#endif // CFA_IO_FIXED_FD1: submit_flags bit forwarded to sqe->flags; left undefined when unavailable
+#if defined(CFA_HAVE_SPLICE_F_FD_IN_FIXED)
+	#define CFA_IO_FIXED_FD2 SPLICE_F_FD_IN_FIXED
+#endif // CFA_IO_FIXED_FD2: submit_flags bit forwarded to sqe->splice_flags by cfa_splice / cfa_tee; left undefined when unavailable
+#if defined(CFA_HAVE_IOSQE_IO_DRAIN)
+	#define CFA_IO_DRAIN IOSQE_IO_DRAIN
+#endif // CFA_IO_DRAIN: submit_flags bit forwarded to sqe->flags; left undefined when unavailable
+#if defined(CFA_HAVE_IOSQE_ASYNC)
+	#define CFA_IO_ASYNC IOSQE_ASYNC
+#endif // CFA_IO_ASYNC: submit_flags bit forwarded to sqe->flags; left undefined when unavailable
+
+struct cluster; // forward declaration; only used by reference in this header
+struct io_context; // forward declaration; only used by pointer or reference in this header
+struct io_cancellation; // forward declaration; only used by pointer in this header
 
 struct iovec;
@@ -27,26 +48,30 @@
 struct statx;
 
-extern ssize_t cfa_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
-extern ssize_t cfa_pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
-extern int cfa_fsync(int fd);
-extern int cfa_sync_file_range(int fd, int64_t offset, int64_t nbytes, unsigned int flags);
-extern ssize_t cfa_sendmsg(int sockfd, const struct msghdr *msg, int flags);
-extern ssize_t cfa_recvmsg(int sockfd, struct msghdr *msg, int flags);
-extern ssize_t cfa_send(int sockfd, const void *buf, size_t len, int flags);
-extern ssize_t cfa_recv(int sockfd, void *buf, size_t len, int flags);
-extern int cfa_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
-extern int cfa_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
-extern int cfa_fallocate(int fd, int mode, uint64_t offset, uint64_t len);
-extern int cfa_fadvise(int fd, uint64_t offset, uint64_t len, int advice);
-extern int cfa_madvise(void *addr, size_t length, int advice);
-extern int cfa_openat(int dirfd, const char *pathname, int flags, mode_t mode);
-extern int cfa_close(int fd);
-extern int cfa_statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf);
-extern ssize_t cfa_read(int fd, void *buf, size_t count);
-extern ssize_t cfa_write(int fd, void *buf, size_t count);
-extern ssize_t cfa_splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags);
-extern ssize_t cfa_tee(int fd_in, int fd_out, size_t len, unsigned int flags);
+extern ssize_t cfa_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p); // common tail of all calls below: submit_flags (CFA_IO_* bits), timeout, cancellation, context (0p = a default context of the active cluster)
+extern ssize_t cfa_pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern int cfa_fsync(int fd, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern int cfa_sync_file_range(int fd, int64_t offset, int64_t nbytes, unsigned int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern ssize_t cfa_sendmsg(int sockfd, const struct msghdr *msg, int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern ssize_t cfa_recvmsg(int sockfd, struct msghdr *msg, int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern ssize_t cfa_send(int sockfd, const void *buf, size_t len, int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern ssize_t cfa_recv(int sockfd, void *buf, size_t len, int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern int cfa_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern int cfa_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern int cfa_fallocate(int fd, int mode, uint64_t offset, uint64_t len, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern int cfa_fadvise(int fd, uint64_t offset, uint64_t len, int advice, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern int cfa_madvise(void *addr, size_t length, int advice, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern int cfa_openat(int dirfd, const char *pathname, int flags, mode_t mode, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern int cfa_close(int fd, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern int cfa_statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern ssize_t cfa_read(int fd, void *buf, size_t count, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern ssize_t cfa_write(int fd, void *buf, size_t count, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p); // NOTE(review): buf is not const-qualified here, unlike cfa_send's - confirm whether that is intended
+extern ssize_t cfa_splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+extern ssize_t cfa_tee(int fd_in, int fd_out, size_t len, unsigned int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
 
 //-----------------------------------------------------------------------------
 // Check if a function is blocks a only the user thread
 bool has_user_level_blocking( fptr_t func );
+
+//-----------------------------------------------------------------------------
+void register_fixed_files( io_context & ctx , int * files, unsigned count ); // registers `count` descriptors from `files` with one io context; aborts on failure
+void register_fixed_files( cluster    & cltr, int * files, unsigned count ); // same, applied to every io context of the cluster
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/concurrency/kernel.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -18,22 +18,12 @@
 
 //C Includes
-#include <stddef.h>
 #include <errno.h>
-#include <string.h>
 #include <stdio.h>
-#include <fenv.h>
 #include <signal.h>
 #include <unistd.h>
-#include <limits.h>										// PTHREAD_STACK_MIN
-#include <sys/mman.h>									// mprotect
-extern "C" {
-#include <sys/resource.h>
-}
 
 //CFA Includes
-#include "time.hfa"
 #include "kernel_private.hfa"
 #include "preemption.hfa"
-#include "startup.hfa"
 
 //Private includes
@@ -45,12 +35,4 @@
 // Some assembly required
 #if defined( __i386 )
-	#define CtxGet( ctx )        \
-		__asm__ volatile (     \
-			"movl %%esp,%0\n"\
-			"movl %%ebp,%1\n"\
-			: "=rm" (ctx.SP),\
-				"=rm" (ctx.FP) \
-		)
-
 	// mxcr : SSE Status and Control bits (control bits are preserved across function calls)
 	// fcw  : X87 FPU control word (preserved across function calls)
@@ -74,12 +56,4 @@
 
 #elif defined( __x86_64 )
-	#define CtxGet( ctx )        \
-		__asm__ volatile (     \
-			"movq %%rsp,%0\n"\
-			"movq %%rbp,%1\n"\
-			: "=rm" (ctx.SP),\
-				"=rm" (ctx.FP) \
-		)
-
 	#define __x87_store         \
 		uint32_t __mxcr;      \
@@ -102,16 +76,10 @@
 
 #elif defined( __ARM_ARCH )
-#define CtxGet( ctx ) __asm__ ( \
-		"mov %0,%%sp\n"   \
-		"mov %1,%%r11\n"   \
-	: "=rm" (ctx.SP), "=rm" (ctx.FP) )
 #else
 	#error unknown hardware architecture
 #endif
 
-//-----------------------------------------------------------------------------
-//Start and stop routine for the kernel, declared first to make sure they run first
-static void __kernel_startup (void) __attribute__(( constructor( STARTUP_PRIORITY_KERNEL ) ));
-static void __kernel_shutdown(void) __attribute__(( destructor ( STARTUP_PRIORITY_KERNEL ) ));
+extern $thread * mainThread;
+extern processor * mainProcessor;
 
 //-----------------------------------------------------------------------------
@@ -120,244 +88,7 @@
 static bool __has_next_thread(cluster * this);
 static void __run_thread(processor * this, $thread * dst);
-static bool __wake_proc(processor *);
 static bool __wake_one(struct __processor_id_t * id, cluster * cltr);
 static void __halt(processor * this);
-
-//-----------------------------------------------------------------------------
-// Kernel storage
-KERNEL_STORAGE(cluster,	             mainCluster);
-KERNEL_STORAGE(processor,            mainProcessor);
-KERNEL_STORAGE($thread,	             mainThread);
-KERNEL_STORAGE(__stack_t,            mainThreadCtx);
-KERNEL_STORAGE(__scheduler_RWLock_t, __scheduler_lock);
-#if !defined(__CFA_NO_STATISTICS__)
-KERNEL_STORAGE(__stats_t, mainProcStats);
-#endif
-
-cluster              * mainCluster;
-processor            * mainProcessor;
-$thread              * mainThread;
-__scheduler_RWLock_t * __scheduler_lock;
-
-extern "C" {
-	struct { __dllist_t(cluster) list; __spinlock_t lock; } __cfa_dbg_global_clusters;
-}
-
-size_t __page_size = 0;
-
-//-----------------------------------------------------------------------------
-// Global state
-thread_local struct KernelThreadData kernelTLS __attribute__ ((tls_model ( "initial-exec" ))) @= {
-	NULL,												// cannot use 0p
-	NULL,
-	NULL,
-	{ 1, false, false },
-};
-
-//-----------------------------------------------------------------------------
-// Struct to steal stack
-struct current_stack_info_t {
-	__stack_t * storage;								// pointer to stack object
-	void * base;										// base of stack
-	void * limit;										// stack grows towards stack limit
-	void * context;										// address of cfa_context_t
-};
-
-void ?{}( current_stack_info_t & this ) {
-	__stack_context_t ctx;
-	CtxGet( ctx );
-	this.base = ctx.FP;
-
-	rlimit r;
-	getrlimit( RLIMIT_STACK, &r);
-	size_t size = r.rlim_cur;
-
-	this.limit = (void *)(((intptr_t)this.base) - size);
-	this.context = &storage_mainThreadCtx;
-}
-
-//-----------------------------------------------------------------------------
-// Main thread construction
-
-void ?{}( $coroutine & this, current_stack_info_t * info) with( this ) {
-	stack.storage = info->storage;
-	with(*stack.storage) {
-		limit     = info->limit;
-		base      = info->base;
-	}
-	__attribute__((may_alias)) intptr_t * istorage = (intptr_t*) &stack.storage;
-	*istorage |= 0x1;
-	name = "Main Thread";
-	state = Start;
-	starter = 0p;
-	last = 0p;
-	cancellation = 0p;
-}
-
-void ?{}( $thread & this, current_stack_info_t * info) with( this ) {
-	ticket = 1;
-	state = Start;
-	self_cor{ info };
-	curr_cor = &self_cor;
-	curr_cluster = mainCluster;
-	self_mon.owner = &this;
-	self_mon.recursion = 1;
-	self_mon_p = &self_mon;
-	link.next = 0p;
-	link.prev = 0p;
-
-	node.next = 0p;
-	node.prev = 0p;
-	doregister(curr_cluster, this);
-
-	monitors{ &self_mon_p, 1, (fptr_t)0 };
-}
-
-//-----------------------------------------------------------------------------
-// Processor coroutine
-void ?{}(processorCtx_t & this) {
-
-}
-
-// Construct the processor context of non-main processors
-static void ?{}(processorCtx_t & this, processor * proc, current_stack_info_t * info) {
-	(this.__cor){ info };
-	this.proc = proc;
-}
-
-static void * __invoke_processor(void * arg);
-
-static init(processor & this, const char name[], cluster & _cltr) with( this ) {
-	this.name = name;
-	this.cltr = &_cltr;
-	id = -1u;
-	destroyer = 0p;
-	do_terminate = false;
-	preemption_alarm = 0p;
-	pending_preemption = false;
-
-	#if !defined(__CFA_NO_STATISTICS__)
-		print_stats = 0;
-		print_halts = false;
-	#endif
-
-	int target = __atomic_add_fetch( &cltr->nprocessors, 1u, __ATOMIC_SEQ_CST );
-
-	id = doregister((__processor_id_t*)&this);
-
-	// Lock the RWlock so no-one pushes/pops while we are changing the queue
-	uint_fast32_t last_size = ready_mutate_lock();
-
-		// Adjust the ready queue size
-		ready_queue_grow( cltr, target );
-
-	// Unlock the RWlock
-	ready_mutate_unlock( last_size );
-
-	__cfadbg_print_safe(runtime_core, "Kernel : core %p created\n", &this);
-}
-
-// Not a ctor, it just preps the destruction but should not destroy members
-void deinit(processor & this) {
-
-	int target = __atomic_sub_fetch( &this.cltr->nprocessors, 1u, __ATOMIC_SEQ_CST );
-
-	// Lock the RWlock so no-one pushes/pops while we are changing the queue
-	uint_fast32_t last_size = ready_mutate_lock();
-
-		// Adjust the ready queue size
-		ready_queue_shrink( this.cltr, target );
-
-		// Make sure we aren't on the idle queue
-		unsafe_remove( this.cltr->idles, &this );
-
-	// Unlock the RWlock
-	ready_mutate_unlock( last_size );
-
-	// Finally we don't need the read_lock any more
-	unregister((__processor_id_t*)&this);
-}
-
-void ?{}(processor & this, const char name[], cluster & _cltr) {
-	( this.idle ){};
-	( this.terminated ){ 0 };
-	( this.runner ){};
-	init( this, name, _cltr );
-
-	__cfadbg_print_safe(runtime_core, "Kernel : Starting core %p\n", &this);
-
-	this.stack = __create_pthread( &this.kernel_thread, __invoke_processor, (void *)&this );
-
-}
-
-void ^?{}(processor & this) with( this ){
-	if( ! __atomic_load_n(&do_terminate, __ATOMIC_ACQUIRE) ) {
-		__cfadbg_print_safe(runtime_core, "Kernel : core %p signaling termination\n", &this);
-
-		__atomic_store_n(&do_terminate, true, __ATOMIC_RELAXED);
-		__wake_proc( &this );
-
-		P( terminated );
-		verify( kernelTLS.this_processor != &this);
-	}
-
-	int err = pthread_join( kernel_thread, 0p );
-	if( err != 0 ) abort("KERNEL ERROR: joining processor %p caused error %s\n", &this, strerror(err));
-
-	free( this.stack );
-
-	deinit( this );
-}
-
-void ?{}(cluster & this, const char name[], Duration preemption_rate, unsigned io_flags) with( this ) {
-	this.name = name;
-	this.preemption_rate = preemption_rate;
-	this.nprocessors = 0;
-	ready_queue{};
-
-	#if !defined(__CFA_NO_STATISTICS__)
-		print_stats = 0;
-		stats = alloc();
-		__init_stats( stats );
-	#endif
-
-	threads{ __get };
-
-	doregister(this);
-
-	// Lock the RWlock so no-one pushes/pops while we are changing the queue
-	uint_fast32_t last_size = ready_mutate_lock();
-
-		// Adjust the ready queue size
-		ready_queue_grow( &this, 0 );
-
-	// Unlock the RWlock
-	ready_mutate_unlock( last_size );
-
-
-	__kernel_io_startup( this, io_flags, &this == mainCluster );
-}
-
-void ^?{}(cluster & this) {
-	__kernel_io_shutdown( this, &this == mainCluster );
-
-	// Lock the RWlock so no-one pushes/pops while we are changing the queue
-	uint_fast32_t last_size = ready_mutate_lock();
-
-		// Adjust the ready queue size
-		ready_queue_shrink( &this, 0 );
-
-	// Unlock the RWlock
-	ready_mutate_unlock( last_size );
-
-	#if !defined(__CFA_NO_STATISTICS__)
-		if( 0 != this.print_stats ) {
-			__print_stats( this.stats, this.print_stats, true, this.name, (void*)&this );
-		}
-		free( this.stats );
-	#endif
-
-	unregister(this);
-}
+bool __wake_proc(processor *);
 
 //=============================================================================================
@@ -550,147 +281,4 @@
 }
 
-// KERNEL_ONLY
-// Context invoker for processors
-// This is the entry point for processors (kernel threads)
-// It effectively constructs a coroutine by stealing the pthread stack
-static void * __invoke_processor(void * arg) {
-	#if !defined( __CFA_NO_STATISTICS__ )
-		__stats_t local_stats;
-		__init_stats( &local_stats );
-		kernelTLS.this_stats = &local_stats;
-	#endif
-
-	processor * proc = (processor *) arg;
-	kernelTLS.this_processor = proc;
-	kernelTLS.this_thread    = 0p;
-	kernelTLS.preemption_state.[enabled, disable_count] = [false, 1];
-	// SKULLDUGGERY: We want to create a context for the processor coroutine
-	// which is needed for the 2-step context switch. However, there is no reason
-	// to waste the perfectly valid stack create by pthread.
-	current_stack_info_t info;
-	__stack_t ctx;
-	info.storage = &ctx;
-	(proc->runner){ proc, &info };
-
-	__cfaabi_dbg_print_safe("Coroutine : created stack %p\n", get_coroutine(proc->runner)->stack.storage);
-
-	//Set global state
-	kernelTLS.this_thread = 0p;
-
-	//We now have a proper context from which to schedule threads
-	__cfadbg_print_safe(runtime_core, "Kernel : core %p created (%p, %p)\n", proc, &proc->runner, &ctx);
-
-	// SKULLDUGGERY: Since the coroutine doesn't have its own stack, we can't
-	// resume it to start it like it normally would, it will just context switch
-	// back to here. Instead directly call the main since we already are on the
-	// appropriate stack.
-	get_coroutine(proc->runner)->state = Active;
-	main( proc->runner );
-	get_coroutine(proc->runner)->state = Halted;
-
-	// Main routine of the core returned, the core is now fully terminated
-	__cfadbg_print_safe(runtime_core, "Kernel : core %p main ended (%p)\n", proc, &proc->runner);
-
-	#if !defined(__CFA_NO_STATISTICS__)
-		__tally_stats(proc->cltr->stats, &local_stats);
-		if( 0 != proc->print_stats ) {
-			__print_stats( &local_stats, proc->print_stats, true, proc->name, (void*)proc );
-		}
-	#endif
-
-	return 0p;
-}
-
-static void Abort( int ret, const char func[] ) {
-	if ( ret ) {										// pthread routines return errno values
-		abort( "%s : internal error, error(%d) %s.", func, ret, strerror( ret ) );
-	} // if
-} // Abort
-
-void * __create_pthread( pthread_t * pthread, void * (*start)(void *), void * arg ) {
-	pthread_attr_t attr;
-
-	Abort( pthread_attr_init( &attr ), "pthread_attr_init" ); // initialize attribute
-
-	size_t stacksize;
-	// default stack size, normally defined by shell limit
-	Abort( pthread_attr_getstacksize( &attr, &stacksize ), "pthread_attr_getstacksize" );
-	assert( stacksize >= PTHREAD_STACK_MIN );
-
-	void * stack;
-	__cfaabi_dbg_debug_do(
-		stack = memalign( __page_size, stacksize + __page_size );
-		// pthread has no mechanism to create the guard page in user supplied stack.
-		if ( mprotect( stack, __page_size, PROT_NONE ) == -1 ) {
-			abort( "mprotect : internal error, mprotect failure, error(%d) %s.", errno, strerror( errno ) );
-		} // if
-	);
-	__cfaabi_dbg_no_debug_do(
-		stack = malloc( stacksize );
-	);
-
-	Abort( pthread_attr_setstack( &attr, stack, stacksize ), "pthread_attr_setstack" );
-
-	Abort( pthread_create( pthread, &attr, start, arg ), "pthread_create" );
-	return stack;
-}
-
-// KERNEL_ONLY
-static void __kernel_first_resume( processor * this ) {
-	$thread * src = mainThread;
-	$coroutine * dst = get_coroutine(this->runner);
-
-	verify( ! kernelTLS.preemption_state.enabled );
-
-	kernelTLS.this_thread->curr_cor = dst;
-	__stack_prepare( &dst->stack, 65000 );
-	__cfactx_start(main, dst, this->runner, __cfactx_invoke_coroutine);
-
-	verify( ! kernelTLS.preemption_state.enabled );
-
-	dst->last = &src->self_cor;
-	dst->starter = dst->starter ? dst->starter : &src->self_cor;
-
-	// make sure the current state is still correct
-	/* paranoid */ verify(src->state == Ready);
-
-	// context switch to specified coroutine
-	verify( dst->context.SP );
-	__cfactx_switch( &src->context, &dst->context );
-	// when __cfactx_switch returns we are back in the src coroutine
-
-	mainThread->curr_cor = &mainThread->self_cor;
-
-	// make sure the current state has been update
-	/* paranoid */ verify(src->state == Active);
-
-	verify( ! kernelTLS.preemption_state.enabled );
-}
-
-// KERNEL_ONLY
-static void __kernel_last_resume( processor * this ) {
-	$coroutine * src = &mainThread->self_cor;
-	$coroutine * dst = get_coroutine(this->runner);
-
-	verify( ! kernelTLS.preemption_state.enabled );
-	verify( dst->starter == src );
-	verify( dst->context.SP );
-
-	// SKULLDUGGERY in debug the processors check that the
-	// stack is still within the limit of the stack limits after running a thread.
-	// that check doesn't make sense if we context switch to the processor using the
-	// coroutine semantics. Since this is a special case, use the current context
-	// info to populate these fields.
-	__cfaabi_dbg_debug_do(
-		__stack_context_t ctx;
-		CtxGet( ctx );
-		mainThread->context.SP = ctx.SP;
-		mainThread->context.FP = ctx.FP;
-	)
-
-	// context switch to the processor
-	__cfactx_switch( &src->context, &dst->context );
-}
-
 //-----------------------------------------------------------------------------
 // Scheduler routines
@@ -834,148 +422,4 @@
 
 //=============================================================================================
-// Kernel Setup logic
-//=============================================================================================
-//-----------------------------------------------------------------------------
-// Kernel boot procedures
-static void __kernel_startup(void) {
-	verify( ! kernelTLS.preemption_state.enabled );
-	__cfadbg_print_safe(runtime_core, "Kernel : Starting\n");
-
-	__page_size = sysconf( _SC_PAGESIZE );
-
-	__cfa_dbg_global_clusters.list{ __get };
-	__cfa_dbg_global_clusters.lock{};
-
-	// Initialize the global scheduler lock
-	__scheduler_lock = (__scheduler_RWLock_t*)&storage___scheduler_lock;
-	(*__scheduler_lock){};
-
-	// Initialize the main cluster
-	mainCluster = (cluster *)&storage_mainCluster;
-	(*mainCluster){"Main Cluster"};
-
-	__cfadbg_print_safe(runtime_core, "Kernel : Main cluster ready\n");
-
-	// Start by initializing the main thread
-	// SKULLDUGGERY: the mainThread steals the process main thread
-	// which will then be scheduled by the mainProcessor normally
-	mainThread = ($thread *)&storage_mainThread;
-	current_stack_info_t info;
-	info.storage = (__stack_t*)&storage_mainThreadCtx;
-	(*mainThread){ &info };
-
-	__cfadbg_print_safe(runtime_core, "Kernel : Main thread ready\n");
-
-
-
-	// Construct the processor context of the main processor
-	void ?{}(processorCtx_t & this, processor * proc) {
-		(this.__cor){ "Processor" };
-		this.__cor.starter = 0p;
-		this.proc = proc;
-	}
-
-	void ?{}(processor & this) with( this ) {
-		( this.idle ){};
-		( this.terminated ){ 0 };
-		( this.runner ){};
-		init( this, "Main Processor", *mainCluster );
-		kernel_thread = pthread_self();
-
-		runner{ &this };
-		__cfadbg_print_safe(runtime_core, "Kernel : constructed main processor context %p\n", &runner);
-	}
-
-	// Initialize the main processor and the main processor ctx
-	// (the coroutine that contains the processing control flow)
-	mainProcessor = (processor *)&storage_mainProcessor;
-	(*mainProcessor){};
-
-	//initialize the global state variables
-	kernelTLS.this_processor = mainProcessor;
-	kernelTLS.this_thread    = mainThread;
-
-	#if !defined( __CFA_NO_STATISTICS__ )
-		kernelTLS.this_stats = (__stats_t *)& storage_mainProcStats;
-		__init_stats( kernelTLS.this_stats );
-	#endif
-
-	// Enable preemption
-	kernel_start_preemption();
-
-	// Add the main thread to the ready queue
-	// once resume is called on mainProcessor->runner the mainThread needs to be scheduled like any normal thread
-	__schedule_thread((__processor_id_t *)mainProcessor, mainThread);
-
-	// SKULLDUGGERY: Force a context switch to the main processor to set the main thread's context to the current UNIX
-	// context. Hence, the main thread does not begin through __cfactx_invoke_thread, like all other threads. The trick here is that
-	// mainThread is on the ready queue when this call is made.
-	__kernel_first_resume( kernelTLS.this_processor );
-
-
-	// THE SYSTEM IS NOW COMPLETELY RUNNING
-
-
-	// Now that the system is up, finish creating systems that need threading
-	__kernel_io_finish_start( *mainCluster );
-
-
-	__cfadbg_print_safe(runtime_core, "Kernel : Started\n--------------------------------------------------\n\n");
-
-	verify( ! kernelTLS.preemption_state.enabled );
-	enable_interrupts( __cfaabi_dbg_ctx );
-	verify( TL_GET( preemption_state.enabled ) );
-}
-
-static void __kernel_shutdown(void) {
-	//Before we start shutting things down, wait for systems that need threading to shutdown
-	__kernel_io_prepare_stop( *mainCluster );
-
-	/* paranoid */ verify( TL_GET( preemption_state.enabled ) );
-	disable_interrupts();
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-
-	__cfadbg_print_safe(runtime_core, "\n--------------------------------------------------\nKernel : Shutting down\n");
-
-	// SKULLDUGGERY: Notify the mainProcessor it needs to terminates.
-	// When its coroutine terminates, it return control to the mainThread
-	// which is currently here
-	__atomic_store_n(&mainProcessor->do_terminate, true, __ATOMIC_RELEASE);
-	__kernel_last_resume( kernelTLS.this_processor );
-	mainThread->self_cor.state = Halted;
-
-	// THE SYSTEM IS NOW COMPLETELY STOPPED
-
-	// Disable preemption
-	kernel_stop_preemption();
-
-	// Destroy the main processor and its context in reverse order of construction
-	// These were manually constructed so we need manually destroy them
-	void ^?{}(processor & this) with( this ){
-		deinit( this );
-
-		/* paranoid */ verify( this.do_terminate == true );
-		__cfaabi_dbg_print_safe("Kernel : destroyed main processor context %p\n", &runner);
-	}
-
-	^(*mainProcessor){};
-
-	// Final step, destroy the main thread since it is no longer needed
-
-	// Since we provided a stack to this taxk it will not destroy anything
-	/* paranoid */ verify(mainThread->self_cor.stack.storage == (__stack_t*)(((uintptr_t)&storage_mainThreadCtx)| 0x1));
-	^(*mainThread){};
-
-	^(*mainCluster){};
-
-	^(*__scheduler_lock){};
-
-	^(__cfa_dbg_global_clusters.list){};
-	^(__cfa_dbg_global_clusters.lock){};
-
-	__cfadbg_print_safe(runtime_core, "Kernel : Shutdown complete\n");
-}
-
-//=============================================================================================
 // Kernel Idle Sleep
 //=============================================================================================
@@ -997,5 +441,5 @@
 
 // Unconditionnaly wake a thread
-static bool __wake_proc(processor * this) {
+bool __wake_proc(processor * this) {
 	__cfadbg_print_safe(runtime_core, "Kernel : waking Processor %p\n", this);
 
@@ -1075,5 +519,5 @@
 
 void kernel_abort_msg( void * kernel_data, char * abort_text, int abort_text_size ) {
-	$thread * thrd = kernel_data;
+	$thread * thrd = ( $thread * ) kernel_data;
 
 	if(thrd) {
@@ -1170,32 +614,4 @@
 
 	return thrd != 0p;
-}
-
-//-----------------------------------------------------------------------------
-// Global Queues
-void doregister( cluster     & cltr ) {
-	lock      ( __cfa_dbg_global_clusters.lock __cfaabi_dbg_ctx2);
-	push_front( __cfa_dbg_global_clusters.list, cltr );
-	unlock    ( __cfa_dbg_global_clusters.lock );
-}
-
-void unregister( cluster     & cltr ) {
-	lock  ( __cfa_dbg_global_clusters.lock __cfaabi_dbg_ctx2);
-	remove( __cfa_dbg_global_clusters.list, cltr );
-	unlock( __cfa_dbg_global_clusters.lock );
-}
-
-void doregister( cluster * cltr, $thread & thrd ) {
-	lock      (cltr->thread_list_lock __cfaabi_dbg_ctx2);
-	cltr->nthreads += 1;
-	push_front(cltr->threads, thrd);
-	unlock    (cltr->thread_list_lock);
-}
-
-void unregister( cluster * cltr, $thread & thrd ) {
-	lock  (cltr->thread_list_lock __cfaabi_dbg_ctx2);
-	remove(cltr->threads, thrd );
-	cltr->nthreads -= 1;
-	unlock(cltr->thread_list_lock);
 }
 
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/concurrency/kernel.hfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -16,7 +16,4 @@
 #pragma once
 
-#include <stdbool.h>
-#include <stdint.h>
-
 #include "invoke.h"
 #include "time_t.hfa"
@@ -26,6 +23,5 @@
 
 extern "C" {
-#include <pthread.h>
-#include <semaphore.h>
+#include <bits/pthreadtypes.h>
 }
 
@@ -129,11 +125,42 @@
 struct __io_data;
 
-#define CFA_CLUSTER_IO_POLLER_USER_THREAD    (1 << 0) // 0x01
-#define CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS (1 << 1) // 0x02
-#define CFA_CLUSTER_IO_EAGER_SUBMITS         (1 << 2) // 0x04
-#define CFA_CLUSTER_IO_KERNEL_POLL_SUBMITS   (1 << 3) // 0x08
-#define CFA_CLUSTER_IO_KERNEL_POLL_COMPLETES (1 << 4) // 0x10
-#define CFA_CLUSTER_IO_BUFFLEN_OFFSET        16
-
+// IO poller user-thread
+// Not using the "thread" keyword because we want to control
+// more carefully when to start/stop it
+struct $io_ctx_thread {
+	struct __io_data * ring;
+	single_sem sem;
+	volatile bool done;
+	$thread self;
+};
+
+
+struct io_context {
+	$io_ctx_thread thrd;
+};
+
+struct io_context_params {
+	int num_entries;
+	int num_ready;
+	int submit_aff;
+	bool eager_submits:1;
+	bool poller_submits:1;
+	bool poll_submit:1;
+	bool poll_complete:1;
+};
+
+void  ?{}(io_context_params & this);
+
+void  ?{}(io_context & this, struct cluster & cl);
+void  ?{}(io_context & this, struct cluster & cl, const io_context_params & params);
+void ^?{}(io_context & this);
+
+struct io_cancellation {
+	uint32_t target;
+};
+
+static inline void  ?{}(io_cancellation & this) { this.target = -1u; }
+static inline void ^?{}(io_cancellation & this) {}
+bool cancel(io_cancellation & this);
 
 //-----------------------------------------------------------------------------
@@ -206,5 +233,8 @@
 	} node;
 
-	struct __io_data * io;
+	struct {
+		io_context * ctxs;
+		unsigned cnt;
+	} io;
 
 	#if !defined(__CFA_NO_STATISTICS__)
@@ -215,13 +245,19 @@
 extern Duration default_preemption();
 
-void ?{} (cluster & this, const char name[], Duration preemption_rate, unsigned flags);
+void ?{} (cluster & this, const char name[], Duration preemption_rate, unsigned num_io, const io_context_params & io_params);
 void ^?{}(cluster & this);
 
-static inline void ?{} (cluster & this)                                           { this{"Anonymous Cluster", default_preemption(), 0}; }
-static inline void ?{} (cluster & this, Duration preemption_rate)                 { this{"Anonymous Cluster", preemption_rate, 0}; }
-static inline void ?{} (cluster & this, const char name[])                        { this{name, default_preemption(), 0}; }
-static inline void ?{} (cluster & this, unsigned flags)                           { this{"Anonymous Cluster", default_preemption(), flags}; }
-static inline void ?{} (cluster & this, Duration preemption_rate, unsigned flags) { this{"Anonymous Cluster", preemption_rate, flags}; }
-static inline void ?{} (cluster & this, const char name[], unsigned flags)        { this{name, default_preemption(), flags}; }
+static inline void ?{} (cluster & this)                                            { io_context_params default_params;    this{"Anonymous Cluster", default_preemption(), 1, default_params}; }
+static inline void ?{} (cluster & this, Duration preemption_rate)                  { io_context_params default_params;    this{"Anonymous Cluster", preemption_rate, 1, default_params}; }
+static inline void ?{} (cluster & this, const char name[])                         { io_context_params default_params;    this{name, default_preemption(), 1, default_params}; }
+static inline void ?{} (cluster & this, unsigned num_io)                           { io_context_params default_params;    this{"Anonymous Cluster", default_preemption(), num_io, default_params}; }
+static inline void ?{} (cluster & this, Duration preemption_rate, unsigned num_io) { io_context_params default_params;    this{"Anonymous Cluster", preemption_rate, num_io, default_params}; }
+static inline void ?{} (cluster & this, const char name[], unsigned num_io)        { io_context_params default_params;    this{name, default_preemption(), num_io, default_params}; }
+static inline void ?{} (cluster & this, const io_context_params & io_params)                                            { this{"Anonymous Cluster", default_preemption(), 1, io_params}; }
+static inline void ?{} (cluster & this, Duration preemption_rate, const io_context_params & io_params)                  { this{"Anonymous Cluster", preemption_rate, 1, io_params}; }
+static inline void ?{} (cluster & this, const char name[], const io_context_params & io_params)                         { this{name, default_preemption(), 1, io_params}; }
+static inline void ?{} (cluster & this, unsigned num_io, const io_context_params & io_params)                           { this{"Anonymous Cluster", default_preemption(), num_io, io_params}; }
+static inline void ?{} (cluster & this, Duration preemption_rate, unsigned num_io, const io_context_params & io_params) { this{"Anonymous Cluster", preemption_rate, num_io, io_params}; }
+static inline void ?{} (cluster & this, const char name[], unsigned num_io, const io_context_params & io_params)        { this{name, default_preemption(), num_io, io_params}; }
 
 static inline [cluster *&, cluster *& ] __get( cluster & this ) __attribute__((const)) { return this.node.[next, prev]; }
Index: libcfa/src/concurrency/kernel/fwd.hfa
===================================================================
--- libcfa/src/concurrency/kernel/fwd.hfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
+++ libcfa/src/concurrency/kernel/fwd.hfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -0,0 +1,124 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2020 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// kernel/fwd.hfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Thu Jul 30 16:46:41 2020
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#pragma once
+
+#include "bits/defs.hfa"
+#include "bits/debug.hfa"
+
+#ifdef __cforall
+#include "bits/random.hfa"
+#endif
+
+struct $thread;
+struct processor;
+struct cluster;
+
+enum __Preemption_Reason { __NO_PREEMPTION, __ALARM_PREEMPTION, __POLL_PREEMPTION, __MANUAL_PREEMPTION };
+
+#define KERNEL_STORAGE(T,X) __attribute((aligned(__alignof__(T)))) static char storage_##X[sizeof(T)]
+
+#ifdef __cforall
+extern "C" {
+	extern "Cforall" {
+		extern __attribute__((aligned(128))) thread_local struct KernelThreadData {
+			struct $thread    * volatile this_thread;
+			struct processor  * volatile this_processor;
+			struct __stats_t  * volatile this_stats;
+
+			struct {
+				volatile unsigned short disable_count;
+				volatile bool enabled;
+				volatile bool in_progress;
+			} preemption_state;
+
+			#if defined(__SIZEOF_INT128__)
+				__uint128_t rand_seed;
+			#else
+				uint64_t rand_seed;
+			#endif
+		} kernelTLS __attribute__ ((tls_model ( "initial-exec" )));
+
+		static inline uint64_t __tls_rand() {
+			#if defined(__SIZEOF_INT128__)
+				return __lehmer64( kernelTLS.rand_seed );
+			#else
+				return __xorshift64( kernelTLS.rand_seed );
+			#endif
+		}
+	}
+
+	#ifdef __ARM_ARCH
+		// function prototypes are only really used by these macros on ARM
+		void disable_global_interrupts();
+		void enable_global_interrupts();
+
+		#define TL_GET( member ) ( { __typeof__( kernelTLS.member ) target; \
+			disable_global_interrupts(); \
+			target = kernelTLS.member; \
+			enable_global_interrupts(); \
+			target; } )
+		#define TL_SET( member, value ) { /* single compound statement: an unbraced if/loop must guard all three steps, not just the first */ \
+			disable_global_interrupts(); kernelTLS.member = (value); \
+			enable_global_interrupts(); }
+	#else
+		#define TL_GET( member ) kernelTLS.member
+		#define TL_SET( member, value ) kernelTLS.member = value;
+	#endif
+
+	extern void disable_interrupts();
+	extern void enable_interrupts_noPoll();
+	extern void enable_interrupts( __cfaabi_dbg_ctx_param );
+
+	extern "Cforall" {
+		extern void park( __cfaabi_dbg_ctx_param );
+		extern void unpark( struct $thread * this __cfaabi_dbg_ctx_param2 );
+		static inline struct $thread * active_thread () { return TL_GET( this_thread ); }
+
+		extern bool force_yield( enum __Preemption_Reason );
+
+		static inline void yield() {
+			force_yield(__MANUAL_PREEMPTION);
+		}
+
+		// Yield: yield N times
+		static inline void yield( unsigned times ) {
+			for( times ) {
+				yield();
+			}
+		}
+
+		//-----------------------------------------------------------------------
+		// Statics call at the end of each thread to register statistics
+		#if !defined(__CFA_NO_STATISTICS__)
+			static inline struct __stats_t * __tls_stats() {
+				/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+				/* paranoid */ verify( kernelTLS.this_stats );
+				return kernelTLS.this_stats;
+			}
+
+			#define __STATS__(in_kernel, ...) { \
+				if( !(in_kernel) ) disable_interrupts(); \
+				with( *__tls_stats() ) { \
+					__VA_ARGS__ \
+				} \
+				if( !(in_kernel) ) enable_interrupts( __cfaabi_dbg_ctx ); \
+			}
+		#else
+			#define __STATS__(in_kernel, ...)
+		#endif
+	}
+}
+#endif
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -0,0 +1,667 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2020 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// kernel/startup.cfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Thu Jul 30 15:12:54 2020
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#define __cforall_thread__
+
+// C Includes
+#include <errno.h>              // errno
+#include <string.h>             // strerror
+#include <unistd.h>             // sysconf
+extern "C" {
+      #include <limits.h>       // PTHREAD_STACK_MIN
+      #include <sys/mman.h>     // mprotect
+      #include <sys/resource.h> // getrlimit
+}
+
+// CFA Includes
+#include "kernel_private.hfa"
+#include "startup.hfa"          // STARTUP_PRIORITY_XXX
+
+//-----------------------------------------------------------------------------
+// Some assembly required
+#if defined( __i386 )
+	#define CtxGet( ctx )        \
+		__asm__ volatile (     \
+			"movl %%esp,%0\n"\
+			"movl %%ebp,%1\n"\
+			: "=rm" (ctx.SP),\
+				"=rm" (ctx.FP) \
+		)
+#elif defined( __x86_64 )
+	#define CtxGet( ctx )        \
+		__asm__ volatile (     \
+			"movq %%rsp,%0\n"\
+			"movq %%rbp,%1\n"\
+			: "=rm" (ctx.SP),\
+				"=rm" (ctx.FP) \
+		)
+#elif defined( __ARM_ARCH )
+#define CtxGet( ctx ) __asm__ ( \
+		"mov %0,%%sp\n"   \
+		"mov %1,%%r11\n"   \
+	: "=rm" (ctx.SP), "=rm" (ctx.FP) )
+#else
+	#error unknown hardware architecture
+#endif
+
+//-----------------------------------------------------------------------------
+// Start and stop routines for the kernel, declared first to make sure they run first
+static void __kernel_startup (void) __attribute__(( constructor( STARTUP_PRIORITY_KERNEL ) ));
+static void __kernel_shutdown(void) __attribute__(( destructor ( STARTUP_PRIORITY_KERNEL ) ));
+
+//-----------------------------------------------------------------------------
+// Static Forward Declarations
+struct current_stack_info_t;
+
+static void * __invoke_processor(void * arg);
+static void __kernel_first_resume( processor * this );
+static void __kernel_last_resume ( processor * this );
+static void init(processor & this, const char name[], cluster & _cltr);
+static void deinit(processor & this);
+static void doregister( struct cluster & cltr );
+static void unregister( struct cluster & cltr );
+static void ?{}( $coroutine & this, current_stack_info_t * info);
+static void ?{}( $thread & this, current_stack_info_t * info);
+static void ?{}(processorCtx_t & this) {}
+static void ?{}(processorCtx_t & this, processor * proc, current_stack_info_t * info);
+
+//-----------------------------------------------------------------------------
+// Forward Declarations for other modules
+extern void __kernel_alarm_startup(void);
+extern void __kernel_alarm_shutdown(void);
+extern void __kernel_io_startup (void);
+extern void __kernel_io_shutdown(void);
+
+//-----------------------------------------------------------------------------
+// Other Forward Declarations
+extern bool __wake_proc(processor *);
+
+//-----------------------------------------------------------------------------
+// Kernel storage
+KERNEL_STORAGE(cluster,	             mainCluster);
+KERNEL_STORAGE(processor,            mainProcessor);
+KERNEL_STORAGE($thread,	             mainThread);
+KERNEL_STORAGE(__stack_t,            mainThreadCtx);
+KERNEL_STORAGE(io_context,           mainPollerThread);
+KERNEL_STORAGE(__scheduler_RWLock_t, __scheduler_lock);
+#if !defined(__CFA_NO_STATISTICS__)
+KERNEL_STORAGE(__stats_t, mainProcStats);
+#endif
+
+cluster              * mainCluster;
+processor            * mainProcessor;
+$thread              * mainThread;
+__scheduler_RWLock_t * __scheduler_lock;
+
+extern "C" {
+	struct { __dllist_t(cluster) list; __spinlock_t lock; } __cfa_dbg_global_clusters;
+}
+
+size_t __page_size = 0;
+
+//-----------------------------------------------------------------------------
+// Global state
+thread_local struct KernelThreadData kernelTLS __attribute__ ((tls_model ( "initial-exec" ))) @= {
+	NULL,												// cannot use 0p
+	NULL,
+	NULL,
+	{ 1, false, false },
+};
+
+//-----------------------------------------------------------------------------
+// Struct to steal stack
+struct current_stack_info_t {
+	__stack_t * storage;  // pointer to stack object
+	void * base;          // base of stack
+	void * limit;         // stack grows towards stack limit
+	void * context;       // address of cfa_context_t
+};
+
+void ?{}( current_stack_info_t & this ) { // describe the stack this code is currently running on; `storage` is not set here
+	__stack_context_t ctx;
+	CtxGet( ctx );                    // capture the current stack pointer and frame pointer
+	this.base = ctx.FP;               // the current frame pointer is taken as the base of the stack
+
+	rlimit r;
+	getrlimit( RLIMIT_STACK, &r);     // NOTE(review): return value is not checked; on failure `r` is read uninitialized
+	size_t size = r.rlim_cur;         // NOTE(review): soft limit may be RLIM_INFINITY, making the limit below meaningless - confirm
+
+	this.limit = (void *)(((intptr_t)this.base) - size); // limit lies `size` bytes below base
+	this.context = &storage_mainThreadCtx;
+}
+
+
+
+//=============================================================================================
+// Kernel Setup logic
+//=============================================================================================
+//-----------------------------------------------------------------------------
+// Kernel boot procedures
+static void __kernel_startup(void) {
+	verify( ! kernelTLS.preemption_state.enabled );
+	__cfadbg_print_safe(runtime_core, "Kernel : Starting\n");
+
+	__page_size = sysconf( _SC_PAGESIZE );
+
+	__cfa_dbg_global_clusters.list{ __get };
+	__cfa_dbg_global_clusters.lock{};
+
+	// Initialize the global scheduler lock
+	__scheduler_lock = (__scheduler_RWLock_t*)&storage___scheduler_lock;
+	(*__scheduler_lock){};
+
+	// Initialize the main cluster
+	mainCluster = (cluster *)&storage_mainCluster;
+	(*mainCluster){"Main Cluster", 0};
+
+	__cfadbg_print_safe(runtime_core, "Kernel : Main cluster ready\n");
+
+	// Start by initializing the main thread
+	// SKULLDUGGERY: the mainThread steals the process main thread
+	// which will then be scheduled by the mainProcessor normally
+	mainThread = ($thread *)&storage_mainThread;
+	current_stack_info_t info;
+	info.storage = (__stack_t*)&storage_mainThreadCtx;
+	(*mainThread){ &info };
+
+	__cfadbg_print_safe(runtime_core, "Kernel : Main thread ready\n");
+
+
+
+	// Construct the processor context of the main processor
+	void ?{}(processorCtx_t & this, processor * proc) {
+		(this.__cor){ "Processor" };
+		this.__cor.starter = 0p;
+		this.proc = proc;
+	}
+
+	void ?{}(processor & this) with( this ) {
+		( this.idle ){};
+		( this.terminated ){ 0 };
+		( this.runner ){};
+		init( this, "Main Processor", *mainCluster );
+		kernel_thread = pthread_self();
+
+		runner{ &this };
+		__cfadbg_print_safe(runtime_core, "Kernel : constructed main processor context %p\n", &runner);
+	}
+
+	// Initialize the main processor and the main processor ctx
+	// (the coroutine that contains the processing control flow)
+	mainProcessor = (processor *)&storage_mainProcessor;
+	(*mainProcessor){};
+
+	//initialize the global state variables
+	kernelTLS.this_processor = mainProcessor;
+	kernelTLS.this_thread    = mainThread;
+
+	#if !defined( __CFA_NO_STATISTICS__ )
+		kernelTLS.this_stats = (__stats_t *)& storage_mainProcStats;
+		__init_stats( kernelTLS.this_stats );
+	#endif
+
+	// Enable preemption
+	__kernel_alarm_startup();
+
+	// Start IO
+	__kernel_io_startup();
+
+	// Add the main thread to the ready queue
+	// once resume is called on mainProcessor->runner the mainThread needs to be scheduled like any normal thread
+	__schedule_thread((__processor_id_t *)mainProcessor, mainThread);
+
+	// SKULLDUGGERY: Force a context switch to the main processor to set the main thread's context to the current UNIX
+	// context. Hence, the main thread does not begin through __cfactx_invoke_thread, like all other threads. The trick here is that
+	// mainThread is on the ready queue when this call is made.
+	__kernel_first_resume( kernelTLS.this_processor );
+
+
+	// THE SYSTEM IS NOW COMPLETELY RUNNING
+
+
+	// SKULLDUGGERY: The constructor for the mainCluster will call alloc with a dimension of 0
+	// malloc *can* return a non-null value, we should free it if that is the case
+	free( mainCluster->io.ctxs );
+
+	// Now that the system is up, finish creating systems that need threading
+	mainCluster->io.ctxs = (io_context *)&storage_mainPollerThread;
+	mainCluster->io.cnt  = 1;
+	(*mainCluster->io.ctxs){ *mainCluster };
+
+	__cfadbg_print_safe(runtime_core, "Kernel : Started\n--------------------------------------------------\n\n");
+
+	verify( ! kernelTLS.preemption_state.enabled );
+	enable_interrupts( __cfaabi_dbg_ctx );
+	verify( TL_GET( preemption_state.enabled ) );
+}
+
+static void __kernel_shutdown(void) { // runtime teardown; declared earlier in this file as a destructor with STARTUP_PRIORITY_KERNEL
+	// Before we start shutting things down, wait for systems that need threading to shut down
+	^(*mainCluster->io.ctxs){};
+	mainCluster->io.cnt  = 0;
+	mainCluster->io.ctxs = 0p;
+
+	/* paranoid */ verify( TL_GET( preemption_state.enabled ) );
+	disable_interrupts();
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+
+	__cfadbg_print_safe(runtime_core, "\n--------------------------------------------------\nKernel : Shutting down\n");
+
+	// SKULLDUGGERY: Notify the mainProcessor that it needs to terminate.
+	// When its coroutine terminates, it returns control to the mainThread
+	// which is currently here
+	__atomic_store_n(&mainProcessor->do_terminate, true, __ATOMIC_RELEASE);
+	__kernel_last_resume( kernelTLS.this_processor );
+	mainThread->self_cor.state = Halted;
+
+	// THE SYSTEM IS NOW COMPLETELY STOPPED
+
+	// Disable preemption
+	__kernel_alarm_shutdown();
+
+	// Stop IO
+	__kernel_io_shutdown();
+
+	// Destroy the main processor and its context in reverse order of construction
+	// These were manually constructed so we need to manually destroy them
+	void ^?{}(processor & this) with( this ){
+		deinit( this );
+
+		/* paranoid */ verify( this.do_terminate == true );
+		__cfaabi_dbg_print_safe("Kernel : destroyed main processor context %p\n", &runner);
+	}
+
+	^(*mainProcessor){};
+
+	// Final step, destroy the main thread since it is no longer needed
+
+	// Since we provided a stack to this task it will not destroy anything
+	/* paranoid */ verify(mainThread->self_cor.stack.storage == (__stack_t*)(((uintptr_t)&storage_mainThreadCtx)| 0x1));
+	^(*mainThread){};
+
+	^(*mainCluster){};
+
+	^(*__scheduler_lock){};
+
+	^(__cfa_dbg_global_clusters.list){};
+	^(__cfa_dbg_global_clusters.lock){};
+
+	__cfadbg_print_safe(runtime_core, "Kernel : Shutdown complete\n");
+}
+
+//=============================================================================================
+// Kernel Initial Scheduling logic
+//=============================================================================================
+
+// Context invoker for processors
+// This is the entry point for processors (kernel threads) *except* for the main processor
+// It effectively constructs a coroutine by stealing the pthread stack
+static void * __invoke_processor(void * arg) {
+	#if !defined( __CFA_NO_STATISTICS__ )
+		__stats_t local_stats;
+		__init_stats( &local_stats );
+		kernelTLS.this_stats = &local_stats;
+	#endif
+
+	processor * proc = (processor *) arg;
+	kernelTLS.this_processor = proc;
+	kernelTLS.this_thread    = 0p;
+	kernelTLS.preemption_state.[enabled, disable_count] = [false, 1];
+	// SKULLDUGGERY: We want to create a context for the processor coroutine
+	// which is needed for the 2-step context switch. However, there is no reason
+	// to waste the perfectly valid stack created by pthread.
+	current_stack_info_t info;
+	__stack_t ctx;
+	info.storage = &ctx;
+	(proc->runner){ proc, &info };
+
+	__cfaabi_dbg_print_safe("Coroutine : created stack %p\n", get_coroutine(proc->runner)->stack.storage);
+
+	// Set global state (NOTE(review): this_thread was already set to 0p above -- confirm whether this second store is needed)
+	kernelTLS.this_thread = 0p;
+
+	// We now have a proper context from which to schedule threads
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p created (%p, %p)\n", proc, &proc->runner, &ctx);
+
+	// SKULLDUGGERY: Since the coroutine doesn't have its own stack, we can't
+	// resume it to start it like it normally would, it will just context switch
+	// back to here. Instead directly call the main since we already are on the
+	// appropriate stack.
+	get_coroutine(proc->runner)->state = Active;
+	main( proc->runner );
+	get_coroutine(proc->runner)->state = Halted;
+
+	// Main routine of the core returned, the core is now fully terminated
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p main ended (%p)\n", proc, &proc->runner);
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		__tally_stats(proc->cltr->stats, &local_stats);
+		if( 0 != proc->print_stats ) {
+			__print_stats( &local_stats, proc->print_stats, true, proc->name, (void*)proc );
+		}
+	#endif
+
+	return 0p;
+}
+
+static void __kernel_first_resume( processor * this ) { // first switch from the main thread (src) into this processor's runner coroutine (dst)
+	$thread * src = mainThread;
+	$coroutine * dst = get_coroutine(this->runner);
+
+	verify( ! kernelTLS.preemption_state.enabled );
+
+	kernelTLS.this_thread->curr_cor = dst;
+	__stack_prepare( &dst->stack, 65000 ); // NOTE(review): 65000 is presumably the requested stack size -- confirm against __stack_prepare
+	__cfactx_start(main, dst, this->runner, __cfactx_invoke_coroutine);
+
+	verify( ! kernelTLS.preemption_state.enabled );
+
+	dst->last = &src->self_cor;
+	dst->starter = dst->starter ? dst->starter : &src->self_cor; // keep an existing starter, otherwise record the main thread's coroutine
+
+	// make sure the current state is still correct
+	/* paranoid */ verify(src->state == Ready);
+
+	// context switch to specified coroutine
+	verify( dst->context.SP );
+	__cfactx_switch( &src->context, &dst->context );
+	// when __cfactx_switch returns we are back in the src coroutine
+
+	mainThread->curr_cor = &mainThread->self_cor;
+
+	// make sure the current state has been updated
+	/* paranoid */ verify(src->state == Active);
+
+	verify( ! kernelTLS.preemption_state.enabled );
+}
+
+// KERNEL_ONLY
+static void __kernel_last_resume( processor * this ) {
+	$coroutine * src = &mainThread->self_cor;
+	$coroutine * dst = get_coroutine(this->runner);
+
+	verify( ! kernelTLS.preemption_state.enabled );
+	verify( dst->starter == src );
+	verify( dst->context.SP );
+
+	// SKULLDUGGERY in debug the processors check that the
+	// stack is still within the limit of the stack limits after running a thread.
+	// that check doesn't make sense if we context switch to the processor using the
+	// coroutine semantics. Since this is a special case, use the current context
+	// info to populate these fields.
+	__cfaabi_dbg_debug_do(
+		__stack_context_t ctx;
+		CtxGet( ctx );
+		mainThread->context.SP = ctx.SP;
+		mainThread->context.FP = ctx.FP;
+	)
+
+	// context switch to the processor
+	__cfactx_switch( &src->context, &dst->context );
+}
+
+
+//=============================================================================================
+// Kernel Object Constructors logic
+//=============================================================================================
+//-----------------------------------------------------------------------------
+// Main thread construction
+static void ?{}( $coroutine & this, current_stack_info_t * info) with( this ) { // builds a coroutine on top of caller-supplied stack information
+	stack.storage = info->storage;
+	with(*stack.storage) {
+		limit     = info->limit;
+		base      = info->base;
+	}
+	__attribute__((may_alias)) intptr_t * istorage = (intptr_t*) &stack.storage;
+	*istorage |= 0x1; // tag the low bit of the storage pointer; presumably marks the storage as externally provided (__kernel_shutdown verifies this tag) -- confirm
+	name = "Main Thread"; // NOTE(review): the processorCtx_t constructor also appears to use this constructor, so processor contexts get this name too
+	state = Start;
+	starter = 0p;
+	last = 0p;
+	cancellation = 0p;
+}
+
+static void ?{}( $thread & this, current_stack_info_t * info) with( this ) {
+	ticket = 1;
+	state = Start;
+	self_cor{ info };
+	curr_cor = &self_cor;
+	curr_cluster = mainCluster;
+	self_mon.owner = &this;
+	self_mon.recursion = 1;
+	self_mon_p = &self_mon;
+	link.next = 0p;
+	link.prev = 0p;
+
+	node.next = 0p;
+	node.prev = 0p;
+	doregister(curr_cluster, this);
+
+	monitors{ &self_mon_p, 1, (fptr_t)0 };
+}
+
+//-----------------------------------------------------------------------------
+// Processor
+// Construct the processor context of non-main processors
+static void ?{}(processorCtx_t & this, processor * proc, current_stack_info_t * info) {
+	(this.__cor){ info };
+	this.proc = proc;
+}
+
+static void init(processor & this, const char name[], cluster & _cltr) with( this ) {
+	this.name = name;
+	this.cltr = &_cltr;
+	id = -1u;
+	destroyer = 0p;
+	do_terminate = false;
+	preemption_alarm = 0p;
+	pending_preemption = false;
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		print_stats = 0;
+		print_halts = false;
+	#endif
+
+	int target = __atomic_add_fetch( &cltr->nprocessors, 1u, __ATOMIC_SEQ_CST );
+
+	id = doregister((__processor_id_t*)&this);
+
+	// Lock the RWlock so no-one pushes/pops while we are changing the queue
+	uint_fast32_t last_size = ready_mutate_lock();
+
+		// Adjust the ready queue size
+		ready_queue_grow( cltr, target );
+
+	// Unlock the RWlock
+	ready_mutate_unlock( last_size );
+
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p created\n", &this);
+}
+
+// Not a ctor, it just preps the destruction but should not destroy members
+static void deinit(processor & this) {
+
+	int target = __atomic_sub_fetch( &this.cltr->nprocessors, 1u, __ATOMIC_SEQ_CST );
+
+	// Lock the RWlock so no-one pushes/pops while we are changing the queue
+	uint_fast32_t last_size = ready_mutate_lock();
+
+		// Adjust the ready queue size
+		ready_queue_shrink( this.cltr, target );
+
+		// Make sure we aren't on the idle queue
+		unsafe_remove( this.cltr->idles, &this );
+
+	// Unlock the RWlock
+	ready_mutate_unlock( last_size );
+
+	// Finally we don't need the read_lock any more
+	unregister((__processor_id_t*)&this);
+}
+
+void ?{}(processor & this, const char name[], cluster & _cltr) {
+	( this.idle ){};
+	( this.terminated ){ 0 };
+	( this.runner ){};
+	init( this, name, _cltr );
+
+	__cfadbg_print_safe(runtime_core, "Kernel : Starting core %p\n", &this);
+
+	this.stack = __create_pthread( &this.kernel_thread, __invoke_processor, (void *)&this );
+
+}
+
+void ^?{}(processor & this) with( this ){ // stops the processor's kernel thread (if still running), joins it and releases its stack
+	if( ! __atomic_load_n(&do_terminate, __ATOMIC_ACQUIRE) ) { // termination not requested yet: request it, wake the processor, wait on `terminated`
+		__cfadbg_print_safe(runtime_core, "Kernel : core %p signaling termination\n", &this);
+
+		__atomic_store_n(&do_terminate, true, __ATOMIC_RELAXED); // NOTE(review): relaxed here, while __kernel_shutdown stores the same flag with __ATOMIC_RELEASE -- confirm intended
+		__wake_proc( &this );
+
+		P( terminated );
+		verify( kernelTLS.this_processor != &this);
+	}
+
+	int err = pthread_join( kernel_thread, 0p );
+	if( err != 0 ) abort("KERNEL ERROR: joining processor %p caused error %s\n", &this, strerror(err));
+
+	free( this.stack ); // the stack returned by __create_pthread in the constructor
+
+	deinit( this );
+}
+
+//-----------------------------------------------------------------------------
+// Cluster
+void ?{}(cluster & this, const char name[], Duration preemption_rate, unsigned num_io, const io_context_params & io_params) with( this ) {
+	this.name = name;
+	this.preemption_rate = preemption_rate;
+	this.nprocessors = 0;
+	ready_queue{};
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		print_stats = 0;
+		stats = alloc();
+		__init_stats( stats );
+	#endif
+
+	threads{ __get };
+
+	doregister(this);
+
+	// Lock the RWlock so no-one pushes/pops while we are changing the queue
+	uint_fast32_t last_size = ready_mutate_lock();
+
+		// Adjust the ready queue size
+		ready_queue_grow( &this, 0 );
+
+	// Unlock the RWlock
+	ready_mutate_unlock( last_size );
+
+	this.io.cnt  = num_io;
+	this.io.ctxs = aalloc(num_io);
+	for(i; this.io.cnt) {
+		(this.io.ctxs[i]){ this, io_params };
+	}
+}
+
+void ^?{}(cluster & this) {
+	for(i; this.io.cnt) {
+		^(this.io.ctxs[i]){ true };
+	}
+	free(this.io.ctxs);
+
+	// Lock the RWlock so no-one pushes/pops while we are changing the queue
+	uint_fast32_t last_size = ready_mutate_lock();
+
+		// Adjust the ready queue size
+		ready_queue_shrink( &this, 0 );
+
+	// Unlock the RWlock
+	ready_mutate_unlock( last_size );
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		if( 0 != this.print_stats ) {
+			__print_stats( this.stats, this.print_stats, true, this.name, (void*)&this );
+		}
+		free( this.stats );
+	#endif
+
+	unregister(this);
+}
+
+//=============================================================================================
+// Miscellaneous Initialization
+//=============================================================================================
+//-----------------------------------------------------------------------------
+// Global Queues
+static void doregister( cluster     & cltr ) {
+	lock      ( __cfa_dbg_global_clusters.lock __cfaabi_dbg_ctx2);
+	push_front( __cfa_dbg_global_clusters.list, cltr );
+	unlock    ( __cfa_dbg_global_clusters.lock );
+}
+
+static void unregister( cluster     & cltr ) {
+	lock  ( __cfa_dbg_global_clusters.lock __cfaabi_dbg_ctx2);
+	remove( __cfa_dbg_global_clusters.list, cltr );
+	unlock( __cfa_dbg_global_clusters.lock );
+}
+
+void doregister( cluster * cltr, $thread & thrd ) {
+	lock      (cltr->thread_list_lock __cfaabi_dbg_ctx2);
+	cltr->nthreads += 1;
+	push_front(cltr->threads, thrd);
+	unlock    (cltr->thread_list_lock);
+}
+
+void unregister( cluster * cltr, $thread & thrd ) {
+	lock  (cltr->thread_list_lock __cfaabi_dbg_ctx2);
+	remove(cltr->threads, thrd );
+	cltr->nthreads -= 1;
+	unlock(cltr->thread_list_lock);
+}
+
+static void check( int ret, const char func[] ) { // aborts with the name of the failing routine when ret is non-zero
+	if ( ret ) {										// pthread routines return errno values
+		abort( "%s : internal error, error(%d) %s.", func, ret, strerror( ret ) );
+	} // if
+} // check
+
+void * __create_pthread( pthread_t * pthread, void * (*start)(void *), void * arg ) { // starts a pthread on a stack allocated here; returns that stack
+	pthread_attr_t attr;
+
+	check( pthread_attr_init( &attr ), "pthread_attr_init" ); // initialize attribute
+
+	size_t stacksize;
+	// default stack size, normally defined by shell limit
+	check( pthread_attr_getstacksize( &attr, &stacksize ), "pthread_attr_getstacksize" );
+	assert( stacksize >= PTHREAD_STACK_MIN );
+
+	void * stack; // NOTE(review): neither allocation below is checked for failure before use
+	__cfaabi_dbg_debug_do(
+		stack = memalign( __page_size, stacksize + __page_size );
+		// pthread has no mechanism to create the guard page in user supplied stack.
+		if ( mprotect( stack, __page_size, PROT_NONE ) == -1 ) {
+			abort( "mprotect : internal error, mprotect failure, error(%d) %s.", errno, strerror( errno ) );
+		} // if
+	);
+	__cfaabi_dbg_no_debug_do(
+		stack = malloc( stacksize );
+	);
+
+	check( pthread_attr_setstack( &attr, stack, stacksize ), "pthread_attr_setstack" );
+
+	check( pthread_create( pthread, &attr, start, arg ), "pthread_create" ); // NOTE(review): attr is never passed to pthread_attr_destroy
+	return stack; // handed back so the caller can free it (the processor destructor does)
+}
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -22,7 +22,4 @@
 #include "stats.hfa"
 
-#include "bits/random.hfa"
-
-
 //-----------------------------------------------------------------------------
 // Scheduler
@@ -53,19 +50,4 @@
 
 
-struct event_kernel_t {
-	alarm_list_t alarms;
-	__spinlock_t lock;
-};
-
-extern event_kernel_t * event_kernel;
-
-struct __cfa_kernel_preemption_state_t {
-	bool enabled;
-	bool in_progress;
-	unsigned short disable_count;
-};
-
-extern volatile thread_local __cfa_kernel_preemption_state_t preemption_state __attribute__ ((tls_model ( "initial-exec" )));
-
 extern cluster * mainCluster;
 
@@ -84,29 +66,30 @@
 void __unpark( struct __processor_id_t *, $thread * thrd __cfaabi_dbg_ctx_param2 );
 
-//-----------------------------------------------------------------------------
-// I/O
-void __kernel_io_startup     ( cluster &, unsigned, bool );
-void __kernel_io_finish_start( cluster & );
-void __kernel_io_prepare_stop( cluster & );
-void __kernel_io_shutdown    ( cluster &, bool );
+static inline bool __post(single_sem & this, struct __processor_id_t * id) {
+	for() {
+		struct $thread * expected = this.ptr;
+		if(expected == 1p) return false;
+		if(expected == 0p) {
+			if(__atomic_compare_exchange_n(&this.ptr, &expected, 1p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+				return false;
+			}
+		}
+		else {
+			if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+				__unpark( id, expected __cfaabi_dbg_ctx2 );
+				return true;
+			}
+		}
+	}
+}
 
 //-----------------------------------------------------------------------------
 // Utils
-#define KERNEL_STORAGE(T,X) __attribute((aligned(__alignof__(T)))) static char storage_##X[sizeof(T)]
-
-static inline uint64_t __tls_rand() {
-	#if defined(__SIZEOF_INT128__)
-		return __lehmer64( kernelTLS.rand_seed );
-	#else
-		return __xorshift64( kernelTLS.rand_seed );
-	#endif
-}
-
-
-void doregister( struct cluster & cltr );
-void unregister( struct cluster & cltr );
-
 void doregister( struct cluster * cltr, struct $thread & thrd );
 void unregister( struct cluster * cltr, struct $thread & thrd );
+
+//-----------------------------------------------------------------------------
+// I/O
+void ^?{}(io_context & this, bool );
 
 //=======================================================================
@@ -280,30 +263,4 @@
 void ready_queue_shrink(struct cluster * cltr, int target);
 
-//-----------------------------------------------------------------------
-// IO user data
-struct __io_user_data_t {
-	int32_t result;
-	$thread * thrd;
-};
-
-//-----------------------------------------------------------------------
-// Statics call at the end of each thread to register statistics
-#if !defined(__CFA_NO_STATISTICS__)
-	static inline struct __stats_t * __tls_stats() {
-		/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-		/* paranoid */ verify( kernelTLS.this_stats );
-		return kernelTLS.this_stats;
-	}
-
-	#define __STATS__(in_kernel, ...) { \
-		if( !(in_kernel) ) disable_interrupts(); \
-		with( *__tls_stats() ) { \
-			__VA_ARGS__ \
-		} \
-		if( !(in_kernel) ) enable_interrupts( __cfaabi_dbg_ctx ); \
-	}
-#else
-	#define __STATS__(in_kernel, ...)
-#endif
 
 // Local Variables: //
Index: libcfa/src/concurrency/preemption.cfa
===================================================================
--- libcfa/src/concurrency/preemption.cfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/concurrency/preemption.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -26,4 +26,5 @@
 
 #include "bits/signal.hfa"
+#include "kernel_private.hfa"
 
 #if !defined(__CFA_DEFAULT_PREEMPTION__)
@@ -293,5 +294,5 @@
 // Startup routine to activate preemption
 // Called from kernel_startup
-void kernel_start_preemption() {
+void __kernel_alarm_startup() {
 	__cfaabi_dbg_print_safe( "Kernel : Starting preemption\n" );
 
@@ -315,5 +316,5 @@
 // Shutdown routine to deactivate preemption
 // Called from kernel_shutdown
-void kernel_stop_preemption() {
+void __kernel_alarm_shutdown() {
 	__cfaabi_dbg_print_safe( "Kernel : Preemption stopping\n" );
 
@@ -481,5 +482,5 @@
 	sigset_t oldset;
 	int ret;
-	ret = pthread_sigmask(0, 0p, &oldset);
+	ret = pthread_sigmask(0, ( const sigset_t * ) 0p, &oldset);  // workaround trac#208: cast should be unnecessary
 	if(ret != 0) { abort("ERROR sigprocmask returned %d", ret); }
 
Index: libcfa/src/concurrency/preemption.hfa
===================================================================
--- libcfa/src/concurrency/preemption.hfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/concurrency/preemption.hfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -16,9 +16,14 @@
 #pragma once
 
+#include "bits/locks.hfa"
 #include "alarm.hfa"
-#include "kernel_private.hfa"
 
-void kernel_start_preemption();
-void kernel_stop_preemption();
+struct event_kernel_t {
+	alarm_list_t alarms;
+	__spinlock_t lock;
+};
+
+extern event_kernel_t * event_kernel;
+
 void update_preemption( processor * this, Duration duration );
 
Index: libcfa/src/concurrency/thread.hfa
===================================================================
--- libcfa/src/concurrency/thread.hfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/concurrency/thread.hfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -84,8 +84,4 @@
 
 //-----------------------------------------------------------------------------
-// Thread getters
-static inline struct $thread * active_thread () { return TL_GET( this_thread ); }
-
-//-----------------------------------------------------------------------------
 // Scheduler API
 
@@ -106,15 +102,4 @@
 bool force_yield( enum __Preemption_Reason );
 
-static inline void yield() {
-	force_yield(__MANUAL_PREEMPTION);
-}
-
-// Yield: yield N times
-static inline void yield( unsigned times ) {
-	for( times ) {
-		yield();
-	}
-}
-
 //----------
 // sleep: force thread to block and be rescheduled after Duration duration
Index: libcfa/src/containers/list.hfa
===================================================================
--- libcfa/src/containers/list.hfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/containers/list.hfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -22,5 +22,5 @@
 \
 static inline NODE& $tempcv_e2n(ELEM &node) { \
-	return node; \
+	return ( NODE & ) node; \
 } \
 \
@@ -187,5 +187,5 @@
 		$next_link(singleton_to_insert) = $next_link(list_pos);
 		if ($next_link(list_pos).is_terminator) {
-			dlist(Tnode, Telem) *list = $next_link(list_pos).terminator;
+			dlist(Tnode, Telem) *list = ( dlist(Tnode, Telem) * ) $next_link(list_pos).terminator;
 			$dlinks(Telem) *list_links = & list->$links;
 			$mgd_link(Telem) *list_last = & list_links->prev;
@@ -210,5 +210,5 @@
 		$prev_link(singleton_to_insert) = $prev_link(list_pos);
 		if ($prev_link(list_pos).is_terminator) {
-			dlist(Tnode, Telem) *list = $prev_link(list_pos).terminator;
+			dlist(Tnode, Telem) *list = ( dlist(Tnode, Telem) * ) $prev_link(list_pos).terminator;
 			$dlinks(Telem) *list_links = & list->$links;
 			$mgd_link(Telem) *list_first = & list_links->next;
@@ -275,5 +275,5 @@
 
 		if ( $prev_link(list_pos).is_terminator ) {
-			dlist(Tnode, Telem) * tgt_before = $prev_link(list_pos).terminator;
+			dlist(Tnode, Telem) * tgt_before = ( dlist(Tnode, Telem) * ) $prev_link(list_pos).terminator;
 			$dlinks(Telem) * links_before = & tgt_before->$links;
 			&incoming_from_prev = & links_before->next;
@@ -285,5 +285,5 @@
 
 		if ( $next_link(list_pos).is_terminator ) {
-			dlist(Tnode, Telem) * tgt_after = $next_link(list_pos).terminator;
+			dlist(Tnode, Telem) * tgt_after = ( dlist(Tnode, Telem) * ) $next_link(list_pos).terminator;
 			$dlinks(Telem) * links_after = & tgt_after->$links;
 			&incoming_from_next = & links_after->prev;
Index: libcfa/src/exception.hfa
===================================================================
--- libcfa/src/exception.hfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/exception.hfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -10,6 +10,6 @@
 // Created On       : Thu Apr  7 10:25:00 2020
 // Last Modified By : Andrew Beach
-// Last Modified On : Tue May 19 14:17:00 2020
-// Update Count     : 2
+// Last Modified On : Tue Aug  4 16:22:00 2020
+// Update Count     : 3
 //
 
@@ -18,18 +18,15 @@
 // -----------------------------------------------------------------------------------------------
 
-// All internals helper macros begin with an underscore.
-#define _CLOSE(...) __VA_ARGS__ }
-#define _GLUE2(left, right) left##right
-#define _GLUE3(left, middle, right) left##middle##right
-#define _EXC_DISPATCH(to, ...) to(__VA_ARGS__,__cfaehm_base_exception_t,)
-
-// FWD_TRIVIAL_EXCEPTION(exception_name);
+// TRIVIAL_EXCEPTION_DECLARATION(exception_name);
 // Declare a trivial exception, one that adds no fields or features.
 // This will make the exception visible and may go in a .hfa or .cfa file.
-#define FWD_TRIVIAL_EXCEPTION(...) _EXC_DISPATCH(_FWD_TRIVIAL_EXCEPTION, __VA_ARGS__)
-// INST_TRIVIAL_EXCEPTION(exception_name);
+#define TRIVIAL_EXCEPTION_DECLARATION(...) \
+	_EXC_DISPATCH(_TRIVIAL_EXCEPTION_DECLARATION, __VA_ARGS__)
+
+// TRIVIAL_EXCEPTION_INSTANCE(exception_name);
 // Create the trival exception. This must be used exactly once and should be used in a .cfa file,
 // as it creates the unique instance of the virtual table.
-#define INST_TRIVIAL_EXCEPTION(...) _EXC_DISPATCH(_INST_TRIVIAL_EXCEPTION, __VA_ARGS__)
+#define TRIVIAL_EXCEPTION_INSTANCE(...) _EXC_DISPATCH(_TRIVIAL_EXCEPTION_INSTANCE, __VA_ARGS__)
+
 // TRIVIAL_EXCEPTION(exception_name[, parent_name]);
 // Does both of the above, a short hand if the exception is only used in one .cfa file.
@@ -37,7 +34,104 @@
 // base exception. This feature may be removed or changed.
 #define TRIVIAL_EXCEPTION(...) \
-	_EXC_DISPATCH(_FWD_TRIVIAL_EXCEPTION, __VA_ARGS__); \
-	_EXC_DISPATCH(_INST_TRIVIAL_EXCEPTION, __VA_ARGS__)
-#define _FWD_TRIVIAL_EXCEPTION(exception_name, parent_name, ...) \
+	_EXC_DISPATCH(_TRIVIAL_EXCEPTION_DECLARATION, __VA_ARGS__); \
+	_EXC_DISPATCH(_TRIVIAL_EXCEPTION_INSTANCE, __VA_ARGS__)
+
+// FORALL_TRIVIAL_EXCEPTION(exception_name, (assertions...), (parameters...));
+// Forward declare a polymorphic but otherwise trivial exception type. You must provide the entire
+// assertion list (exactly what would go in the forall clause) and parameters list (only the
+// parameter names from the assertion list, same order and comma separated). This should be
+// visible wherever you use the exception. This just generates the polymorphic framework, see
+// POLY_VTABLE_DECLARATION to allow instantiations.
+#define FORALL_TRIVIAL_EXCEPTION(exception_name, assertions, parameters) \
+	_FORALL_TRIVIAL_EXCEPTION(exception_name, __cfaehm_base_exception_t, assertions, parameters, )
+
+// FORALL_TRIVIAL_INSTANCE(exception_name, (assertions...), (parameters...))
+// Create the forall trivial exception. The assertion list and parameters must match.
+// There must be exactly one use of this in a program for each exception type. This just
+// generates the polymorphic framework, see POLY_VTABLE_INSTANCE to allow instantiations.
+#define FORALL_TRIVIAL_INSTANCE(exception_name, assertions, parameters) \
+	_FORALL_CTOR0_INSTANCE(exception_name, assertions, parameters)
+
+// DATA_EXCEPTION(exception_name)(fields...);
+// Forward declare an exception that adds fields but no features. The added fields go in the
+// second argument list. The virtual table instance must be provided later (see VTABLE_INSTANCE).
+#define DATA_EXCEPTION(...) _EXC_DISPATCH(_DATA_EXCEPTION, __VA_ARGS__)
+
+// FORALL_DATA_EXCEPTION(exception_name, (assertions...), (parameters...))(fields...);
+// Define a polymorphic exception that adds fields but no additional features. The assertion list
+// and matching parameters must match. Then you can give the list of fields. This should be
+// visible where ever you use the exception. This just generates the polymorphic framework, see
+// POLY_VTABLE_DECLARATION to allow instantiations.
+#define FORALL_DATA_EXCEPTION(exception_name, assertions, parameters) \
+	_FORALL_DATA_EXCEPTION(exception_name, __cfaehm_base_exception_t, assertions, parameters, )
+
+// FORALL_DATA_INSTANCE(exception_name, (assertions...), (parameters...))
+// Create a polymorphic data exception. The assertion list and parameters must match. This should
+// appear once in each program. This just generates the polymorphic framework, see
+// POLY_VTABLE_INSTANCE to allow instantiations.
+#define FORALL_DATA_INSTANCE(exception_name, assertions, parameters) \
+	_FORALL_CTOR0_INSTANCE(exception_name, assertions, parameters)
+
+// VTABLE_DECLARATION(exception_name)([new_features...]);
+// Declare a virtual table type for an exception with exception_name. You may also add features
+// (fields on the virtual table) by including them in the second list.
+#define VTABLE_DECLARATION(...) _EXC_DISPATCH(_VTABLE_DECLARATION, __VA_ARGS__)
+
+// VTABLE_INSTANCE(exception_name)(msg [, others...]);
+// Create the instance of the virtual table. There must be exactly one instance of a virtual table
+// for each exception type. This fills in most of the fields of the virtual table (uses ?=? and
+// ^?{}) but you must provide the message function and any other fields added in the declaration.
+#define VTABLE_INSTANCE(...) _EXC_DISPATCH(_VTABLE_INSTANCE, __VA_ARGS__)
+
+// FORALL_VTABLE_DECLARATION(exception_name, (assertions...), (parameters...))([new_features...]);
+// Declare a polymorphic virtual table type for an exception with exception_name, the given
+// assertions and parameters. You may also add features (fields on the virtual table). This just
+// generates the polymorphic framework, see POLY_VTABLE_DECLARATION to allow instantiations.
+#define FORALL_VTABLE_DECLARATION(exception_name, assertions, parameters) \
+	_FORALL_VTABLE_DECLARATION(exception_name, __cfaehm_base_exception_t, assertions, parameters, )
+
+// POLY_VTABLE_DECLARATION(exception_name, types...);
+// Declares that an instantiation for this exception exists for the given types. This should be
+// visible anywhere the instantiation of the exception is used.
+#define POLY_VTABLE_DECLARATION(exception_name, ...) \
+	void mark_exception(exception_name(__VA_ARGS__) *); \
+	extern VTABLE_TYPE(exception_name)(__VA_ARGS__) VTABLE_NAME(exception_name)
+
+// POLY_VTABLE_INSTANCE(exception_name, types...)(msg [, others...]);
+// Creates an instantiation for the given exception for the given types. This should occur only
+// once in the entire program. You must fill in all features, message and any others given in the
+// initial declaration.
+#define POLY_VTABLE_INSTANCE(exception_name, ...) \
+	_POLY_VTABLE_INSTANCE(exception_name, __cfaehm_base_exception_t, __VA_ARGS__)
+
+// VTABLE_TYPE(exception_name) | VTABLE_NAME(exception_name)
+// Get the name of the vtable type or the name of the vtable instance for an exception type.
+#define VTABLE_TYPE(exception_name) struct _GLUE2(exception_name,_vtable)
+#define VTABLE_NAME(exception_name) _GLUE3(_,exception_name,_vtable_instance)
+
+// VTABLE_FIELD(exception_name);
+// FORALL_VTABLE_FIELD(exception_name, (parameters-or-types));
+// The declaration of the virtual table field. Should be the first declaration in a virtual type.
+#define VTABLE_FIELD(exception_name) VTABLE_TYPE(exception_name) const * virtual_table
+#define FORALL_VTABLE_FIELD(exception_name, parameters) \
+	VTABLE_TYPE(exception_name) parameters const * virtual_table
+
+// VTABLE_INIT(object_reference, exception_name);
+// Sets a virtual table field on an object to the virtual table instance for the type.
+#define VTABLE_INIT(this, exception_name) (this).virtual_table = &VTABLE_NAME(exception_name)
+
+// VTABLE_ASSERTION(exception_name, (parameters...))
+// The assertion that there is an instantiation of the vtable for the exception and types.
+#define VTABLE_ASSERTION(exception_name, parameters) \
+	{ VTABLE_TYPE(exception_name) parameters VTABLE_NAME(exception_name); }
+
+// All internal helper macros begin with an underscore.
+#define _CLOSE(...) __VA_ARGS__ }
+#define _GLUE2(left, right) left##right
+#define _GLUE3(left, middle, right) left##middle##right
+#define _EXC_DISPATCH(to, ...) to(__VA_ARGS__,__cfaehm_base_exception_t,)
+#define _UNPACK(...) __VA_ARGS__
+
+#define _TRIVIAL_EXCEPTION_DECLARATION(exception_name, parent_name, ...) \
 	_VTABLE_DECLARATION(exception_name, parent_name)(); \
 	struct exception_name { \
@@ -46,5 +140,6 @@
 	void ?{}(exception_name & this); \
 	const char * _GLUE2(exception_name,_msg)(exception_name * this)
-#define _INST_TRIVIAL_EXCEPTION(exception_name, parent_name, ...) \
+
+#define _TRIVIAL_EXCEPTION_INSTANCE(exception_name, parent_name, ...) \
 	void ?{}(exception_name & this) { \
 		VTABLE_INIT(this, exception_name); \
@@ -55,16 +150,38 @@
 	_VTABLE_INSTANCE(exception_name, parent_name,)(_GLUE2(exception_name,_msg))
 
-// DATA_EXCEPTION(exception_name)(fields...);
-// Forward declare an exception that adds fields but no features. The added fields go in the
-// second argument list. The virtual table instance must be provided later (see VTABLE_INSTANCE).
-#define DATA_EXCEPTION(...) _EXC_DISPATCH(_DATA_EXCEPTION, __VA_ARGS__)
+#define _FORALL_TRIVIAL_EXCEPTION(exception_name, parent_name, assertions, \
+		parameters, parent_parameters) \
+	_FORALL_VTABLE_DECLARATION(exception_name, parent_name, assertions, \
+		parameters, parent_parameters)(); \
+	forall assertions struct exception_name { \
+		FORALL_VTABLE_FIELD(exception_name, parameters); \
+	}; \
+	_FORALL_CTOR0_DECLARATION(exception_name, assertions, parameters)
+
+#define _FORALL_CTOR0_DECLARATION(exception_name, assertions, parameters) \
+	forall(_UNPACK assertions | VTABLE_ASSERTION(exception_name, parameters) ) \
+		/* Default-constructor prototype; the assertion requires the exception's vtable instance. */ \
+	void ?{}(exception_name parameters & this)
+
+#define _FORALL_CTOR0_INSTANCE(exception_name, assertions, parameters) \
+	_FORALL_CTOR0_DECLARATION(exception_name, assertions, parameters) { \
+		VTABLE_INIT(this, exception_name); \
+	}
+
 #define _DATA_EXCEPTION(exception_name, parent_name, ...) \
 	_VTABLE_DECLARATION(exception_name, parent_name)(); \
-	struct exception_name { VTABLE_FIELD(exception_name); _CLOSE
-
-// VTABLE_DECLARATION(exception_name)([new_features...]);
-// Declare a virtual table type for an exception with exception_name. You may also add features
-// (fields on the virtual table) by including them in the second list.
-#define VTABLE_DECLARATION(...) _EXC_DISPATCH(_VTABLE_DECLARATION, __VA_ARGS__)
+	struct exception_name { \
+		VTABLE_FIELD(exception_name); \
+		_CLOSE
+
+#define _FORALL_DATA_EXCEPTION(exception_name, parent_name, \
+		assertions, parameters, parent_parameters) \
+	_FORALL_VTABLE_DECLARATION(exception_name, parent_name, \
+		assertions, parameters, parent_parameters)(); \
+	_FORALL_CTOR0_DECLARATION(exception_name, assertions, parameters); \
+	forall assertions struct exception_name { \
+		FORALL_VTABLE_FIELD(exception_name, parameters); \
+		_CLOSE
+
 #define _VTABLE_DECLARATION(exception_name, parent_name, ...) \
 	struct exception_name; \
@@ -80,9 +197,4 @@
 		_CLOSE
 
-// VTABLE_INSTANCE(exception_name)(msg [, others...]);
-// Create the instance of the virtual table. There must be exactly one instance of a virtual table
-// for each exception type. This fills in most of the fields of the virtual table (uses ?=? and
-// ^?{}) but you must provide the message function and any other fields added in the declaration.
-#define VTABLE_INSTANCE(...) _EXC_DISPATCH(_VTABLE_INSTANCE, __VA_ARGS__)
 #define _VTABLE_INSTANCE(exception_name, parent_name, ...) \
 	void mark_exception(exception_name *) {} \
@@ -95,14 +207,23 @@
 		_CLOSE
 
-// VTABLE_TYPE(exception_name) | VTABLE_NAME(exception_name)
-// Get the name of the vtable type or the name of the vtable instance for an exception type.
-#define VTABLE_TYPE(exception_name) struct _GLUE2(exception_name,_vtable)
-#define VTABLE_NAME(exception_name) _GLUE3(_,exception_name,_vtable_instance)
-
-// VTABLE_FIELD(exception_name);
-// The declaration of the virtual table field. Should be the first declaration in a virtual type.
-#define VTABLE_FIELD(exception_name) VTABLE_TYPE(exception_name) const * virtual_table
-
-// VTABLE_INIT(object_reference, exception_name);
-// Sets a virtual table field on an object to the virtual table instance for the type.
-#define VTABLE_INIT(this, exception_name) (this).virtual_table = &VTABLE_NAME(exception_name)
+#define _FORALL_VTABLE_DECLARATION(exception_name, parent_name, assertions, \
+		parameters, parent_parameters) \
+	forall assertions struct exception_name; \
+	forall assertions VTABLE_TYPE(exception_name) { \
+		VTABLE_TYPE(parent_name) parent_parameters const * parent; \
+		size_t size; \
+		void (*copy)(exception_name parameters * this, exception_name parameters * other); \
+		void (*free)(exception_name parameters & this); \
+		const char * (*msg)(exception_name parameters * this); \
+		_CLOSE
+
+#define _POLY_VTABLE_INSTANCE(exception_name, parent_name, ...) \
+	void mark_exception(exception_name(__VA_ARGS__) *) {} \
+	void _GLUE2(exception_name,_copy)( \
+			exception_name(__VA_ARGS__) * this, exception_name(__VA_ARGS__) * other) { \
+		*this = *other; \
+	} \
+	VTABLE_TYPE(exception_name)(__VA_ARGS__) VTABLE_NAME(exception_name) @= { \
+		&VTABLE_NAME(parent_name), sizeof(exception_name(__VA_ARGS__)), \
+		_GLUE2(exception_name,_copy), ^?{}, \
+		_CLOSE
Index: libcfa/src/heap.cfa
===================================================================
--- libcfa/src/heap.cfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/heap.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -10,6 +10,6 @@
 // Created On       : Tue Dec 19 21:58:35 2017
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Mon Jul 20 23:00:32 2020
-// Update Count     : 808
+// Last Modified On : Mon Aug  3 19:01:22 2020
+// Update Count     : 828
 //
 
@@ -222,8 +222,9 @@
 
 // Bucket size must be multiple of 16.
-// Powers of 2 are common allocation sizes, so make powers of 2 generate the minimum required size.
+// Smaller multiples of 16 and powers of 2 are common allocation sizes, so make them generate the minimum required bucket size.
+// malloc(0) returns 0p, so no bucket is necessary for 0 bytes returning an address that can be freed.
 static const unsigned int bucketSizes[] @= {			// different bucket sizes
-	16, 32, 48, 64 + sizeof(HeapManager.Storage), // 4
-	96, 112, 128 + sizeof(HeapManager.Storage), // 3
+	16 + sizeof(HeapManager.Storage), 32 + sizeof(HeapManager.Storage), 48 + sizeof(HeapManager.Storage), 64 + sizeof(HeapManager.Storage), // 4
+	96 + sizeof(HeapManager.Storage), 112 + sizeof(HeapManager.Storage), 128 + sizeof(HeapManager.Storage), // 3
 	160, 192, 224, 256 + sizeof(HeapManager.Storage), // 4
 	320, 384, 448, 512 + sizeof(HeapManager.Storage), // 4
@@ -434,4 +435,6 @@
 		#endif // __CFA_DEBUG__
 		header = realHeader( header );					// backup from fake to real header
+	} else {
+		alignment = 0;
 	} // if
 } // fakeHeader
@@ -481,5 +484,6 @@
 			unlock( extlock );
 			errno = ENOMEM;
-			return 0p;
+//			return 0p;
+			abort( "no memory" );
 		} // if
 		#ifdef __STATISTICS__
@@ -550,5 +554,5 @@
 
 			block = (HeapManager.Storage *)extend( tsize );	// mutual exclusion on call
-	if ( unlikely( block == 0p ) ) return 0p;
+//	if ( unlikely( block == 0p ) ) return 0p;
 		#if BUCKETLOCK == SPINLOCK
 		} else {
@@ -746,12 +750,11 @@
 
 static inline void * mallocNoStats( size_t size ) {		// necessary for malloc statistics
-	//assert( heapManager.heapBegin != 0 );
-	if ( unlikely( heapManager.heapBegin == 0p ) ) heapManager{}; // called before memory_startup ?
+	verify( heapManager.heapBegin != 0 );				// called before memory_startup ?
+  if ( size == 0 ) return 0p;							// 0 BYTE ALLOCATION RETURNS NULL POINTER
+
 #if __SIZEOF_POINTER__ == 8
 	verify( size < ((typeof(size_t))1 << 48) );
 #endif // __SIZEOF_POINTER__ == 8
-	void * addr = doMalloc( size );
-	if ( unlikely( addr == 0p ) ) errno = ENOMEM;		// POSIX
-	return addr;
+	return doMalloc( size );
 } // mallocNoStats
 
@@ -759,19 +762,21 @@
 static inline void * callocNoStats( size_t dim, size_t elemSize ) {
 	size_t size = dim * elemSize;
+  if ( size == 0 ) return 0p;							// 0 BYTE ALLOCATION RETURNS NULL POINTER
 	char * addr = (char *)mallocNoStats( size );
-  if ( unlikely( addr == 0p ) ) return 0p;
 
 	HeapManager.Storage.Header * header;
 	HeapManager.FreeHeader * freeElem;
 	size_t bsize, alignment;
-	bool mapped __attribute__(( unused )) = headers( "calloc", addr, header, freeElem, bsize, alignment );
+	#ifndef __CFA_DEBUG__
+	bool mapped =
+	#endif // __CFA_DEBUG__
+		headers( "calloc", addr, header, freeElem, bsize, alignment );
 	#ifndef __CFA_DEBUG__
 	// Mapped storage is zero filled, but in debug mode mapped memory is scrubbed in doMalloc, so it has to be reset to zero.
 	if ( ! mapped )
 	#endif // __CFA_DEBUG__
-		// Zero entire data space even when > than size => realloc without a new allocation and zero fill works.
-		// <-------00000000000000000000000000000000000000000000000000000> bsize (bucket size)
+		// <-------0000000000000000000000000000UUUUUUUUUUUUUUUUUUUUUUUUU> bsize (bucket size) U => undefined
 		// `-header`-addr                      `-size
-		memset( addr, '\0', bsize - sizeof(HeapManager.Storage) ); // set to zeros
+		memset( addr, '\0', size );						// set to zeros
 
 	header->kind.real.blockSize |= 2;					// mark as zero filled
@@ -781,4 +786,6 @@
 
 static inline void * memalignNoStats( size_t alignment, size_t size ) { // necessary for malloc statistics
+  if ( size == 0 ) return 0p;							// 0 BYTE ALLOCATION RETURNS NULL POINTER
+
 	#ifdef __CFA_DEBUG__
 	checkAlign( alignment );							// check alignment
@@ -798,5 +805,4 @@
 	// add sizeof(Storage) for fake header
 	char * addr = (char *)mallocNoStats( size + alignment - libAlign() + sizeof(HeapManager.Storage) );
-  if ( unlikely( addr == 0p ) ) return addr;
 
 	// address in the block of the "next" alignment address
@@ -805,4 +811,5 @@
 	// address of header from malloc
 	HeapManager.Storage.Header * realHeader = headerAddr( addr );
+	realHeader->kind.real.size = size;					// correct size to eliminate above alignment offset
 	// address of fake header * before* the alignment location
 	HeapManager.Storage.Header * fakeHeader = headerAddr( user );
@@ -818,6 +825,7 @@
 static inline void * cmemalignNoStats( size_t alignment, size_t dim, size_t elemSize ) {
 	size_t size = dim * elemSize;
+  if ( size == 0 ) return 0p;							// 0 BYTE ALLOCATION RETURNS NULL POINTER
 	char * addr = (char *)memalignNoStats( alignment, size );
-  if ( unlikely( addr == 0p ) ) return 0p;
+
 	HeapManager.Storage.Header * header;
 	HeapManager.FreeHeader * freeElem;
@@ -889,5 +897,5 @@
 
 		// If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
-	  if ( unlikely( size == 0 ) ) { free( oaddr ); return mallocNoStats( size ); } // special cases
+	  if ( unlikely( size == 0 ) ) { free( oaddr ); return 0p; } // special cases
 	  if ( unlikely( oaddr == 0p ) ) return mallocNoStats( size );
 
@@ -901,5 +909,5 @@
 	  if ( oalign == 0 && size <= odsize && odsize <= size * 2 ) { // allow 50% wasted storage for smaller size
 			header->kind.real.blockSize &= -2;			// no alignment and turn off 0 fill
-			if ( size != odsize ) header->kind.real.size = size; // reset allocation size
+			header->kind.real.size = size;				// reset allocation size
 			return oaddr;
 		} // if
@@ -907,6 +915,5 @@
 		// change size, DO NOT preserve STICKY PROPERTIES.
 		free( oaddr );
-		void * naddr = mallocNoStats( size );			// create new area
-		return naddr;
+		return mallocNoStats( size );					// create new area
 	} // resize
 
@@ -921,5 +928,5 @@
 
 		// If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
-	  if ( unlikely( size == 0 ) ) { free( oaddr ); return mallocNoStats( size ); } // special cases
+	  if ( unlikely( size == 0 ) ) { free( oaddr ); return 0p; } // special cases
 	  if ( unlikely( oaddr == 0p ) ) return mallocNoStats( size );
 
@@ -930,6 +937,11 @@
 
 		size_t odsize = dataStorage( bsize, oaddr, header ); // data storage available in bucket
-	  if ( size <= odsize && odsize <= size * 2 ) {		// allow up to 50% wasted storage in smaller size
-			if ( size != odsize ) header->kind.real.size = size; // reset allocation size
+		size_t osize = header->kind.real.size;			// old allocation size
+		bool ozfill = (header->kind.real.blockSize & 2) != 0; // old allocation zero filled
+	  if ( unlikely( size <= odsize ) && size > odsize / 2 ) { // allow up to 50% wasted storage
+	  		header->kind.real.size = size;				// reset allocation size
+	  		if ( unlikely( ozfill ) && size > osize ) {	// previous request zero fill and larger ?
+	  			memset( (char *)oaddr + osize, (int)'\0', size - osize ); // initialize added storage
+	  		} // if
 			return oaddr;
 		} // if
@@ -938,24 +950,20 @@
 
 		void * naddr;
-		if ( unlikely( oalign != 0 ) ) {				// previous request memalign?
-			if ( unlikely( header->kind.real.blockSize & 2 ) ) { // previous request zero fill
-				naddr = cmemalignNoStats( oalign, 1, size ); // create new aligned area
-			} else {
-				naddr = memalignNoStats( oalign, size ); // create new aligned area
+		if ( likely( oalign == 0 ) ) {					// previous request memalign?
+			naddr = mallocNoStats( size );				// create new area
+		} else {
+			naddr = memalignNoStats( oalign, size );	// create new aligned area
+		} // if
+
+		headers( "realloc", naddr, header, freeElem, bsize, oalign );
+		memcpy( naddr, oaddr, MIN( osize, size ) );		// copy bytes
+		free( oaddr );
+
+		if ( unlikely( ozfill ) ) {						// previous request zero fill ?
+			header->kind.real.blockSize |= 2;			// mark new request as zero filled
+			if ( size > osize ) {						// new request larger ?
+				memset( (char *)naddr + osize, (int)'\0', size - osize ); // initialize added storage
 			} // if
-		} else {
-			if ( unlikely( header->kind.real.blockSize & 2 ) ) { // previous request zero fill
-				naddr = callocNoStats( 1, size );		// create new area
-			} else {
-				naddr = mallocNoStats( size );			// create new area
-			} // if
-		} // if
-	  if ( unlikely( naddr == 0p ) ) return 0p;
-
-		headers( "realloc", naddr, header, freeElem, bsize, oalign );
-		size_t ndsize = dataStorage( bsize, naddr, header ); // data storage avilable in bucket
-		// To preserve prior fill, the entire bucket must be copied versus the size.
-		memcpy( naddr, oaddr, MIN( odsize, ndsize ) );	// copy bytes
-		free( oaddr );
+		} // if
 		return naddr;
 	} // realloc
@@ -1007,5 +1015,4 @@
 	  if ( alignment < sizeof(void *) || ! libPow2( alignment ) ) return EINVAL; // check alignment
 		* memptr = memalign( alignment, size );
-	  if ( unlikely( * memptr == 0p ) ) return ENOMEM;
 		return 0;
 	} // posix_memalign
@@ -1205,7 +1212,6 @@
 
 	// If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
-  if ( unlikely( size == 0 ) ) { free( oaddr ); return memalignNoStats( nalign, size ); } // special cases
+  if ( unlikely( size == 0 ) ) { free( oaddr ); return 0p; } // special cases
   if ( unlikely( oaddr == 0p ) ) return memalignNoStats( nalign, size );
-
 
 	if ( unlikely( nalign == 0 ) ) nalign = libAlign();	// reset alignment to minimum
@@ -1234,11 +1240,5 @@
 	// change size
 
-	void * naddr;
-	if ( unlikely( header->kind.real.blockSize & 2 ) ) { // previous request zero fill
-		naddr = cmemalignNoStats( nalign, 1, size );	// create new aligned area
-	} else {
-		naddr = memalignNoStats( nalign, size );		// create new aligned area
-	} // if
-
+	void * naddr = memalignNoStats( nalign, size );		// create new aligned area
 	free( oaddr );
 	return naddr;
@@ -1257,5 +1257,4 @@
 	size_t bsize, oalign = 0;
 	headers( "realloc", oaddr, header, freeElem, bsize, oalign );
-	size_t odsize = dataStorage( bsize, oaddr, header ); // data storage available in bucket
 
 	if ( oalign <= nalign && (uintptr_t)oaddr % nalign == 0 ) { // <= alignment and new alignment happens to match
@@ -1273,6 +1272,9 @@
 	#endif // __STATISTICS__
 
+	size_t osize = header->kind.real.size;			// old allocation size
+	bool ozfill = (header->kind.real.blockSize & 2) != 0; // old allocation zero filled
+
 	// If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
-  if ( unlikely( size == 0 ) ) { free( oaddr ); return memalignNoStats( nalign, size ); } // special cases
+  if ( unlikely( size == 0 ) ) { free( oaddr ); return 0p; } // special cases
   if ( unlikely( oaddr == 0p ) ) return memalignNoStats( nalign, size );
 
@@ -1285,8 +1287,13 @@
 
 	headers( "realloc", naddr, header, freeElem, bsize, oalign );
-	size_t ndsize = dataStorage( bsize, naddr, header ); // data storage available in bucket
-	// To preserve prior fill, the entire bucket must be copied versus the size.
-	memcpy( naddr, oaddr, MIN( odsize, ndsize ) );		// copy bytes
+	memcpy( naddr, oaddr, MIN( osize, size ) );			// copy bytes
 	free( oaddr );
+
+	if ( unlikely( ozfill ) ) {							// previous request zero fill ?
+		header->kind.real.blockSize |= 2;				// mark new request as zero filled
+		if ( size > osize ) {							// new request larger ?
+			memset( (char *)naddr + osize, (int)'\0', size - osize ); // initialize added storage
+		} // if
+	} // if
 	return naddr;
 } // realloc
Index: libcfa/src/iostream.hfa
===================================================================
--- libcfa/src/iostream.hfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/iostream.hfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -363,5 +363,5 @@
 	_Istream_Cstr excl( const char scanset[], char * s ) { return (_Istream_Cstr){ s, scanset, -1, { .flags.inex : true } }; }
 	_Istream_Cstr & excl( const char scanset[], _Istream_Cstr & fmt ) { fmt.scanset = scanset; fmt.flags.inex = true; return fmt; }
-	_Istream_Cstr ignore( const char s[] ) { return (_Istream_Cstr)@{ s, 0p, -1, { .flags.ignore : true } }; }
+	_Istream_Cstr ignore( char s[] ) { return (_Istream_Cstr)@{ s, 0p, -1, { .flags.ignore : true } }; }
 	_Istream_Cstr & ignore( _Istream_Cstr & fmt ) { fmt.flags.ignore = true; return fmt; }
 	_Istream_Cstr wdi( unsigned int w, char s[] ) { return (_Istream_Cstr)@{ s, 0p, w, { .all : 0 } }; }
Index: libcfa/src/stdlib.hfa
===================================================================
--- libcfa/src/stdlib.hfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ libcfa/src/stdlib.hfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -10,6 +10,6 @@
 // Created On       : Thu Jan 28 17:12:35 2016
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Tue Jul 21 07:58:05 2020
-// Update Count     : 475
+// Last Modified On : Thu Jul 30 16:14:58 2020
+// Update Count     : 490
 //
 
@@ -71,10 +71,12 @@
 	T * resize( T * ptr, size_t size ) {				// CFA resize, eliminate return-type cast
 		$RE_SPECIALS( ptr, size, malloc, memalign );
-		return (T *)(void *)resize( (void *)ptr, size ); // CFA resize
+		if ( _Alignof(T) <= libAlign() ) return (T *)(void *)resize( (void *)ptr, size ); // CFA resize
+		else return (T *)(void *)resize( (void *)ptr, _Alignof(T), size ); // CFA resize
 	} // resize
 
 	T * realloc( T * ptr, size_t size ) {				// CFA realloc, eliminate return-type cast
 		$RE_SPECIALS( ptr, size, malloc, memalign );
-		return (T *)(void *)realloc( (void *)ptr, size ); // C realloc
+		if ( _Alignof(T) <= libAlign() ) return (T *)(void *)realloc( (void *)ptr, size ); // C realloc
+		else return (T *)(void *)realloc( (void *)ptr, _Alignof(T), size ); // CFA realloc
 	} // realloc
 
@@ -121,15 +123,8 @@
 	forall( dtype S | sized(S) )
 	T * alloc( S ptr[], size_t dim = 1 ) {				// singleton/array resize
-		size_t len = malloc_usable_size( ptr );			// current bucket size
-		if ( sizeof(T) * dim > len ) {					// not enough space ?
-			T * temp = alloc( dim );					// new storage
-			free( ptr );								// free old storage
-			return temp;
-		} else {
-			return (T *)ptr;
-		} // if
-	} // alloc
-
-	T * alloc( T ptr[], size_t dim, bool copy = true ) {
+		return resize( (T *)ptr, dim * sizeof(T) );		// CFA resize
+	} // alloc
+
+	T * alloc( T ptr[], size_t dim = 1, bool copy = true ) {
 		if ( copy ) {
 			return realloc( ptr, dim * sizeof(T) );		// CFA realloc
@@ -168,5 +163,5 @@
 			memset( (char *)nptr + osize, (int)fill, nsize - osize ); // initialize added storage
 		} // if
-		return (T *)nptr;
+		return nptr;
 	} // alloc_set
 
@@ -181,5 +176,5 @@
 			} // for
 		} // if
-		return (T *)nptr;
+		return nptr;
 	} // alloc_align_set
 } // distribution
@@ -195,5 +190,5 @@
 
 	T * alloc_align( T * ptr, size_t align ) {			// aligned realloc array
-		return (T *)(void *)realloc( (void *)ptr, align, sizeof(T) ); // CFA realloc
+		return (T *)(void *)realloc( (void *)ptr, align, sizeof(T) ); // CFA C realloc
 	} // alloc_align
 
@@ -232,9 +227,9 @@
 		size_t osize = malloc_size( ptr );				// current allocation
 		size_t nsize = dim * sizeof(T);					// new allocation
-		T * nptr = realloc( ptr, align, nsize );		// CFA realloc
+		T * nptr = alloc_align( ptr, align, nsize );
 		if ( nsize > osize ) {							// larger ?
 			memset( (char *)nptr + osize, (int)fill, nsize - osize ); // initialize added storage
 		} // if
-		return (T *)nptr;
+		return nptr;
 	} // alloc_align_set
 
@@ -243,5 +238,5 @@
 		size_t nsize = dim * sizeof(T);					// new allocation
 		size_t ndim = nsize / sizeof(T);				// new dimension
-		T * nptr = realloc( ptr, align, nsize );		// CFA realloc
+		T * nptr = alloc_align( ptr, align, nsize );
 		if ( ndim > odim ) {							// larger ?
 			for ( i; odim ~ ndim ) {
@@ -249,5 +244,5 @@
 			} // for
 		} // if
-		return (T *)nptr;
+		return nptr;
 	} // alloc_align_set
 } // distribution
Index: longrun_tests/Makefile.in
===================================================================
--- longrun_tests/Makefile.in	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ longrun_tests/Makefile.in	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -483,6 +483,10 @@
 AUTOMAKE_OPTIONS = foreign    # do not require all the GNU file names
 ACLOCAL_AMFLAGS = -I automake
-CFACOMPILE = $(CFACC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CFAFLAGS) $(CFAFLAGS) $(AM_CFLAGS) $(CFLAGS)
-LTCFACOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+AM_T_CFA = $(am__t_CFA_@AM_T@)
+am__t_CFA_ = 
+am__t_CFA_0 = 
+am__t_CFA_1 = /usr/bin/time --quiet -f "$@ %E" # trailing space is necessary
+CFACOMPILE = $(AM_T_CFA)$(CFACC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CFAFLAGS) $(CFAFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCFACOMPILE = $(AM_T_CFA)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=compile $(CFACC) $(DEFS) \
 	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CFAFLAGS) $(AM_CFLAGS) $(CFAFLAGS) $(CFLAGS)
Index: src/Common/ScopedMap.h
===================================================================
--- src/Common/ScopedMap.h	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ src/Common/ScopedMap.h	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -93,5 +93,5 @@
 
 		reference operator* () { return *it; }
-		pointer operator-> () { return it.operator->(); }
+		pointer operator-> () const { return it.operator->(); }
 
 		iterator& operator++ () {
Index: src/Concurrency/Keywords.cc
===================================================================
--- src/Concurrency/Keywords.cc	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ src/Concurrency/Keywords.cc	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -510,5 +510,6 @@
 						new CastExpr(
 							new VariableExpr( func->get_functionType()->get_parameters().front() ),
-							func->get_functionType()->get_parameters().front()->get_type()->stripReferences()->clone()
+							func->get_functionType()->get_parameters().front()->get_type()->stripReferences()->clone(),
+							false
 						)
 					)
@@ -888,5 +889,5 @@
 			new SingleInit( new UntypedExpr(
 				new NameExpr( "get_monitor" ),
-				{  new CastExpr( new VariableExpr( args.front() ), arg_type ) }
+				{  new CastExpr( new VariableExpr( args.front() ), arg_type, false ) }
 			))
 		);
@@ -909,5 +910,5 @@
 					{
 						new SingleInit( new AddressExpr( new VariableExpr( monitors ) ) ),
-						new SingleInit( new CastExpr( new VariableExpr( func ), generic_func->clone() ) )
+						new SingleInit( new CastExpr( new VariableExpr( func ), generic_func->clone(), false ) )
 					},
 					noDesignators,
@@ -946,5 +947,5 @@
 					return new SingleInit( new UntypedExpr(
 						new NameExpr( "get_monitor" ),
-						{  new CastExpr( new VariableExpr( var ), type ) }
+						{  new CastExpr( new VariableExpr( var ), type, false ) }
 					) );
 				})
@@ -970,5 +971,5 @@
 						new SingleInit( new VariableExpr( monitors ) ),
 						new SingleInit( new ConstantExpr( Constant::from_ulong( args.size() ) ) ),
-						new SingleInit( new CastExpr( new VariableExpr( func ), generic_func->clone() ) )
+						new SingleInit( new CastExpr( new VariableExpr( func ), generic_func->clone(), false ) )
 					},
 					noDesignators,
Index: src/Concurrency/Waitfor.cc
===================================================================
--- src/Concurrency/Waitfor.cc	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ src/Concurrency/Waitfor.cc	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -384,5 +384,6 @@
 								decl_monitor
 							)
-						)
+						),
+						false
 					);
 
@@ -408,5 +409,5 @@
 			new CompoundStmt({
 				makeAccStatement( acceptables, index, "is_dtor", detectIsDtor( clause.target.function )                                    , indexer ),
-				makeAccStatement( acceptables, index, "func"   , new CastExpr( clause.target.function, fptr_t )                            , indexer ),
+				makeAccStatement( acceptables, index, "func"   , new CastExpr( clause.target.function, fptr_t, false )                     , indexer ),
 				makeAccStatement( acceptables, index, "data"   , new VariableExpr( monitors )                                              , indexer ),
 				makeAccStatement( acceptables, index, "size"   , new ConstantExpr( Constant::from_ulong( clause.target.arguments.size() ) ), indexer ),
@@ -531,5 +532,6 @@
 								decl_mask
 							)
-						)
+						),
+						false
 					),
 					timeout
Index: src/Parser/ExpressionNode.cc
===================================================================
--- src/Parser/ExpressionNode.cc	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ src/Parser/ExpressionNode.cc	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -427,15 +427,15 @@
 		if ( str[1] == '8' ) goto Default;				// utf-8 characters => array of char
 		// lookup type of associated typedef
-		strtype = new TypeInstType( Type::Qualifiers( Type::Const ), "char16_t", false );
+		strtype = new TypeInstType( Type::Qualifiers( ), "char16_t", false );
 		break;
 	  case 'U':
-		strtype = new TypeInstType( Type::Qualifiers( Type::Const ), "char32_t", false );
+		strtype = new TypeInstType( Type::Qualifiers( ), "char32_t", false );
 		break;
 	  case 'L':
-		strtype = new TypeInstType( Type::Qualifiers( Type::Const ), "wchar_t", false );
+		strtype = new TypeInstType( Type::Qualifiers( ), "wchar_t", false );
 		break;
 	  Default:											// char default string type
 	  default:
-		strtype = new BasicType( Type::Qualifiers( Type::Const ), BasicType::Char );
+		strtype = new BasicType( Type::Qualifiers( ), BasicType::Char );
 	} // switch
 	ArrayType * at = new ArrayType( noQualifiers, strtype,
Index: src/ResolvExpr/AlternativeFinder.cc
===================================================================
--- src/ResolvExpr/AlternativeFinder.cc	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ src/ResolvExpr/AlternativeFinder.cc	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -1216,6 +1216,8 @@
 			unify( castExpr->result, alt.expr->result, alt.env, needAssertions,
 				haveAssertions, openVars, indexer );
-			Cost thisCost = castCost( alt.expr->result, castExpr->result, alt.expr->get_lvalue(),
-				indexer, alt.env );
+			Cost thisCost =
+				castExpr->isGenerated
+				? conversionCost( alt.expr->result, castExpr->result, alt.expr->get_lvalue(),	indexer, alt.env )
+				: castCost( alt.expr->result, castExpr->result, alt.expr->get_lvalue(),	indexer, alt.env );
 			PRINT(
 				std::cerr << "working on cast with result: " << castExpr->result << std::endl;
@@ -1698,9 +1700,23 @@
 
 				// unification run for side-effects
-				unify( toType, alt.expr->result, newEnv, need, have, openVars, indexer );
+				bool canUnify = unify( toType, alt.expr->result, newEnv, need, have, openVars, indexer );
+				(void) canUnify;
 				// xxx - do some inspecting on this line... why isn't result bound to initAlt.type?
 
-				Cost thisCost = castCost( alt.expr->result, toType, alt.expr->get_lvalue(),
+				Cost thisCost = computeConversionCost( alt.expr->result, toType, alt.expr->get_lvalue(),
 					indexer, newEnv );
+
+				PRINT(
+					Cost legacyCost = castCost( alt.expr->result, toType, alt.expr->get_lvalue(),
+						indexer, newEnv );
+					std::cerr << "Considering initialization:";
+					std::cerr << std::endl << "  FROM: "; alt.expr->result->print(std::cerr);
+					std::cerr << std::endl << "  TO: ";   toType          ->print(std::cerr);
+					std::cerr << std::endl << "  Unification " << (canUnify ? "succeeded" : "failed");
+					std::cerr << std::endl << "  Legacy cost " << legacyCost;
+					std::cerr << std::endl << "  New cost " << thisCost;
+					std::cerr << std::endl;
+				)
+				
 				if ( thisCost != Cost::infinity ) {
 					// count one safe conversion for each value that is thrown away
Index: src/ResolvExpr/ConversionCost.cc
===================================================================
--- src/ResolvExpr/ConversionCost.cc	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ src/ResolvExpr/ConversionCost.cc	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -10,6 +10,6 @@
 // Created On       : Sun May 17 07:06:19 2015
 // Last Modified By : Andrew Beach
-// Last Modified On : Mon Aug 12 10:21:00 2019
-// Update Count     : 27
+// Last Modified On : Wed Jul 29 16:11:00 2020
+// Update Count     : 28
 //
 
@@ -392,20 +392,4 @@
 	void ConversionCost::postvisit( const FunctionType * ) {}
 
-	void ConversionCost::postvisit( const StructInstType * inst ) {
-		if ( const StructInstType * destAsInst = dynamic_cast< const StructInstType * >( dest ) ) {
-			if ( inst->name == destAsInst->name ) {
-				cost = Cost::zero;
-			} // if
-		} // if
-	}
-
-	void ConversionCost::postvisit( const UnionInstType * inst ) {
-		if ( const UnionInstType * destAsInst = dynamic_cast< const UnionInstType * >( dest ) ) {
-			if ( inst->name == destAsInst->name ) {
-				cost = Cost::zero;
-			} // if
-		} // if
-	}
-
 	void ConversionCost::postvisit( const EnumInstType * ) {
 		static Type::Qualifiers q;
@@ -681,22 +665,4 @@
 }
 
-void ConversionCost_new::postvisit( const ast::StructInstType * structInstType ) {
-	if ( const ast::StructInstType * dstAsInst =
-			dynamic_cast< const ast::StructInstType * >( dst ) ) {
-		if ( structInstType->name == dstAsInst->name ) {
-			cost = Cost::zero;
-		}
-	}
-}
-
-void ConversionCost_new::postvisit( const ast::UnionInstType * unionInstType ) {
-	if ( const ast::UnionInstType * dstAsInst =
-			dynamic_cast< const ast::UnionInstType * >( dst ) ) {
-		if ( unionInstType->name == dstAsInst->name ) {
-			cost = Cost::zero;
-		}
-	}
-}
-
 void ConversionCost_new::postvisit( const ast::EnumInstType * enumInstType ) {
 	(void)enumInstType;
Index: src/ResolvExpr/ConversionCost.h
===================================================================
--- src/ResolvExpr/ConversionCost.h	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ src/ResolvExpr/ConversionCost.h	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -10,6 +10,6 @@
 // Created On       : Sun May 17 09:37:28 2015
 // Last Modified By : Andrew Beach
-// Last Modified On : Thu Aug  8 16:13:00 2019
-// Update Count     : 6
+// Last Modified On : Wed Jul 29 16:12:00 2020
+// Update Count     : 7
 //
 
@@ -51,6 +51,4 @@
 		void postvisit( const ReferenceType * refType );
 		void postvisit( const FunctionType * functionType );
-		void postvisit( const StructInstType * aggregateUseType );
-		void postvisit( const UnionInstType * aggregateUseType );
 		void postvisit( const EnumInstType * aggregateUseType );
 		void postvisit( const TraitInstType * aggregateUseType );
@@ -102,6 +100,4 @@
 	void postvisit( const ast::ReferenceType * refType );
 	void postvisit( const ast::FunctionType * functionType );
-	void postvisit( const ast::StructInstType * structInstType );
-	void postvisit( const ast::UnionInstType * unionInstType );
 	void postvisit( const ast::EnumInstType * enumInstType );
 	void postvisit( const ast::TraitInstType * traitInstType );
Index: src/SynTree/Expression.h
===================================================================
--- src/SynTree/Expression.h	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ src/SynTree/Expression.h	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -206,5 +206,15 @@
   public:
 	Expression * arg;
-	bool isGenerated = true; // cast generated implicitly by code generation or explicit in program
+
+	// Indicates cast is introduced by the CFA type system.
+	// true for casts that the resolver introduces to force a return type
+	// false for casts from user code
+	// false for casts from desugaring advanced CFA features into simpler CFA
+	// example
+	//   int * p;     // declaration
+	//   (float *) p; // use, with subject cast
+	// subject cast isGenerated means we are considering an interpretation with a type mismatch
+	// subject cast not isGenerated means someone in charge wants it that way
+	bool isGenerated = true;
 
 	CastExpr( Expression * arg, bool isGenerated = true );
Index: src/Virtual/ExpandCasts.cc
===================================================================
--- src/Virtual/ExpandCasts.cc	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ src/Virtual/ExpandCasts.cc	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -10,6 +10,6 @@
 // Created On       : Mon Jul 24 13:59:00 2017
 // Last Modified By : Andrew Beach
-// Last Modified On : Tue Jul 22 10:04:00 2020
-// Update Count     : 3
+// Last Modified On : Fri Jul 31 10:29:00 2020
+// Update Count     : 4
 //
 
@@ -18,9 +18,8 @@
 #include <cassert>                 // for assert, assertf
 #include <iterator>                // for back_inserter, inserter
-#include <map>                     // for map, _Rb_tree_iterator, map<>::ite...
 #include <string>                  // for string, allocator, operator==, ope...
-#include <utility>                 // for pair
 
 #include "Common/PassVisitor.h"    // for PassVisitor
+#include "Common/ScopedMap.h"      // for ScopedMap
 #include "Common/SemanticError.h"  // for SemanticError
 #include "SymTab/Mangler.h"        // for mangleType
@@ -37,6 +36,13 @@
 	/// Maps virtual table types the instance for that type.
 	class VirtualTableMap final {
-		std::unordered_map<std::string, ObjectDecl *> vtable_instances;
+		ScopedMap<std::string, ObjectDecl *> vtable_instances;
 	public:
+		void enterScope() {
+			vtable_instances.beginScope();
+		}
+		void leaveScope() {
+			vtable_instances.endScope();
+		}
+
 		ObjectDecl * insert( ObjectDecl * vtableDecl ) {
 			std::string const & mangledName = SymTab::Mangler::mangleType( vtableDecl->type );
@@ -93,8 +99,4 @@
 
 	class VirtualCastCore {
-		VirtualTableMap vtable_instances;
-		FunctionDecl *vcast_decl;
-		StructDecl *pvt_decl;
-
 		Type * pointer_to_pvt(int level_of_indirection) {
 			Type * type = new StructInstType(
@@ -108,5 +110,5 @@
 	public:
 		VirtualCastCore() :
-			vtable_instances(), vcast_decl( nullptr ), pvt_decl( nullptr )
+			indexer(), vcast_decl( nullptr ), pvt_decl( nullptr )
 		{}
 
@@ -116,4 +118,9 @@
 
 		Expression * postmutate( VirtualCastExpr * castExpr );
+
+		VirtualTableMap indexer;
+	private:
+		FunctionDecl *vcast_decl;
+		StructDecl *pvt_decl;
 	};
 
@@ -135,5 +142,5 @@
 	void VirtualCastCore::premutate( ObjectDecl * objectDecl ) {
 		if ( is_vtable_inst_name( objectDecl->get_name() ) ) {
-			if ( ObjectDecl * existing = vtable_instances.insert( objectDecl ) ) {
+			if ( ObjectDecl * existing = indexer.insert( objectDecl ) ) {
 				std::string msg = "Repeated instance of virtual table, original found at: ";
 				msg += existing->location.filename;
@@ -222,5 +229,5 @@
 
 		const Type * vtable_type = getVirtualTableType( castExpr );
-		ObjectDecl * table = vtable_instances.lookup( vtable_type );
+		ObjectDecl * table = indexer.lookup( vtable_type );
 		if ( nullptr == table ) {
 			SemanticError( castLocation( castExpr ),
Index: src/cfa.make
===================================================================
--- src/cfa.make	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ src/cfa.make	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -1,4 +1,10 @@
-CFACOMPILE = $(CFACC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CFAFLAGS) $(CFAFLAGS) $(AM_CFLAGS) $(CFLAGS)
-LTCFACOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+AM_T_CFA = $(am__t_CFA_@AM_T@)
+am__t_CFA_ =
+am__t_CFA_0 =
+am__t_CFA_1 = /usr/bin/time --quiet -f "$@ %E" # trailing space is necessary
+
+
+CFACOMPILE = $(AM_T_CFA)$(CFACC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CFAFLAGS) $(CFAFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCFACOMPILE = $(AM_T_CFA)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=compile $(CFACC) $(DEFS) \
 	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CFAFLAGS) $(AM_CFLAGS) $(CFAFLAGS) $(CFLAGS)
Index: tests/.expect/castError.txt
===================================================================
--- tests/.expect/castError.txt	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ tests/.expect/castError.txt	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -1,3 +1,3 @@
-castError.cfa:21:1 error: Cannot choose between 3 alternatives for expression
+castError.cfa:23:1 error: Cannot choose between 3 alternatives for expression
 Explicit Cast of:
   Name: f
@@ -35,5 +35,5 @@
 
 
-castError.cfa:26:1 error: Cannot choose between 2 alternatives for expression
+castError.cfa:28:1 error: Cannot choose between 2 alternatives for expression
 Generated Cast of:
   Comma Expression:
@@ -62,2 +62,9 @@
 
 
+castError.cfa:30:1 error: No reasonable alternatives for expression Explicit Cast of:
+  Name: sint
+... to:
+  instance of struct S with body 1
+  ... with parameters
+    char
+
Index: tests/.expect/init1.txt
===================================================================
--- tests/.expect/init1.txt	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
+++ tests/.expect/init1.txt	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -0,0 +1,47 @@
+error: No reasonable alternatives for expression Untyped Init Expression
+  Name: rx  InitAlternative: reference to signed int
+error: No reasonable alternatives for expression Untyped Init Expression
+  Name: px  InitAlternative: pointer to signed int
+error: No reasonable alternatives for expression Untyped Init Expression
+  Name: crx  InitAlternative: reference to float
+error: No reasonable alternatives for expression Untyped Init Expression
+  Name: cpx  InitAlternative: pointer to float
+init1.cfa:94:1 error: No reasonable alternatives for expression Generated Cast of:
+  Name: rx
+... to:
+  reference to signed int
+init1.cfa:97:1 error: No reasonable alternatives for expression Applying untyped:
+  Name: ?{}
+...to:
+  Generated Cast of:
+    Variable Expression: _retval_f_py: pointer to signed int
+  ... to:
+    reference to pointer to signed int
+  Name: px
+
+init1.cfa:104:1 error: No reasonable alternatives for expression Generated Cast of:
+  Name: crx
+... to:
+  reference to float
+init1.cfa:107:1 error: No reasonable alternatives for expression Applying untyped:
+  Name: ?{}
+...to:
+  Generated Cast of:
+    Variable Expression: _retval_f_py2: pointer to float
+  ... to:
+    reference to pointer to float
+  Name: cpx
+
+init1.cfa:114:1 error: No reasonable alternatives for expression Generated Cast of:
+  Name: s
+... to:
+  reference to instance of type T (not function type)
+init1.cfa:118:1 error: No reasonable alternatives for expression Applying untyped:
+  Name: ?{}
+...to:
+  Generated Cast of:
+    Variable Expression: _retval_anycvt: pointer to instance of type T (not function type)
+  ... to:
+    reference to pointer to instance of type T (not function type)
+  Name: s
+
Index: tests/Makefile.in
===================================================================
--- tests/Makefile.in	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ tests/Makefile.in	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -355,6 +355,10 @@
 AUTOMAKE_OPTIONS = foreign    # do not require all the GNU file names
 ACLOCAL_AMFLAGS = -I automake
-CFACOMPILE = $(CFACC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CFAFLAGS) $(CFAFLAGS) $(AM_CFLAGS) $(CFLAGS)
-LTCFACOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+AM_T_CFA = $(am__t_CFA_@AM_T@)
+am__t_CFA_ = 
+am__t_CFA_0 = 
+am__t_CFA_1 = /usr/bin/time --quiet -f "$@ %E" # trailing space is necessary
+CFACOMPILE = $(AM_T_CFA)$(CFACC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CFAFLAGS) $(CFAFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCFACOMPILE = $(AM_T_CFA)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=compile $(CFACC) $(DEFS) \
 	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CFAFLAGS) $(AM_CFLAGS) $(CFAFLAGS) $(CFLAGS)
Index: tests/avltree/avl1.cfa
===================================================================
--- tests/avltree/avl1.cfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ tests/avltree/avl1.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -24,5 +24,5 @@
 tree(K, V) * create(K key, V value) {
   // infinite loop trying to resolve ... t = malloc();
-  tree(K, V) * t = malloc(sizeof(tree(K,V)));
+  tree(K, V) * t = ( tree(K, V) * ) malloc(sizeof(tree(K,V)));
   (*t){ key, value };
   return t;
Index: tests/bugs/140.cfa
===================================================================
--- tests/bugs/140.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
+++ tests/bugs/140.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -0,0 +1,10 @@
+// Trac Ticket: https://cforall.uwaterloo.ca/trac/ticket/140
+
+#include <stdio.h>
+
+int main() {
+	char c[1] = { 3 };
+	int i = 4;
+	int r = 3 < 4 ? c[0] : i;
+	printf( "%d\n", r );
+}
Index: tests/bugs/203-2.cfa
===================================================================
--- tests/bugs/203-2.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
+++ tests/bugs/203-2.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -0,0 +1,16 @@
+// Trac ticket: https://cforall.uwaterloo.ca/trac/ticket/203
+
+forall(dtype A)
+struct empty {
+	// Nothing.
+};
+
+forall(dtype C)
+struct wrap_e {
+	empty(C) field;
+};
+
+empty(int) empty_obj;
+empty(char) empty_obj;
+
+wrap_e(char) outer_obj @= { empty_obj };
Index: tests/bugs/203-7.cfa
===================================================================
--- tests/bugs/203-7.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
+++ tests/bugs/203-7.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -0,0 +1,13 @@
+// Trac ticket: https://cforall.uwaterloo.ca/trac/ticket/203
+
+forall(dtype A)
+struct empty {
+	// Nothing.
+};
+
+forall(dtype C)
+struct wrap_e {
+	empty(C) field;
+};
+
+wrap_e(bool) error_obj = { {} };
Index: tests/bugs/203-9.cfa
===================================================================
--- tests/bugs/203-9.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
+++ tests/bugs/203-9.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -0,0 +1,13 @@
+// Trac ticket: https://cforall.uwaterloo.ca/trac/ticket/203
+
+forall(dtype A)
+struct empty {
+	// Nothing.
+};
+
+forall(dtype C)
+struct wrap_e {
+	empty(C) field;
+};
+
+wrap_e(bool) error_obj @= { {} };
Index: tests/bugs/66.cfa
===================================================================
--- tests/bugs/66.cfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ tests/bugs/66.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -5,5 +5,5 @@
 
 int main() {
-	int * next = (void*)0;
+	int * next = 0p;
 	if( next ) {
 		return 1;
Index: tests/castError.cfa
===================================================================
--- tests/castError.cfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ tests/castError.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -14,5 +14,7 @@
 // 
 
+forall(otype T) struct S { T p; };
 int f;
+S(int) sint;
 
 void f() {
@@ -25,4 +27,6 @@
 	short int v;
 	3, v;		// implicit void cast
+
+	(S(char)) sint;
 }
 
Index: tests/concurrent/signal/block.cfa
===================================================================
--- tests/concurrent/signal/block.cfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ tests/concurrent/signal/block.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -82,5 +82,5 @@
 	if( !is_empty( cond ) ) {
 
-		$thread * next = front( cond );
+		$thread * next = ( $thread * ) front( cond );
 
 		if( ! signal_block( cond ) ) {
Index: tests/exceptions/.expect/polymophic.txt
===================================================================
--- tests/exceptions/.expect/polymophic.txt	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
+++ tests/exceptions/.expect/polymophic.txt	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -0,0 +1,7 @@
+terminate catch
+resume catch
+caught proxy(char)
+
+-7
+0
+1
Index: tests/exceptions/conditional.cfa
===================================================================
--- tests/exceptions/conditional.cfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ tests/exceptions/conditional.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -17,5 +17,5 @@
 };
 
-void num_error_msg(num_error * this) {
+const char * num_error_msg(num_error * this) {
     if ( ! this->msg ) {
         static const char * base = "Num Error with code: X";
Index: tests/exceptions/defaults.cfa
===================================================================
--- tests/exceptions/defaults.cfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ tests/exceptions/defaults.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -13,5 +13,5 @@
 }
 
-char * get_log_message(log_message * this) {
+const char * get_log_message(log_message * this) {
 	return this->msg;
 }
@@ -28,10 +28,10 @@
 	// We can catch log:
 	try {
-		throwResume (log_message){(char *)"Should be printed.\n"};
+		throwResume (log_message){"Should be printed.\n"};
 	} catchResume (log_message * this) {
 		printf("%s", this->virtual_table->msg(this));
 	}
 	// But we don't have to:
-	throwResume (log_message){(char *)"Should not be printed.\n"};
+	throwResume (log_message){"Should not be printed.\n"};
 }
 
Index: tests/exceptions/polymophic.cfa
===================================================================
--- tests/exceptions/polymophic.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
+++ tests/exceptions/polymophic.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -0,0 +1,72 @@
+// Testing polymorphic exception types.
+
+#include <exception.hfa>
+
+FORALL_TRIVIAL_EXCEPTION_(proxy, (otype U3), (U3));
+FORALL_TRIVIAL_INSTANCE_(proxy, (otype U4), (U4))
+
+const char * msg(proxy(int) * this) { return "proxy(int)"; }
+const char * msg(proxy(char) * this) { return "proxy(char)"; }
+POLY_VTABLE_INSTANCE(proxy, int)(msg);
+POLY_VTABLE_INSTANCE(proxy, char)(msg);
+
+void proxy_test () {
+    try {
+		throw (proxy(int)){};
+	} catch (proxy(int) *) {
+		printf("terminate catch\n");
+	}
+
+	try {
+		throwResume (proxy(char)){};
+	} catchResume (proxy(char) *) {
+		printf("resume catch\n");
+	}
+
+	try {
+		throw (proxy(char)){};
+	} catch (proxy(int) *) {
+		printf("caught proxy(int)\n");
+	} catch (proxy(char) *) {
+		printf("caught proxy(char)\n");
+	}
+}
+
+FORALL_DATA_EXCEPTION(cell, (otype T), (T))(
+	T data;
+);
+
+FORALL_DATA_INSTANCE(cell, (otype T), (T))
+
+const char * msg(cell(int) * this) { return "cell(int)"; }
+const char * msg(cell(char) * this) { return "cell(char)"; }
+const char * msg(cell(bool) * this) { return "cell(bool)"; }
+POLY_VTABLE_INSTANCE(cell, int)(msg);
+POLY_VTABLE_INSTANCE(cell, char)(msg);
+POLY_VTABLE_INSTANCE(cell, bool)(msg);
+
+void cell_test(void) {
+	try {
+		cell(int) except;
+		except.data = -7;
+		throw except;
+	} catch (cell(int) * error) {
+		printf("%d\n", error->data);
+	}
+
+	try {
+		cell(bool) ball;
+		ball.data = false;
+		throwResume ball;
+		printf("%i\n", ball.data);
+	} catchResume (cell(bool) * error) {
+		printf("%i\n", error->data);
+		error->data = true;
+	}
+}
+
+int main(int argc, char * argv[]) {
+	proxy_test();
+	printf("\n");
+	cell_test();
+}
Index: tests/heap.cfa
===================================================================
--- tests/heap.cfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ tests/heap.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -10,6 +10,6 @@
 // Created On       : Tue Nov  6 17:54:56 2018
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Sun Nov 24 12:34:51 2019
-// Update Count     : 28
+// Last Modified On : Tue Aug  4 06:36:17 2020
+// Update Count     : 56
 // 
 
@@ -75,5 +75,4 @@
 		size_t s = (i + 1) * 20;
 		char * area = (char *)malloc( s );
-		if ( area == 0p ) abort( "malloc/free out of memory" );
 		area[0] = '\345'; area[s - 1] = '\345';			// fill first/last
 		area[malloc_usable_size( area ) - 1] = '\345';	// fill ultimate byte
@@ -84,5 +83,4 @@
 		size_t s = i + 1;								// +1 to make initialization simpler
 		locns[i] = (char *)malloc( s );
-		if ( locns[i] == 0p ) abort( "malloc/free out of memory" );
 		locns[i][0] = '\345'; locns[i][s - 1] = '\345';	// fill first/last
 		locns[i][malloc_usable_size( locns[i] ) - 1] = '\345'; // fill ultimate byte
@@ -100,5 +98,4 @@
 		size_t s = i + default_mmap_start();			// cross over point
 		char * area = (char *)malloc( s );
-		if ( area == 0p ) abort( "malloc/free out of memory" );
 		area[0] = '\345'; area[s - 1] = '\345';			// fill first/last
 		area[malloc_usable_size( area ) - 1] = '\345';	// fill ultimate byte
@@ -109,5 +106,4 @@
 		size_t s = i + default_mmap_start();			// cross over point
 		locns[i] = (char *)malloc( s );
-		if ( locns[i] == 0p ) abort( "malloc/free out of memory" );
 		locns[i][0] = '\345'; locns[i][s - 1] = '\345';	// fill first/last
 		locns[i][malloc_usable_size( locns[i] ) - 1] = '\345'; // fill ultimate byte
@@ -125,7 +121,6 @@
 		size_t s = (i + 1) * 20;
 		char * area = (char *)calloc( 5, s );
-		if ( area == 0p ) abort( "calloc/free out of memory" );
 		if ( area[0] != '\0' || area[s - 1] != '\0' ||
-			 area[malloc_usable_size( area ) - 1] != '\0' ||
+			 area[malloc_size( area ) - 1] != '\0' ||
 			 ! malloc_zero_fill( area ) ) abort( "calloc/free corrupt storage1" );
 		area[0] = '\345'; area[s - 1] = '\345';			// fill first/last
@@ -137,7 +132,6 @@
 		size_t s = i + 1;
 		locns[i] = (char *)calloc( 5, s );
-		if ( locns[i] == 0p ) abort( "calloc/free out of memory" );
 		if ( locns[i][0] != '\0' || locns[i][s - 1] != '\0' ||
-			 locns[i][malloc_usable_size( locns[i] ) - 1] != '\0' ||
+			 locns[i][malloc_size( locns[i] ) - 1] != '\0' ||
 			 ! malloc_zero_fill( locns[i] ) ) abort( "calloc/free corrupt storage2" );
 		locns[i][0] = '\345'; locns[i][s - 1] = '\345';	// fill first/last
@@ -156,7 +150,6 @@
 		size_t s = i + default_mmap_start();			// cross over point
 		char * area = (char *)calloc( 1, s );
-		if ( area == 0p ) abort( "calloc/free out of memory" );
 		if ( area[0] != '\0' || area[s - 1] != '\0' ) abort( "calloc/free corrupt storage4.1" );
-		if ( area[malloc_usable_size( area ) - 1] != '\0' ) abort( "calloc/free corrupt storage4.2" );
+		if ( area[malloc_size( area ) - 1] != '\0' ) abort( "calloc/free corrupt storage4.2" );
 		if ( ! malloc_zero_fill( area ) ) abort( "calloc/free corrupt storage4.3" );
 		area[0] = '\345'; area[s - 1] = '\345';			// fill first/last
@@ -168,7 +161,6 @@
 		size_t s = i + default_mmap_start();			// cross over point
 		locns[i] = (char *)calloc( 1, s );
-		if ( locns[i] == 0p ) abort( "calloc/free out of memory" );
 		if ( locns[i][0] != '\0' || locns[i][s - 1] != '\0' ||
-			 locns[i][malloc_usable_size( locns[i] ) - 1] != '\0' ||
+			 locns[i][malloc_size( locns[i] ) - 1] != '\0' ||
 			 ! malloc_zero_fill( locns[i] ) ) abort( "calloc/free corrupt storage5" );
 		locns[i][0] = '\345'; locns[i][s - 1] = '\345';	// fill first/last
@@ -188,5 +180,4 @@
 		for ( s; 1 ~ NoOfAllocs ) {						// allocation of size 0 can return null
 			char * area = (char *)memalign( a, s );
-			if ( area == 0p ) abort( "memalign/free out of memory" );
 			//sout | i | area;
 			if ( (size_t)area % a != 0 || malloc_alignment( area ) != a ) { // check for initial alignment
@@ -206,5 +197,4 @@
 			size_t s = i + default_mmap_start();		// cross over point
 			char * area = (char *)memalign( a, s );
-			if ( area == 0p ) abort( "memalign/free out of memory" );
 			//sout | i | area;
 			if ( (size_t)area % a != 0 || malloc_alignment( area ) != a ) { // check for initial alignment
@@ -222,7 +212,6 @@
 		// initial N byte allocation
 		char * area = (char *)calloc( 5, i );
-		if ( area == 0p ) abort( "calloc/realloc/free out of memory" );
 		if ( area[0] != '\0' || area[i - 1] != '\0' ||
-			 area[malloc_usable_size( area ) - 1] != '\0' ||
+			 area[malloc_size( area ) - 1] != '\0' ||
 			 ! malloc_zero_fill( area ) ) abort( "calloc/realloc/free corrupt storage1" );
 
@@ -230,7 +219,6 @@
 		for ( s; i ~ 256 * 1024 ~ 26 ) {				// start at initial memory request
 			area = (char *)realloc( area, s );			// attempt to reuse storage
-			if ( area == 0p ) abort( "calloc/realloc/free out of memory" );
 			if ( area[0] != '\0' || area[s - 1] != '\0' ||
-				 area[malloc_usable_size( area ) - 1] != '\0' ||
+				 area[malloc_size( area ) - 1] != '\0' ||
 				 ! malloc_zero_fill( area ) ) abort( "calloc/realloc/free corrupt storage2" );
 		} // for
@@ -244,16 +232,17 @@
 		size_t s = i + default_mmap_start();			// cross over point
 		char * area = (char *)calloc( 1, s );
-		if ( area == 0p ) abort( "calloc/realloc/free out of memory" );
+// 		if ( area == 0p ) abort( "calloc/realloc/free out of memory" );
 		if ( area[0] != '\0' || area[s - 1] != '\0' ||
-			 area[malloc_usable_size( area ) - 1] != '\0' ||
-			 ! malloc_zero_fill( area ) ) abort( "calloc/realloc/free corrupt storage1" );
+			 area[malloc_size( area ) - 1] != '\0' ||
+			 ! malloc_zero_fill( area ) ) //abort( "calloc/realloc/free corrupt storage3" );
+			printf( "C %zd %d %d %d %d\n", s, area[0] != '\0', area[s - 1] != '\0', area[malloc_size( area ) - 1] != '\0', ! malloc_zero_fill( area ) );
 
 		// Do not start this loop index at 0 because realloc of 0 bytes frees the storage.
 		for ( r; i ~ 256 * 1024 ~ 26 ) {				// start at initial memory request
 			area = (char *)realloc( area, r );			// attempt to reuse storage
-			if ( area == 0p ) abort( "calloc/realloc/free out of memory" );
+// 			if ( area == 0p ) abort( "calloc/realloc/free out of memory" );
 			if ( area[0] != '\0' || area[r - 1] != '\0' ||
-				 area[malloc_usable_size( area ) - 1] != '\0' ||
-				 ! malloc_zero_fill( area ) ) abort( "calloc/realloc/free corrupt storage2" );
+				 area[malloc_size( area ) - 1] != '\0' ||
+				 ! malloc_zero_fill( area ) ) abort( "calloc/realloc/free corrupt storage4" );
 		} // for
 		free( area );
@@ -266,5 +255,5 @@
 		// initial N byte allocation
 		char * area = (char *)memalign( a, amount );	// aligned N-byte allocation
-		if ( area == 0p ) abort( "memalign/realloc/free out of memory" ); // no storage ?
+// 		if ( area == 0p ) abort( "memalign/realloc/free out of memory" ); // no storage ?
 		//sout | alignments[a] | area;
 		if ( (size_t)area % a != 0 || malloc_alignment( area ) != a ) { // check for initial alignment
@@ -277,5 +266,4 @@
 			if ( area[0] != '\345' || area[s - 2] != '\345' ) abort( "memalign/realloc/free corrupt storage" );
 			area = (char *)realloc( area, s );			// attempt to reuse storage
-			if ( area == 0p ) abort( "memalign/realloc/free out of memory" ); // no storage ?
 			//sout | i | area;
 			if ( (size_t)area % a != 0 ) {				// check for initial alignment
@@ -293,5 +281,4 @@
 		for ( s; 1 ~ limit ) {							// allocation of size 0 can return null
 			char * area = (char *)cmemalign( a, 1, s );
-			if ( area == 0p ) abort( "cmemalign/free out of memory" );
 			//sout | i | area;
 			if ( (size_t)area % a != 0 || malloc_alignment( area ) != a ) { // check for initial alignment
@@ -299,5 +286,5 @@
 			} // if
 			if ( area[0] != '\0' || area[s - 1] != '\0' ||
-				 area[malloc_usable_size( area ) - 1] != '\0' ||
+				 area[malloc_size( area ) - 1] != '\0' ||
 				 ! malloc_zero_fill( area ) ) abort( "cmemalign/free corrupt storage" );
 			area[0] = '\345'; area[s - 1] = '\345';		// fill first/last byte
@@ -312,5 +299,4 @@
 		// initial N byte allocation
 		char * area = (char *)cmemalign( a, 1, amount ); // aligned N-byte allocation
-		if ( area == 0p ) abort( "cmemalign/realloc/free out of memory" ); // no storage ?
 		//sout | alignments[a] | area;
 		if ( (size_t)area % a != 0 || malloc_alignment( area ) != a ) { // check for initial alignment
@@ -318,5 +304,5 @@
 		} // if
 		if ( area[0] != '\0' || area[amount - 1] != '\0' ||
-			 area[malloc_usable_size( area ) - 1] != '\0' ||
+			 area[malloc_size( area ) - 1] != '\0' ||
 			 ! malloc_zero_fill( area ) ) abort( "cmemalign/realloc/free corrupt storage1" );
 		area[0] = '\345'; area[amount - 2] = '\345';	// fill first/penultimate byte
@@ -326,11 +312,10 @@
 			if ( area[0] != '\345' || area[s - 2] != '\345' ) abort( "cmemalign/realloc/free corrupt storage2" );
 			area = (char *)realloc( area, s );			// attempt to reuse storage
-			if ( area == 0p ) abort( "cmemalign/realloc/free out of memory" ); // no storage ?
 			//sout | i | area;
 			if ( (size_t)area % a != 0 || malloc_alignment( area ) != a ) { // check for initial alignment
 				abort( "cmemalign/realloc/free bad alignment %p", area );
 			} // if
-			if ( area[s - 1] != '\0' || area[s - 1] != '\0' ||
-				 area[malloc_usable_size( area ) - 1] != '\0' ||
+			if ( area[0] != '\345' || area[s - 1] != '\0' ||
+				 area[malloc_size( area ) - 1] != '\0' ||
 				 ! malloc_zero_fill( area ) ) abort( "cmemalign/realloc/free corrupt storage3" );
 			area[s - 1] = '\345';						// fill last byte
@@ -345,5 +330,4 @@
 		// initial N byte allocation
 		char * area = (char *)memalign( a, amount );	// aligned N-byte allocation
-		if ( area == 0p ) abort( "memalign/realloc with align/free out of memory" ); // no storage ?
 		//sout | alignments[a] | area | endl;
 		if ( (size_t)area % a != 0 || malloc_alignment( area ) != a ) { // check for initial alignment
@@ -356,5 +340,4 @@
 			if ( area[0] != '\345' || area[s - 2] != '\345' ) abort( "memalign/realloc/free corrupt storage" );
 			area = (char *)realloc( area, a * 2, s );	// attempt to reuse storage
-			if ( area == 0p ) abort( "memalign/realloc with align/free out of memory" ); // no storage ?
 			//sout | i | area | endl;
 			if ( (size_t)area % a * 2 != 0 ) {			// check for initial alignment
@@ -371,6 +354,5 @@
 	for ( size_t a = libAlign() + libAlign(); a <= limit; a += a ) { // generate powers of 2
 		// initial N byte allocation
-		char *area = (char *)cmemalign( a, 1, amount );	// aligned N-byte allocation
-		if ( area == 0p ) abort( "cmemalign/realloc with align/free out of memory" ); // no storage ?
+		char * area = (char *)cmemalign( a, 1, amount ); // aligned N-byte allocation
 		//sout | alignments[a] | area | endl;
 		if ( (size_t)area % a != 0 || malloc_alignment( area ) != a ) { // check for initial alignment
@@ -378,5 +360,5 @@
 		} // if
 		if ( area[0] != '\0' || area[amount - 1] != '\0' ||
-			 area[malloc_usable_size( area ) - 1] != '\0' ||
+			 area[malloc_size( area ) - 1] != '\0' ||
 			 ! malloc_zero_fill( area ) ) abort( "cmemalign/realloc with align/free corrupt storage1" );
 		area[0] = '\345'; area[amount - 2] = '\345';	// fill first/penultimate byte
@@ -386,11 +368,10 @@
 			if ( area[0] != '\345' || area[s - 2] != '\345' ) abort( "cmemalign/realloc with align/free corrupt storage2" );
 			area = (char *)realloc( area, a * 2, s );	// attempt to reuse storage
-			if ( area == 0p ) abort( "cmemalign/realloc with align/free out of memory" ); // no storage ?
 			//sout | i | area | endl;
 			if ( (size_t)area % a * 2 != 0 || malloc_alignment( area ) != a * 2 ) { // check for initial alignment
-				abort( "cmemalign/realloc with align/free bad alignment %p %jd %jd", area, malloc_alignment( area ), a * 2 );
+				abort( "cmemalign/realloc with align/free bad alignment %p %zd %zd", area, malloc_alignment( area ), a * 2 );
 			} // if
 			if ( area[s - 1] != '\0' || area[s - 1] != '\0' ||
-				 area[malloc_usable_size( area ) - 1] != '\0' ||
+				 area[malloc_size( area ) - 1] != '\0' ||
 				 ! malloc_zero_fill( area ) ) abort( "cmemalign/realloc/free corrupt storage3" );
 			area[s - 1] = '\345';						// fill last byte
Index: tests/init1.cfa
===================================================================
--- tests/init1.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
+++ tests/init1.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -0,0 +1,120 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2020 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// init1.cfa -- tests of initializing pointer- and reference-typed variables and returns
+//
+// Author           : Michael Brooks
+// Created On       : Thu Jul 16 22:00:00 2020
+// Last Modified By : Michael Brooks
+// Last Modified On : Thu Jul 16 22:00:00 2020
+// Update Count     : 1
+//
+
+void f() {
+
+    //
+    // setup
+    //
+
+    float x = 3.14;
+
+    float & rx = x;
+    float * px = &x;
+
+    const float & crx = x;
+    const float * cpx = &x;
+
+    //
+    // sound initializations
+    //
+
+    char * str1 = "hi";
+    const char * str2 = "hi";
+
+    float & rx2 = rx;
+    float * px2 = px;
+
+    const float & crx2 = crx;
+    const float * cpx2 = cpx;
+
+    //
+    // unsound initializations
+    //
+
+    // mismatched referenced type
+    int & ry = rx;
+    int * py = px;
+
+    // would discard the referent's constness (same float is referenced)
+    float & ry2 = crx;
+    float * py2 = cpx;
+}
+
+//
+// sound returns
+//
+
+char * f_str1() {
+    return "hi";
+}
+
+const char * f_str2() {
+    return "hi";
+}
+
+float & f_rx2 () {
+    float & rx = *0p;
+    return rx;
+}
+
+float * f_px2 () {
+    float * px = 0p;
+    return px;
+}
+
+const float & f_crx2 () {
+    float & rx = *0p;
+    return rx;
+}
+
+const float * f_cpx2 () {
+    float * px = 0p;
+    return px;
+}
+
+//
+// unsound returns
+//
+
+int & f_ry() { 
+    float & rx = *0p;
+    return rx;               // mismatched referenced type
+}
+
+int * f_py() {
+    float * px = 0p;
+    return px;               // mismatched referenced type
+}
+
+float & f_ry2() {
+    const float & crx = *0p;
+    return crx;              // would discard the referent's constness (same float is referenced)
+}
+
+float * f_py2() {
+    const float * cpx = 0p;
+    return cpx;              // would discard the referent's constness (same float is referenced)
+}
+
+forall (dtype T, dtype S)
+T & anycvt( S & s ) {
+    return s;               // mismatched referenced type
+}
+
+forall (dtype T, dtype S)
+T * anycvt( S * s ) {
+    return s;               // mismatched referenced type
+}
Index: tests/io2.cfa
===================================================================
--- tests/io2.cfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ tests/io2.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -121,5 +121,5 @@
 
 	[int, int, const char *, double] t3 = { 3, 4, "a", 7.2 };
-	sout | [ 3, 4, "a", 7.2 ];
+	sout | [ 3, 4, (const char*)"a", 7.2 ];             // workaround trac#207: the const cast should not be needed
 	sout | t3;
 	sepSetTuple( sout, " " );
Index: tests/searchsort.cfa
===================================================================
--- tests/searchsort.cfa	(revision 3f850d79f0889d7d287cec10f41f1700e66928c5)
+++ tests/searchsort.cfa	(revision 53ee27e46dd5864a8ace801fe844ed654a140169)
@@ -38,7 +38,7 @@
 	} // for
 	sout | nl;
-	for ( i; 0 ~ size ) {		// C version
+	for ( i; 0 ~ size ) {		// C version, returns void*
 		int key = size - i;
-		int * v = bsearch( &key, iarr, size, sizeof( iarr[0] ), comp );
+		int * v = ( int * ) bsearch( &key, iarr, size, sizeof( iarr[0] ), comp );
 		sout | key | ':' | *v | ", ";
 	} // for
