Index: Jenkinsfile
===================================================================
--- Jenkinsfile	(revision 31a6f38ca7d2e1a23150072813a3cdf4f57f44f6)
+++ Jenkinsfile	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -102,9 +102,4 @@
 
 		echo GitLogMessage()
-
-		// This is a complete hack but it solves problems with automake thinking it needs to regenerate makefiles
-		// We fudged automake/missing to handle that but automake stills bakes prints inside the makefiles
-		// and these cause more problems.
-		sh 'find . -name Makefile.in -exec touch {} +'
 	}
 }
@@ -465,5 +460,5 @@
 					description: 'Which compiler to use',					\
 					name: 'Compiler',									\
-					choices: 'gcc-9\ngcc-8\ngcc-7\ngcc-6\ngcc-5\ngcc-4.9\nclang',					\
+					choices: 'gcc-9\ngcc-8\ngcc-7\ngcc-6\ngcc-5\ngcc-4.9\nclang',	\
 					defaultValue: 'gcc-8',								\
 				],												\
Index: libcfa/configure.ac
===================================================================
--- libcfa/configure.ac	(revision 31a6f38ca7d2e1a23150072813a3cdf4f57f44f6)
+++ libcfa/configure.ac	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -166,4 +166,5 @@
 AH_TEMPLATE([CFA_HAVE_IORING_OP_PROVIDE_BUFFERS],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_PROVIDE_BUFFERS.])
 AH_TEMPLATE([CFA_HAVE_IORING_OP_REMOVE_BUFFER],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_REMOVE_BUFFER.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_TEE],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_TEE.])
 AH_TEMPLATE([CFA_HAVE_IOSQE_FIXED_FILE],[Defined if io_uring support is present when compiling libcfathread and supports the flag FIXED_FILE.])
 AH_TEMPLATE([CFA_HAVE_IOSQE_IO_DRAIN],[Defined if io_uring support is present when compiling libcfathread and supports the flag IO_DRAIN.])
@@ -173,9 +174,12 @@
 AH_TEMPLATE([CFA_HAVE_SPLICE_F_FD_IN_FIXED],[Defined if io_uring support is present when compiling libcfathread and supports the flag SPLICE_F_FD_IN_FIXED.])
 AH_TEMPLATE([CFA_HAVE_IORING_SETUP_ATTACH_WQ],[Defined if io_uring support is present when compiling libcfathread and supports the flag IORING_SETUP_ATTACH_WQ.])
-AH_TEMPLATE([HAVE_PREADV2],[Defined if preadv2 support is present when compiling libcfathread.])
-AH_TEMPLATE([HAVE_PWRITEV2],[Defined if pwritev2 support is present when compiling libcfathread.])
+AH_TEMPLATE([CFA_HAVE_PREADV2],[Defined if preadv2 support is present when compiling libcfathread.])
+AH_TEMPLATE([CFA_HAVE_PWRITEV2],[Defined if pwritev2 support is present when compiling libcfathread.])
+AH_TEMPLATE([CFA_HAVE_PWRITEV2],[Defined if pwritev2 support is present when compiling libcfathread.])
+AH_TEMPLATE([CFA_HAVE_STATX],[Defined if statx support is present when compiling libcfathread.])
+AH_TEMPLATE([CFA_HAVE_OPENAT2],[Defined if openat2 support is present when compiling libcfathread.])
 AH_TEMPLATE([__CFA_NO_STATISTICS__],[Defined if libcfathread was compiled without support for statistics.])
 
-define(ioring_ops, [IORING_OP_NOP,IORING_OP_READV,IORING_OP_WRITEV,IORING_OP_FSYNC,IORING_OP_READ_FIXED,IORING_OP_WRITE_FIXED,IORING_OP_POLL_ADD,IORING_OP_POLL_REMOVE,IORING_OP_SYNC_FILE_RANGE,IORING_OP_SENDMSG,IORING_OP_RECVMSG,IORING_OP_TIMEOUT,IORING_OP_TIMEOUT_REMOVE,IORING_OP_ACCEPT,IORING_OP_ASYNC_CANCEL,IORING_OP_LINK_TIMEOUT,IORING_OP_CONNECT,IORING_OP_FALLOCATE,IORING_OP_OPENAT,IORING_OP_CLOSE,IORING_OP_FILES_UPDATE,IORING_OP_STATX,IORING_OP_READ,IORING_OP_WRITE,IORING_OP_FADVISE,IORING_OP_MADVISE,IORING_OP_SEND,IORING_OP_RECV,IORING_OP_OPENAT2,IORING_OP_EPOLL_CTL,IORING_OP_SPLICE,IORING_OP_PROVIDE_BUFFERS,IORING_OP_REMOVE_BUFFER])
+define(ioring_ops, [IORING_OP_NOP,IORING_OP_READV,IORING_OP_WRITEV,IORING_OP_FSYNC,IORING_OP_READ_FIXED,IORING_OP_WRITE_FIXED,IORING_OP_POLL_ADD,IORING_OP_POLL_REMOVE,IORING_OP_SYNC_FILE_RANGE,IORING_OP_SENDMSG,IORING_OP_RECVMSG,IORING_OP_TIMEOUT,IORING_OP_TIMEOUT_REMOVE,IORING_OP_ACCEPT,IORING_OP_ASYNC_CANCEL,IORING_OP_LINK_TIMEOUT,IORING_OP_CONNECT,IORING_OP_FALLOCATE,IORING_OP_OPENAT,IORING_OP_CLOSE,IORING_OP_FILES_UPDATE,IORING_OP_STATX,IORING_OP_READ,IORING_OP_WRITE,IORING_OP_FADVISE,IORING_OP_MADVISE,IORING_OP_SEND,IORING_OP_RECV,IORING_OP_OPENAT2,IORING_OP_EPOLL_CTL,IORING_OP_SPLICE,IORING_OP_PROVIDE_BUFFERS,IORING_OP_REMOVE_BUFFER,IORING_OP_TEE])
 define(ioring_flags, [IOSQE_FIXED_FILE,IOSQE_IO_DRAIN,IOSQE_ASYNC,IOSQE_IO_LINK,IOSQE_IO_HARDLINK,SPLICE_F_FD_IN_FIXED,IORING_SETUP_ATTACH_WQ])
 
@@ -222,5 +226,6 @@
 	])
 ])
-AC_CHECK_FUNCS([preadv2 pwritev2])
+AC_CHECK_FUNC([preadv2], [AC_DEFINE([CFA_HAVE_PREADV2])])
+AC_CHECK_FUNC([pwritev2], [AC_DEFINE([CFA_HAVE_PWRITEV2])])
 
 AC_CONFIG_FILES([
@@ -229,4 +234,5 @@
 	prelude/Makefile
 	])
+AC_CONFIG_FILES([src/concurrency/io/call.cfa], [python3 ${srcdir}/src/concurrency/io/call.cfa.in > src/concurrency/io/call.cfa])
 
 AC_CONFIG_HEADERS(prelude/defines.hfa)
Index: libcfa/prelude/defines.hfa.in
===================================================================
--- libcfa/prelude/defines.hfa.in	(revision 31a6f38ca7d2e1a23150072813a3cdf4f57f44f6)
+++ libcfa/prelude/defines.hfa.in	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -117,4 +117,8 @@
 
 /* Defined if io_uring support is present when compiling libcfathread and
+   supports the operation IORING_OP_TEE. */
+#undef CFA_HAVE_IORING_OP_TEE
+
+/* Defined if io_uring support is present when compiling libcfathread and
    supports the operation IORING_OP_TIMEOUT. */
 #undef CFA_HAVE_IORING_OP_TIMEOUT
@@ -163,8 +167,20 @@
 #undef CFA_HAVE_LINUX_IO_URING_H
 
+/* Defined if openat2 support is present when compiling libcfathread. */
+#undef CFA_HAVE_OPENAT2
+
+/* Defined if preadv2 support is present when compiling libcfathread. */
+#undef CFA_HAVE_PREADV2
+
+/* Defined if pwritev2 support is present when compiling libcfathread. */
+#undef CFA_HAVE_PWRITEV2
+
 /* Defined if io_uring support is present when compiling libcfathread and
    supports the flag SPLICE_F_FD_IN_FIXED. */
 #undef CFA_HAVE_SPLICE_F_FD_IN_FIXED
 
+/* Defined if statx support is present when compiling libcfathread. */
+#undef CFA_HAVE_STATX
+
 /* Location of include files. */
 #undef CFA_INCDIR
@@ -188,10 +204,4 @@
 #undef HAVE_MEMORY_H
 
-/* Define to 1 if you have the `preadv2' function. */
-#undef HAVE_PREADV2
-
-/* Define to 1 if you have the `pwritev2' function. */
-#undef HAVE_PWRITEV2
-
 /* Define to 1 if you have the <stdint.h> header file. */
 #undef HAVE_STDINT_H
Index: libcfa/src/Makefile.am
===================================================================
--- libcfa/src/Makefile.am	(revision 31a6f38ca7d2e1a23150072813a3cdf4f57f44f6)
+++ libcfa/src/Makefile.am	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -62,4 +62,5 @@
 	iterator.hfa \
 	limits.hfa \
+	memory.hfa \
 	parseargs.hfa \
 	rational.hfa \
@@ -107,5 +108,5 @@
 	concurrency/io/setup.cfa \
 	concurrency/io/types.hfa \
-	concurrency/iocall.cfa \
+	concurrency/io/call.cfa \
 	concurrency/iofwd.hfa \
 	concurrency/kernel_private.hfa \
Index: libcfa/src/bits/locks.hfa
===================================================================
--- libcfa/src/bits/locks.hfa	(revision 31a6f38ca7d2e1a23150072813a3cdf4f57f44f6)
+++ libcfa/src/bits/locks.hfa	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -357,5 +357,12 @@
 				struct oneshot * expected = this.ptr;
 				// was this abandoned?
-				if( expected == 3p ) { free( &this ); return false; }
+				#if defined(__GNUC__) && __GNUC__ >= 7
+					#pragma GCC diagnostic push
+					#pragma GCC diagnostic ignored "-Wfree-nonheap-object"
+				#endif
+					if( expected == 3p ) { free( &this ); return false; }
+				#if defined(__GNUC__) && __GNUC__ >= 7
+					#pragma GCC diagnostic pop
+				#endif
 
 				/* paranoid */ verify( expected != 1p ); // Future is already fulfilled, should not happen
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision 31a6f38ca7d2e1a23150072813a3cdf4f57f44f6)
+++ libcfa/src/concurrency/io.cfa	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -159,9 +159,8 @@
 
 	static inline void process(struct io_uring_cqe & cqe ) {
-		struct __io_user_data_t * data = (struct __io_user_data_t *)(uintptr_t)cqe.user_data;
-		__cfadbg_print_safe( io, "Kernel I/O : Syscall completed : cqe %p, result %d for %p\n", data, cqe.res, data->thrd );
-
-		data->result = cqe.res;
-		post( data->sem );
+		struct io_future_t * future = (struct io_future_t *)(uintptr_t)cqe.user_data; // user_data is expected to carry the io_future_t address given at submission
+		__cfadbg_print_safe( io, "Kernel I/O : Syscall completed : cqe %p, result %d for %p\n", &cqe, cqe.res, future ); // 'data' no longer exists here; print the cqe and its future
+
+		fulfil( *future, cqe.res ); // store the result and wake whoever waits on the future
 	}
 
Index: libcfa/src/concurrency/io/call.cfa.in
===================================================================
--- libcfa/src/concurrency/io/call.cfa.in	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
+++ libcfa/src/concurrency/io/call.cfa.in	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -0,0 +1,506 @@
+#!/usr/bin/env python3
+#
+# Cforall Version 1.0.0 Copyright (C) 2020 University of Waterloo
+#
+# The contents of this file are covered under the licence agreement in the
+# file "LICENCE" distributed with Cforall.
+#
+# call.cfa.in -- Python script to generate io/call.cfa
+#
+# Author           : Thierry Delisle
+# Created On       : Fri Sep 11 12:41:16 2020
+# Last Modified By :
+# Last Modified On :
+# Update Count     :
+#
+
+Header = """//
+// Cforall Version 1.0.0 Copyright (C) 2020 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// call.cfa -- Api for cforall
+//
+// Author           : Generated from call.cfa.in
+// Created On       : {}
+//
+
+"""
+
+Prelude = """#define __cforall_thread__
+
+#include "bits/defs.hfa"
+#include "kernel.hfa"
+#include "io/types.hfa"
+
+//=============================================================================================
+// I/O uring backend
+//=============================================================================================
+
+#if defined(CFA_HAVE_LINUX_IO_URING_H)
+	#include <assert.h>
+	#include <stdint.h>
+	#include <errno.h>
+	#include <linux/io_uring.h>
+
+	#include "kernel/fwd.hfa"
+
+	#if defined(CFA_HAVE_IOSQE_FIXED_FILE) && defined(CFA_HAVE_IOSQE_IO_DRAIN) && defined(CFA_HAVE_IOSQE_ASYNC)
+		#define REGULAR_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_DRAIN | IOSQE_ASYNC)
+	#elif defined(CFA_HAVE_IOSQE_FIXED_FILE) && defined(CFA_HAVE_IOSQE_ASYNC)
+		#define REGULAR_FLAGS (IOSQE_FIXED_FILE | IOSQE_ASYNC)
+	#elif defined(CFA_HAVE_IOSQE_FIXED_FILE) && defined(CFA_HAVE_IOSQE_IO_DRAIN)
+		#define REGULAR_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_DRAIN)
+	#elif defined(CFA_HAVE_IOSQE_IO_DRAIN) && defined(CFA_HAVE_IOSQE_ASYNC)
+		#define REGULAR_FLAGS (IOSQE_IO_DRAIN | IOSQE_ASYNC)
+	#elif defined(CFA_HAVE_IOSQE_FIXED_FILE)
+		#define REGULAR_FLAGS (IOSQE_FIXED_FILE)
+	#elif defined(CFA_HAVE_IOSQE_IO_DRAIN)
+		#define REGULAR_FLAGS (IOSQE_IO_DRAIN)
+	#elif defined(CFA_HAVE_IOSQE_ASYNC)
+		#define REGULAR_FLAGS (IOSQE_ASYNC)
+	#else
+		#define REGULAR_FLAGS (0)
+	#endif
+
+	#if defined(CFA_HAVE_IOSQE_IO_LINK) && defined(CFA_HAVE_IOSQE_IO_HARDLINK)
+		#define LINK_FLAGS (IOSQE_IO_LINK | IOSQE_IO_HARDLINK)
+	#elif defined(CFA_HAVE_IOSQE_IO_LINK)
+		#define LINK_FLAGS (IOSQE_IO_LINK)
+	#elif defined(CFA_HAVE_IOSQE_IO_HARDLINK)
+		#define LINK_FLAGS (IOSQE_IO_HARDLINK)
+	#else
+		#define LINK_FLAGS (0)
+	#endif
+
+	#if defined(CFA_HAVE_SPLICE_F_FD_IN_FIXED)
+		#define SPLICE_FLAGS (SPLICE_F_FD_IN_FIXED)
+	#else
+		#define SPLICE_FLAGS (0)
+	#endif
+
+	extern [* struct io_uring_sqe, __u32] __submit_alloc( struct __io_data & ring, __u64 data );
+	extern void __submit( struct io_context * ctx, __u32 idx ) __attribute__((nonnull (1)));
+
+	static inline io_context * __get_io_context( void ) {
+		cluster * cltr = active_cluster();
+
+		/* paranoid */ verifyf( cltr, "No active cluster for io operation\\n");
+		assertf( cltr->io.cnt > 0, "Cluster %p has no default io contexts and no context was specified\\n", cltr );
+
+		/* paranoid */ verifyf( cltr->io.ctxs, "default io contexts for cluster %p are missing\\n", cltr);
+		return &cltr->io.ctxs[ __tls_rand() % cltr->io.cnt ];
+	}
+#endif
+
+//=============================================================================================
+// I/O Forwards
+//=============================================================================================
+#include <time.hfa>
+
+// Some forward declarations
+#include <errno.h>
+#include <unistd.h>
+
+extern "C" {
+	#include <sys/types.h>
+	#include <sys/socket.h>
+	#include <sys/syscall.h>
+
+#if defined(CFA_HAVE_PREADV2)
+	struct iovec;
+	extern ssize_t preadv2 (int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
+#endif
+#if defined(CFA_HAVE_PWRITEV2)
+	struct iovec;
+	extern ssize_t pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
+#endif
+
+	extern int fsync(int fd);
+
+	#if __OFF_T_MATCHES_OFF64_T
+		typedef __off64_t off_t;
+	#else
+		typedef __off_t off_t;
+	#endif
+	typedef __off64_t off64_t;
+	extern int sync_file_range(int fd, off64_t offset, off64_t nbytes, unsigned int flags);
+
+	struct msghdr;
+	struct sockaddr;
+	extern ssize_t sendmsg(int sockfd, const struct msghdr *msg, int flags);
+	extern ssize_t recvmsg(int sockfd, struct msghdr *msg, int flags);
+	extern ssize_t send(int sockfd, const void *buf, size_t len, int flags);
+	extern ssize_t recv(int sockfd, void *buf, size_t len, int flags);
+	extern int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
+	extern int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
+
+	extern int fallocate(int fd, int mode, off_t offset, off_t len);
+	extern int posix_fadvise(int fd, off_t offset, off_t len, int advice);
+	extern int madvise(void *addr, size_t length, int advice);
+
+	extern int openat(int dirfd, const char *pathname, int flags, mode_t mode);
+	extern int close(int fd);
+
+	extern ssize_t read (int fd, void *buf, size_t count);
+
+	struct epoll_event;
+	extern int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
+
+	extern ssize_t splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags);
+	extern ssize_t tee(int fd_in, int fd_out, size_t len, unsigned int flags);
+}
+
+//=============================================================================================
+// I/O Interface
+//=============================================================================================
+"""
+
+print(Header.format("A Date"))
+print(Prelude)
+
+import re
+import sys
+class Call:  # One wrapped operation: io_uring opcode suffix, C prototype and sqe member assignments.
+	def __init__(self, op, signature, body, define=None):
+		sig = re.search(r"(.*) (.*)\((.*)\)", signature)  # groups: return type, function name, parameter list
+		if not sig:
+			print("OP '{}' has invalid signature {}".format(op, signature), file=sys.stderr)
+			sys.exit(1)
+
+		self.op     = op            # suffix appended to IORING_OP_ / CFA_HAVE_IORING_OP_ by the templates
+		self.ret    = sig.group(1)
+		self.name   = sig.group(2)
+		self.params = sig.group(3)
+		self.define = define        # optional macro; when set the emitted code is wrapped in #if defined(...)
+		self.body = ""              # accumulated "sqe->key = value;" statements
+		# sqe members a body entry is allowed to assign; any other key aborts generation
+		accepted_keys = [ 'ioprio', 'fd', 'off', 'addr2', 'addr', 'splice_off_in', 'len',
+			'rw_flags', 'fsync_flags', 'poll_events', 'poll32_events',
+			'sync_range_flags', 'msg_flags', 'timeout_flags', 'accept_flags',
+			'cancel_flags', 'open_flags', 'statx_flags', 'fadvise_advice',
+			'splice_flags', 'buf_index', 'buf_group', 'personality',
+			'splice_fd_in' ]
+
+		for k, v in body.items():
+			if k not in accepted_keys:
+				print("OP '{}' has invalid body key {}".format(op, k), file=sys.stderr)
+				sys.exit(1)
+
+			self.body += "\n		sqe->{key} = {value};".format(key=k, value=v)
+
+	# Return only the parameter names of the prototype, comma separated, for forwarding the call.
+	def args(self):
+		param_a = self.params.split(',')
+		args_a = [p.replace('*', ' ').split()[-1] for p in param_a]  # last word of each parameter is its name
+		for a in args_a:
+			if '*' in a:  # defensive only: every '*' was already replaced by a space above
+				print("OP '{}' has invalid * in argument {}".format(self.op, a), file=sys.stderr)
+				sys.exit(1)
+
+		return ', '.join(args_a)
+
+AsyncTemplate = """inline void async_{name}(io_future_t & future, {params}, int submit_flags, io_cancellation * cancellation, io_context * context) {{
+	#if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_{op})
+		ssize_t res = {name}({args});
+		if (res >= 0) {{
+			fulfil(future, res);
+		}}
+		else {{
+			fulfil(future, -errno);
+		}}
+	#else
+		// we don't support LINK yet: this function returns void, so report the error through the future
+		if( 0 != (submit_flags & LINK_FLAGS) ) {{
+			fulfil(future, -ENOTSUP); return;
+		}}
+
+		if( !context ) {{
+			context = __get_io_context();
+		}}
+		if(cancellation) {{
+			cancellation->target = (__u64)(uintptr_t)&future;
+		}}
+
+		__u8 sflags = REGULAR_FLAGS & submit_flags;
+		struct __io_data & ring = *context->thrd.ring;
+
+		__u32 idx;
+		struct io_uring_sqe * sqe;
+		[sqe, idx] = __submit_alloc( ring, (__u64)(uintptr_t)&future );
+
+		sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
+		sqe->opcode = IORING_OP_{op};
+		sqe->flags = sflags;{body}
+
+		verify( sqe->user_data == (__u64)(uintptr_t)&future );
+		__submit( context, idx );
+	#endif
+}}"""
+
+SyncTemplate = """{ret} cfa_{name}({params}, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {{
+	if( timeout >= 0 ) {{
+		errno = ENOTSUP;
+		return -1;
+	}}
+	io_future_t future;
+
+	async_{name}( future, {args}, submit_flags, cancellation, context );
+
+	wait( future );
+	if( future.result < 0 ) {{
+		errno = -future.result;
+		return -1;
+	}}
+	return future.result;
+}}"""
+
+calls = [  # each entry: Call(opcode suffix, C prototype, {sqe member: C expression}, define=optional guard macro)
+	# CFA_HAVE_IORING_OP_READV
+	Call('READV', 'ssize_t preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags)', {
+		'fd'  : 'fd',
+		'off' : 'offset',
+		'addr': '(__u64)iov',
+		'len' : 'iovcnt',
+	}, define = 'CFA_HAVE_PREADV2'),
+	# CFA_HAVE_IORING_OP_WRITEV
+	Call('WRITEV', 'ssize_t pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags)', {
+		'fd'  : 'fd',
+		'off' : 'offset',
+		'addr': '(__u64)iov',
+		'len' : 'iovcnt'
+	}, define = 'CFA_HAVE_PWRITEV2'),
+	# CFA_HAVE_IORING_OP_FSYNC
+	Call('FSYNC', 'int fsync(int fd)', {
+		'fd': 'fd'
+	}),
+	# CFA_HAVE_IORING_OP_EPOLL_CTL (kernel reads the event pointer from sqe->addr and the target fd from sqe->off)
+	Call('EPOLL_CTL', 'int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event)', {
+		'fd': 'epfd',
+		'addr': '(__u64)event',
+		'len': 'op',
+		'off': 'fd'
+	}),
+	# CFA_HAVE_IORING_OP_SYNC_FILE_RANGE
+	Call('SYNC_FILE_RANGE', 'int sync_file_range(int fd, off64_t offset, off64_t nbytes, unsigned int flags)', {
+		'fd': 'fd',
+		'off': 'offset',
+		'len': 'nbytes',
+		'sync_range_flags': 'flags'
+	}),
+	# CFA_HAVE_IORING_OP_SENDMSG
+	Call('SENDMSG', 'ssize_t sendmsg(int sockfd, const struct msghdr *msg, int flags)', {
+		'fd': 'sockfd',
+		'addr': '(__u64)(struct msghdr *)msg',
+		'len': '1',
+		'msg_flags': 'flags'
+	}),
+	# CFA_HAVE_IORING_OP_RECVMSG
+	Call('RECVMSG', 'ssize_t recvmsg(int sockfd, struct msghdr *msg, int flags)', {
+		'fd': 'sockfd',
+		'addr': '(__u64)(struct msghdr *)msg',
+		'len': '1',
+		'msg_flags': 'flags'
+	}),
+	# CFA_HAVE_IORING_OP_SEND
+	Call('SEND', 'ssize_t send(int sockfd, const void *buf, size_t len, int flags)', {
+		'fd': 'sockfd',
+		'addr': 'buf',
+		'len': 'len',
+		'msg_flags': 'flags'
+	}),
+	# CFA_HAVE_IORING_OP_RECV
+	Call('RECV', 'ssize_t recv(int sockfd, void *buf, size_t len, int flags)', {
+		'fd': 'sockfd',
+		'addr': 'buf',
+		'len': 'len',
+		'msg_flags': 'flags'
+	}),
+	# CFA_HAVE_IORING_OP_ACCEPT (the opcode is ACCEPT; accept4 is only the name of the wrapped function)
+	Call('ACCEPT', 'int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags)', {
+		'fd': 'sockfd',
+		'addr': 'addr',
+		'addr2': 'addrlen',
+		'accept_flags': 'flags'
+	}),
+	# CFA_HAVE_IORING_OP_CONNECT
+	Call('CONNECT', 'int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen)', {
+		'fd': 'sockfd',
+		'addr': 'addr',
+		'off': 'addrlen'
+	}),
+	# CFA_HAVE_IORING_OP_FALLOCATE
+	Call('FALLOCATE', 'int fallocate(int fd, int mode, off_t offset, off_t len)', {
+		'fd': 'fd',
+		'addr': '(__u64)len',
+		'len': 'mode',
+		'off': 'offset'
+	}),
+	# CFA_HAVE_IORING_OP_FADVISE
+	Call('FADVISE', 'int posix_fadvise(int fd, off_t offset, off_t len, int advice)', {
+		'fd': 'fd',
+		'off': 'offset',
+		'len': 'len',
+		'fadvise_advice': 'advice'
+	}),
+	# CFA_HAVE_IORING_OP_MADVISE
+	Call('MADVISE', 'int madvise(void *addr, size_t length, int advice)', {
+		'addr': 'addr',
+		'len': 'length',
+		'fadvise_advice': 'advice'
+	}),
+	# CFA_HAVE_IORING_OP_OPENAT (openat takes no file offset, so sqe->off is simply zeroed)
+	Call('OPENAT', 'int openat(int dirfd, const char *pathname, int flags, mode_t mode)', {
+		'fd': 'dirfd',
+		'off': '0',
+		'addr': '(__u64)pathname',
+		'len': 'mode',
+		'open_flags': 'flags'
+	}),
+	# CFA_HAVE_IORING_OP_OPENAT2
+	Call('OPENAT2', 'int openat2(int dirfd, const char *pathname, struct open_how * how, size_t size)', {
+		'fd': 'dirfd',
+		'addr': 'pathname',
+		'len': 'sizeof(*how)',
+		'off': '(__u64)how',
+	}, define = 'CFA_HAVE_OPENAT2'),
+	# CFA_HAVE_IORING_OP_CLOSE
+	Call('CLOSE', 'int close(int fd)', {
+		'fd': 'fd'
+	}),
+	# CFA_HAVE_IORING_OP_STATX
+	Call('STATX', 'int statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf)', {
+		'fd': 'dirfd',
+		'off': '(__u64)statxbuf',
+		'addr': 'pathname',
+		'len': 'mask',
+		'statx_flags': 'flags'
+	}, define = 'CFA_HAVE_STATX'),
+	# CFA_HAVE_IORING_OP_READ
+	Call('READ', 'ssize_t read(int fd, void * buf, size_t count)', {
+		'fd': 'fd',
+		'addr': 'buf',
+		'len': 'count'
+	}),
+	# CFA_HAVE_IORING_OP_WRITE
+	Call('WRITE', 'ssize_t write(int fd, void * buf, size_t count)', {
+		'fd': 'fd',
+		'addr': 'buf',
+		'len': 'count'
+	}),
+	# CFA_HAVE_IORING_OP_SPLICE
+	Call('SPLICE', 'ssize_t splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags)', {
+		'splice_fd_in': 'fd_in',
+		'splice_off_in': 'off_in ? *off_in : -1',
+		'fd': 'fd_out',
+		'off': 'off_out ? *off_out : -1',
+		'len': 'len',
+		'splice_flags': 'flags'
+	}),
+	# CFA_HAVE_IORING_OP_TEE
+	Call('TEE', 'ssize_t tee(int fd_in, int fd_out, size_t len, unsigned int flags)', {
+		'splice_fd_in': 'fd_in',
+		'fd': 'fd_out',
+		'len': 'len',
+		'splice_flags': 'flags'
+	})
+]
+
+print("//----------")
+print("// synchronous calls")
+for c in calls:  # emit a prototype for every cfa_<name> wrapper
+	if c.define:  # optional call: prototype is wrapped in #if defined(<define>)
+		print("""#if defined({define})
+	{ret} cfa_{name}({params}, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+#endif""".format(define=c.define,ret=c.ret, name=c.name, params=c.params))
+	else:
+		print("{ret} cfa_{name}({params}, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);"
+		.format(ret=c.ret, name=c.name, params=c.params))
+
+print("\n//----------")
+print("// asynchronous calls")
+for c in calls:  # same pass for async_<name>: returns void and takes the io_future_t first
+	if c.define:  # optional call: prototype is wrapped in #if defined(<define>)
+		print("""#if defined({define})
+	void async_{name}(io_future_t & future, {params}, int submit_flags, io_cancellation * cancellation, io_context * context);
+#endif""".format(define=c.define,name=c.name, params=c.params))
+	else:
+		print("void async_{name}(io_future_t & future, {params}, int submit_flags, io_cancellation * cancellation, io_context * context);"
+		.format(name=c.name, params=c.params))
+print("\n")
+
+for c in calls:  # emit the async/sync definition pair for every call
+	print("//-----------------------------------------------------------------------------")
+	print("// {}".format(c.name))
+	Async = AsyncTemplate.format(  # 'ret' is passed but AsyncTemplate has no {ret} field; str.format ignores it
+		name   = c.name,
+		ret    = c.ret,
+		params = c.params,
+		args   = c.args(),
+		op     = c.op,
+		body   = c.body
+
+	)
+	Sync = SyncTemplate.format(  # the synchronous wrapper forwards to async_<name> and waits on the future
+		name   = c.name,
+		ret    = c.ret,
+		params = c.params,
+		args   = c.args()
+	)
+
+	if c.define:  # guarded variant: both bodies are re-indented by one tab inside the #if block
+		print("""#if defined({})
+	//----------
+	// asynchronous call
+	{}
+
+	//----------
+	// synchronous call
+	{}
+#endif
+""".format(c.define, "\n\t".join( Async.splitlines() ), "\n\t".join( Sync.splitlines() )))
+	else :
+		print("""//----------
+// asynchronous call
+{}
+
+//----------
+// synchronous call
+{}
+""".format(Async, Sync))
+
+print("""
+//-----------------------------------------------------------------------------
+// Check if a function is has asynchronous
+bool has_user_level_blocking( fptr_t func ) {
+ 	#if defined(CFA_HAVE_LINUX_IO_URING_H)""")
+
+for c in calls:
+	if c.define:
+		print("""		#if defined({define})
+ 			if( /*func == (fptr_t)preadv2 || */
+ 				func == (fptr_t)cfa_{name} ||
+				func == (fptr_t)async_{name} ) {{
+ 				#if defined(CFA_HAVE_IORING_OP_{op})
+					return true;
+				#else
+					return false;
+				#endif
+ 			}}
+ 		#endif""".format(define=c.define, name=c.name, op=c.op))
+	else:
+		print("""		if( /*func == (fptr_t)preadv2 || */
+			func == (fptr_t)cfa_{name} ||
+			func == (fptr_t)async_{name} ) {{
+			#if defined(CFA_HAVE_IORING_OP_{op})
+				return true;
+			#else
+				return false;
+			#endif
+		}}""".format(name=c.name, op=c.op))
+
+print(""" 	#endif
+
+ 	return false;
+}""")
Index: libcfa/src/concurrency/io/types.hfa
===================================================================
--- libcfa/src/concurrency/io/types.hfa	(revision 31a6f38ca7d2e1a23150072813a3cdf4f57f44f6)
+++ libcfa/src/concurrency/io/types.hfa	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -16,11 +16,11 @@
 #pragma once
 
+extern "C" {
+	#include <linux/types.h>
+}
+
+#include "bits/locks.hfa"
+
 #if defined(CFA_HAVE_LINUX_IO_URING_H)
-	extern "C" {
-		#include <linux/types.h>
-	}
-
-      #include "bits/locks.hfa"
-
 	#define LEADER_LOCK
 	struct __leaderlock_t {
@@ -101,12 +101,4 @@
 	};
 
-
-	//-----------------------------------------------------------------------
-	// IO user data
-	struct __io_user_data_t {
-		__s32 result;
-		oneshot sem;
-	};
-
 	//-----------------------------------------------------------------------
 	// Misc
@@ -143,2 +135,21 @@
 	void __ioctx_prepare_block($io_ctx_thread & ctx, struct epoll_event & ev);
 #endif
+
+//-----------------------------------------------------------------------
+// IO user data
+struct io_future_t {
+	future_t self;
+	__s32 result;
+};
+
+static inline {
+	bool fulfil( io_future_t & this, __s32 result ) {
+		this.result = result;
+		return fulfil(this.self);
+	}
+
+	// Wait for the future to be fulfilled
+	bool wait( io_future_t & this ) {
+		return wait(this.self);
+	}
+}
Index: libcfa/src/concurrency/iofwd.hfa
===================================================================
--- libcfa/src/concurrency/iofwd.hfa	(revision 31a6f38ca7d2e1a23150072813a3cdf4f57f44f6)
+++ libcfa/src/concurrency/iofwd.hfa	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -40,4 +40,5 @@
 
 struct cluster;
+struct io_future_t;
 struct io_context;
 struct io_cancellation;
@@ -48,24 +49,70 @@
 struct statx;
 
-extern ssize_t cfa_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
-extern ssize_t cfa_pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
-extern int cfa_fsync(int fd, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
-extern int cfa_sync_file_range(int fd, int64_t offset, int64_t nbytes, unsigned int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
-extern ssize_t cfa_sendmsg(int sockfd, const struct msghdr *msg, int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
-extern ssize_t cfa_recvmsg(int sockfd, struct msghdr *msg, int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
-extern ssize_t cfa_send(int sockfd, const void *buf, size_t len, int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
-extern ssize_t cfa_recv(int sockfd, void *buf, size_t len, int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
-extern int cfa_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
-extern int cfa_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
-extern int cfa_fallocate(int fd, int mode, uint64_t offset, uint64_t len, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
-extern int cfa_fadvise(int fd, uint64_t offset, uint64_t len, int advice, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
-extern int cfa_madvise(void *addr, size_t length, int advice, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
-extern int cfa_openat(int dirfd, const char *pathname, int flags, mode_t mode, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
-extern int cfa_close(int fd, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
-extern int cfa_statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
-extern ssize_t cfa_read(int fd, void *buf, size_t count, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
-extern ssize_t cfa_write(int fd, void *buf, size_t count, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
-extern ssize_t cfa_splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
-extern ssize_t cfa_tee(int fd_in, int fd_out, size_t len, unsigned int flags, int submit_flags = 0, Duration timeout = -1`s, io_cancellation * cancellation = 0p, io_context * context = 0p);
+//----------
+// synchronous calls
+#if defined(CFA_HAVE_PREADV2)
+	extern ssize_t cfa_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+#endif
+#if defined(CFA_HAVE_PWRITEV2)
+	extern ssize_t cfa_pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+#endif
+extern int cfa_fsync(int fd, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+extern int cfa_epoll_ctl(int epfd, int op, int fd, struct epoll_event *event, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+extern int cfa_sync_file_range(int fd, off64_t offset, off64_t nbytes, unsigned int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+extern  ssize_t cfa_sendmsg(int sockfd, const struct msghdr *msg, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+extern ssize_t cfa_recvmsg(int sockfd, struct msghdr *msg, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+extern ssize_t cfa_send(int sockfd, const void *buf, size_t len, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+extern ssize_t cfa_recv(int sockfd, void *buf, size_t len, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+extern int cfa_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+extern int cfa_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+extern int cfa_fallocate(int fd, int mode, off_t offset, off_t len, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+extern int cfa_posix_fadvise(int fd, off_t offset, off_t len, int advice, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+extern int cfa_madvise(void *addr, size_t length, int advice, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+extern int cfa_openat(int dirfd, const char *pathname, int flags, mode_t mode, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+#if defined(CFA_HAVE_OPENAT2)
+	extern int cfa_openat2(int dirfd, const char *pathname, struct open_how * how, size_t size, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+#endif
+extern int cfa_close(int fd, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+#if defined(CFA_HAVE_STATX)
+	extern int cfa_statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+#endif
+extern ssize_t cfa_read(int fd, void * buf, size_t count, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+extern ssize_t cfa_write(int fd, void * buf, size_t count, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+extern ssize_t cfa_splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+extern ssize_t cfa_tee(int fd_in, int fd_out, size_t len, unsigned int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+
+//----------
+// asynchronous calls
+#if defined(CFA_HAVE_PREADV2)
+	extern void async_preadv2(io_future_t & future, int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags, io_cancellation * cancellation, io_context * context);
+#endif
+#if defined(CFA_HAVE_PWRITEV2)
+	extern void async_pwritev2(io_future_t & future, int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags, io_cancellation * cancellation, io_context * context);
+#endif
+extern void async_fsync(io_future_t & future, int fd, int submit_flags, io_cancellation * cancellation, io_context * context);
+extern void async_epoll_ctl(io_future_t & future, int epfd, int op, int fd, struct epoll_event *event, int submit_flags, io_cancellation * cancellation, io_context * context);
+extern void async_sync_file_range(io_future_t & future, int fd, off64_t offset, off64_t nbytes, unsigned int flags, int submit_flags, io_cancellation * cancellation, io_context * context);
+extern void async_sendmsg(io_future_t & future, int sockfd, const struct msghdr *msg, int flags, int submit_flags, io_cancellation * cancellation, io_context * context);
+extern void async_recvmsg(io_future_t & future, int sockfd, struct msghdr *msg, int flags, int submit_flags, io_cancellation * cancellation, io_context * context);
+extern void async_send(io_future_t & future, int sockfd, const void *buf, size_t len, int flags, int submit_flags, io_cancellation * cancellation, io_context * context);
+extern void async_recv(io_future_t & future, int sockfd, void *buf, size_t len, int flags, int submit_flags, io_cancellation * cancellation, io_context * context);
+extern void async_accept4(io_future_t & future, int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags, int submit_flags, io_cancellation * cancellation, io_context * context);
+extern void async_connect(io_future_t & future, int sockfd, const struct sockaddr *addr, socklen_t addrlen, int submit_flags, io_cancellation * cancellation, io_context * context);
+extern void async_fallocate(io_future_t & future, int fd, int mode, off_t offset, off_t len, int submit_flags, io_cancellation * cancellation, io_context * context);
+extern void async_posix_fadvise(io_future_t & future, int fd, off_t offset, off_t len, int advice, int submit_flags, io_cancellation * cancellation, io_context * context);
+extern void async_madvise(io_future_t & future, void *addr, size_t length, int advice, int submit_flags, io_cancellation * cancellation, io_context * context);
+extern void async_openat(io_future_t & future, int dirfd, const char *pathname, int flags, mode_t mode, int submit_flags, io_cancellation * cancellation, io_context * context);
+#if defined(CFA_HAVE_OPENAT2)
+	extern void async_openat2(io_future_t & future, int dirfd, const char *pathname, struct open_how * how, size_t size, int submit_flags, io_cancellation * cancellation, io_context * context);
+#endif
+extern void async_close(io_future_t & future, int fd, int submit_flags, io_cancellation * cancellation, io_context * context);
+#if defined(CFA_HAVE_STATX)
+	extern void async_statx(io_future_t & future, int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf, int submit_flags, io_cancellation * cancellation, io_context * context);
+#endif
+extern void async_read(io_future_t & future, int fd, void * buf, size_t count, int submit_flags, io_cancellation * cancellation, io_context * context);
+extern void async_write(io_future_t & future, int fd, void * buf, size_t count, int submit_flags, io_cancellation * cancellation, io_context * context);
+extern void async_splice(io_future_t & future, int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags, int submit_flags, io_cancellation * cancellation, io_context * context);
+extern void async_tee(io_future_t & future, int fd_in, int fd_out, size_t len, unsigned int flags, int submit_flags, io_cancellation * cancellation, io_context * context);
+
 
 //-----------------------------------------------------------------------------
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 31a6f38ca7d2e1a23150072813a3cdf4f57f44f6)
+++ libcfa/src/concurrency/kernel.hfa	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -23,5 +23,6 @@
 
 extern "C" {
-#include <bits/pthreadtypes.h>
+	#include <bits/pthreadtypes.h>
+	#include <linux/types.h>
 }
 
@@ -157,5 +158,5 @@
 
 struct io_cancellation {
-	uint32_t target;
+	__u64 target;
 };
 
Index: libcfa/src/concurrency/monitor.cfa
===================================================================
--- libcfa/src/concurrency/monitor.cfa	(revision 31a6f38ca7d2e1a23150072813a3cdf4f57f44f6)
+++ libcfa/src/concurrency/monitor.cfa	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -89,5 +89,8 @@
 	__cfaabi_dbg_print_safe( "Kernel : %10p Entering mon %p (%p)\n", thrd, this, this->owner);
 
-	if( !this->owner ) {
+	if( unlikely(0 != (0x1 & (uintptr_t)this->owner)) ) {
+		abort( "Attempt by thread \"%.256s\" (%p) to access joined monitor %p.", thrd->self_cor.name, thrd, this );
+	}
+	else if( !this->owner ) {
 		// No one has the monitor, just take it
 		__set_owner( this, thrd );
@@ -137,5 +140,5 @@
 }
 
-static void __dtor_enter( $monitor * this, fptr_t func ) {
+static void __dtor_enter( $monitor * this, fptr_t func, bool join ) {
 	// Lock the monitor spinlock
 	lock( this->lock __cfaabi_dbg_ctx2 );
@@ -157,8 +160,22 @@
 		return;
 	}
-	else if( this->owner == thrd) {
+	else if( this->owner == thrd && !join) {
 		// We already have the monitor... but where about to destroy it so the nesting will fail
 		// Abort!
 		abort( "Attempt to destroy monitor %p by thread \"%.256s\" (%p) in nested mutex.", this, thrd->self_cor.name, thrd );
+	}
+	// SKULLDUGGERY: join acts as a dtor, so it would normally trigger the check above.
+	// To avoid that, __dtor_leave sets the owner to the special value (thrd | 1) before exiting.
+	else if( this->owner == ($thread*)(1 | (uintptr_t)thrd) ) {
+		// restore the owner and just return
+		__cfaabi_dbg_print_safe( "Kernel : Destroying free mon %p\n", this);
+
+		// Strip the low tag bit set by __dtor_leave so this thread is the plain owner again
+		this->owner = thrd;
+
+		verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+
+		unlock( this->lock );
+		return;
 	}
 
@@ -251,13 +268,15 @@
 
 // Leave single monitor for the last time
-void __dtor_leave( $monitor * this ) {
+void __dtor_leave( $monitor * this, bool join ) {
 	__cfaabi_dbg_debug_do(
 		if( TL_GET( this_thread ) != this->owner ) {
 			abort( "Destroyed monitor %p has inconsistent owner, expected %p got %p.\n", this, TL_GET( this_thread ), this->owner);
 		}
-		if( this->recursion != 1 ) {
+		if( this->recursion != 1  && !join ) {
 			abort( "Destroyed monitor %p has %d outstanding nested calls.\n", this, this->recursion - 1);
 		}
 	)
+
+	this->owner = ($thread*)(1 | (uintptr_t)this->owner);
 }
 
@@ -307,4 +326,15 @@
 }
 
+// Join a thread: enter its monitor through the destructor guard (join = true) and return the thread
+forall( dtype T | is_thread(T) )
+T & join( T & this ) {
+	$monitor *    m = get_monitor(this);
+	void (*dtor)(T& mutex this) = ^?{};
+	monitor_dtor_guard_t __guard = { &m, (fptr_t)dtor, true };
+	{
+		return this;
+	}
+}
+
 // Enter multiple monitor
 // relies on the monitor array being sorted
@@ -366,5 +396,5 @@
 // Ctor for monitor guard
 // Sorts monitors before entering
-void ?{}( monitor_dtor_guard_t & this, $monitor * m [], fptr_t func ) {
+void ?{}( monitor_dtor_guard_t & this, $monitor * m [], fptr_t func, bool join ) {
 	// optimization
 	$thread * thrd = TL_GET( this_thread );
@@ -376,8 +406,11 @@
 	this.prev = thrd->monitors;
 
+	// Save whether we are in a join or not
+	this.join = join;
+
 	// Update thread context (needed for conditions)
 	(thrd->monitors){m, 1, func};
 
-	__dtor_enter( this.m, func );
+	__dtor_enter( this.m, func, join );
 }
 
@@ -385,5 +418,5 @@
 void ^?{}( monitor_dtor_guard_t & this ) {
 	// Leave the monitors in order
-	__dtor_leave( this.m );
+	__dtor_leave( this.m, this.join );
 
 	// Restore thread context
Index: libcfa/src/concurrency/monitor.hfa
===================================================================
--- libcfa/src/concurrency/monitor.hfa	(revision 31a6f38ca7d2e1a23150072813a3cdf4f57f44f6)
+++ libcfa/src/concurrency/monitor.hfa	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -53,7 +53,8 @@
 	$monitor *    m;
 	__monitor_group_t prev;
+	bool join;
 };
 
-void ?{}( monitor_dtor_guard_t & this, $monitor ** m, void (*func)() );
+void ?{}( monitor_dtor_guard_t & this, $monitor ** m, void (*func)(), bool join );
 void ^?{}( monitor_dtor_guard_t & this );
 
Index: libcfa/src/concurrency/thread.hfa
===================================================================
--- libcfa/src/concurrency/thread.hfa	(revision 31a6f38ca7d2e1a23150072813a3cdf4f57f44f6)
+++ libcfa/src/concurrency/thread.hfa	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -106,4 +106,9 @@
 void sleep( Duration duration );
 
+//----------
+// join
+forall( dtype T | is_thread(T) )
+T & join( T & this );
+
 // Local Variables: //
 // mode: c //
Index: src/AST/Pass.hpp
===================================================================
--- src/AST/Pass.hpp	(revision 31a6f38ca7d2e1a23150072813a3cdf4f57f44f6)
+++ src/AST/Pass.hpp	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -50,5 +50,5 @@
 // | PureVisitor           - makes the visitor pure, it never modifies nodes in place and always
 //                           clones nodes it needs to make changes to
-// | WithTypeSubstitution  - provides polymorphic const TypeSubstitution * env for the
+// | WithConstTypeSubstitution - provides polymorphic const TypeSubstitution * typeSubs for the
 //                           current expression
 // | WithStmtsToAdd        - provides the ability to insert statements before or after the current
@@ -294,11 +294,10 @@
 //-------------------------------------------------------------------------------------------------
 
-/// Keep track of the polymorphic const TypeSubstitution * env for the current expression
-
 /// If used the visitor will always clone nodes.
 struct PureVisitor {};
 
+/// Keep track of the polymorphic const TypeSubstitution * typeSubs for the current expression.
 struct WithConstTypeSubstitution {
-	const TypeSubstitution * env = nullptr;
+	const TypeSubstitution * typeSubs = nullptr;
 };
 
Index: src/AST/Pass.impl.hpp
===================================================================
--- src/AST/Pass.impl.hpp	(revision 31a6f38ca7d2e1a23150072813a3cdf4f57f44f6)
+++ src/AST/Pass.impl.hpp	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -154,7 +154,7 @@
 		__pedantic_pass_assert( expr );
 
-		const ast::TypeSubstitution ** env_ptr = __pass::env( core, 0);
-		if ( env_ptr && expr->env ) {
-			*env_ptr = expr->env;
+		const ast::TypeSubstitution ** typeSubs_ptr = __pass::typeSubs( core, 0 );
+		if ( typeSubs_ptr && expr->env ) {
+			*typeSubs_ptr = expr->env;
 		}
 
@@ -177,5 +177,5 @@
 
 		// These may be modified by subnode but most be restored once we exit this statemnet.
-		ValueGuardPtr< const ast::TypeSubstitution * > __old_env         ( __pass::env( core, 0) );
+		ValueGuardPtr< const ast::TypeSubstitution * > __old_env         ( __pass::typeSubs( core, 0 ) );
 		ValueGuardPtr< typename std::remove_pointer< decltype(stmts_before) >::type > __old_decls_before( stmts_before );
 		ValueGuardPtr< typename std::remove_pointer< decltype(stmts_after ) >::type > __old_decls_after ( stmts_after  );
@@ -1488,5 +1488,5 @@
 
 		// These may be modified by subnode but most be restored once we exit this statemnet.
-		ValueGuardPtr< const ast::TypeSubstitution * > __old_env( __pass::env( core, 0) );
+		ValueGuardPtr< const ast::TypeSubstitution * > __old_env( __pass::typeSubs( core, 0 ) );
 		ValueGuardPtr< typename std::remove_pointer< decltype(stmts_before) >::type > __old_decls_before( stmts_before );
 		ValueGuardPtr< typename std::remove_pointer< decltype(stmts_after ) >::type > __old_decls_after ( stmts_after  );
Index: src/AST/Pass.proto.hpp
===================================================================
--- src/AST/Pass.proto.hpp	(revision 31a6f38ca7d2e1a23150072813a3cdf4f57f44f6)
+++ src/AST/Pass.proto.hpp	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -236,5 +236,5 @@
 
 	// List of fields and their expected types
-	FIELD_PTR( env, const ast::TypeSubstitution * )
+	FIELD_PTR( typeSubs, const ast::TypeSubstitution * )
 	FIELD_PTR( stmtsToAddBefore, std::list< ast::ptr< ast::Stmt > > )
 	FIELD_PTR( stmtsToAddAfter , std::list< ast::ptr< ast::Stmt > > )
Index: src/Common/Stats/ResolveTime.cc
===================================================================
--- src/Common/Stats/ResolveTime.cc	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
+++ src/Common/Stats/ResolveTime.cc	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -0,0 +1,59 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2019 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// ResolveTime.cc --
+//
+// Author           : Thierry Delisle
+// Created On       : Wed Sep 16 15:45:51 2020
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#include "ResolveTime.h"
+
+#include <fstream>
+#include <iomanip>
+
+#include "AST/Fwd.hpp"
+#include "AST/Expr.hpp"
+#include "AST/Print.hpp"
+#include "AST/Type.hpp"
+
+namespace Stats {
+	namespace ResolveTime {
+		static inline long long rdtscl(void) { // NOTE(review): `rdtsc` looks x86-only; confirm non-x86 builds are not expected
+			unsigned int lo, hi;
+			__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); // two 32-bit halves of the counter
+			return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 ); // recombine the halves into one 64-bit value
+		}
+
+		extern bool enabled; // defined in Stats.cc, wired to the "resolve" option there
+		bool started = false; // true between start() and the matching stop()
+		long long before; // counter value sampled at the end of start()
+		std::ostream & out = std::cout; // stream the measurements are written to
+
+		void start( const ast::Expr * expr ) { // when enabled: print the expression's location and sample the counter
+			if(enabled) {
+				assert(!started); // start()/stop() pairs must not nest
+				started = true;
+
+				out << expr->location << " : ";
+
+				before = rdtscl();
+			}
+		}
+		void stop() { // when enabled: print the counter delta since start() and end the line
+			if(enabled) {
+				assert(started); // stop() without a preceding start() is a usage error
+				auto after = rdtscl();
+				out << (after - before) << std::endl;
+
+				started = false;
+			}
+		}
+	};
+};
Index: src/Common/Stats/ResolveTime.h
===================================================================
--- src/Common/Stats/ResolveTime.h	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
+++ src/Common/Stats/ResolveTime.h	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -0,0 +1,38 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2019 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// ResolveTime.h --
+//
+// Author           : Thierry Delisle
+// Created On       : Wed Sep 16 15:45:51 2020
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#pragma once
+
+#include "Common/Stats/Base.h"
+
+#if defined( NO_STATISTICS )
+	#define NO_RESOLVE_TIME_STATISTICS
+#endif
+
+namespace ast {
+	class Expr;
+}
+
+namespace Stats {
+	namespace ResolveTime {
+		#if defined(NO_RESOLVE_TIME_STATISTICS)
+			inline void start( const ast::Expr * ) {} // no-op stub; inline because this header is included by several .cc files
+			inline void stop() {} // no-op stub
+		#else
+			void start( const ast::Expr * );
+			void stop();
+		#endif
+	};
+};
Index: src/Common/Stats/Stats.cc
===================================================================
--- src/Common/Stats/Stats.cc	(revision 31a6f38ca7d2e1a23150072813a3cdf4f57f44f6)
+++ src/Common/Stats/Stats.cc	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -35,4 +35,8 @@
 	}
 
+	namespace ResolveTime {
+		bool enabled = false;
+	}
+
 	struct {
 		const char * const opt;
@@ -43,4 +47,5 @@
 		{ "heap"    , Heap::enabled },
 		{ "time"    , Time::enabled },
+		{ "resolve" , ResolveTime::enabled },
 	};
 
Index: src/Common/module.mk
===================================================================
--- src/Common/module.mk	(revision 31a6f38ca7d2e1a23150072813a3cdf4f57f44f6)
+++ src/Common/module.mk	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -40,4 +40,6 @@
       Common/Stats/Heap.cc \
       Common/Stats/Heap.h \
+      Common/Stats/ResolveTime.cc \
+      Common/Stats/ResolveTime.h \
       Common/Stats/Stats.cc \
       Common/Stats/Time.cc \
Index: src/Concurrency/Keywords.cc
===================================================================
--- src/Concurrency/Keywords.cc	(revision 31a6f38ca7d2e1a23150072813a3cdf4f57f44f6)
+++ src/Concurrency/Keywords.cc	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -931,5 +931,6 @@
 					{
 						new SingleInit( new AddressExpr( new VariableExpr( monitors ) ) ),
-						new SingleInit( new CastExpr( new VariableExpr( func ), generic_func->clone(), false ) )
+						new SingleInit( new CastExpr( new VariableExpr( func ), generic_func->clone(), false ) ),
+						new SingleInit( new ConstantExpr( Constant::from_bool( false ) ) )
 					},
 					noDesignators,
Index: src/ResolvExpr/Resolver.cc
===================================================================
--- src/ResolvExpr/Resolver.cc	(revision 31a6f38ca7d2e1a23150072813a3cdf4f57f44f6)
+++ src/ResolvExpr/Resolver.cc	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -38,4 +38,5 @@
 #include "Common/PassVisitor.h"          // for PassVisitor
 #include "Common/SemanticError.h"        // for SemanticError
+#include "Common/Stats/ResolveTime.h"    // for ResolveTime::start(), ResolveTime::stop()
 #include "Common/utility.h"              // for ValueGuard, group_iterate
 #include "InitTweak/GenInit.h"
@@ -1169,5 +1170,8 @@
 			const ast::Expr * untyped, const ast::SymbolTable & symtab
 		) {
-			return findKindExpression( untyped, symtab );
+			Stats::ResolveTime::start( untyped );
+			auto res = findKindExpression( untyped, symtab );
+			Stats::ResolveTime::stop();
+			return res;
 		}
 	} // anonymous namespace
Index: src/SymTab/Validate.cc
===================================================================
--- src/SymTab/Validate.cc	(revision 31a6f38ca7d2e1a23150072813a3cdf4f57f44f6)
+++ src/SymTab/Validate.cc	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -960,4 +960,16 @@
 	}
 
+	static bool isNonParameterAttribute( Attribute * attr ) { // true when attr's name is one of bad_names
+		static const std::vector<std::string> bad_names = {
+			"aligned", "__aligned__",
+		};
+		for ( const std::string & name : bad_names ) { // by reference: no string copy per iteration
+			if ( name == attr->name ) {
+				return true;
+			}
+		}
+		return false;
+	}
+
 	Type * ReplaceTypedef::postmutate( TypeInstType * typeInst ) {
 		// instances of typedef types will come here. If it is an instance
@@ -968,11 +980,9 @@
 			ret->location = typeInst->location;
 			ret->get_qualifiers() |= typeInst->get_qualifiers();
-			// attributes are not carried over from typedef to function parameters/return values
-			if ( ! inFunctionType ) {
-				ret->attributes.splice( ret->attributes.end(), typeInst->attributes );
-			} else {
-				deleteAll( ret->attributes );
-				ret->attributes.clear();
-			}
+			// GCC ignores certain attributes if they arrive by typedef, this mimics that.
+			if ( inFunctionType ) {
+				ret->attributes.remove_if( isNonParameterAttribute );
+			}
+			ret->attributes.splice( ret->attributes.end(), typeInst->attributes );
 			// place instance parameters on the typedef'd type
 			if ( ! typeInst->parameters.empty() ) {
Index: tests/Makefile.am
===================================================================
--- tests/Makefile.am	(revision 31a6f38ca7d2e1a23150072813a3cdf4f57f44f6)
+++ tests/Makefile.am	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -38,4 +38,6 @@
 # since automake doesn't have support for CFA we have to
 AM_CFLAGS = $(if $(test), 2> $(test), ) \
+	-fdebug-prefix-map=$(abspath ${abs_srcdir})= \
+	-fdebug-prefix-map=/tmp= \
 	-g \
 	-Wall \
@@ -58,4 +60,5 @@
 # adjusted CC but without the actual distcc call
 CFACCLOCAL = $(if $(DISTCC_CFA_PATH),$(DISTCC_CFA_PATH) ${ARCH_FLAGS},$(TARGET_CFA) ${DEBUG_FLAGS} ${ARCH_FLAGS})
+CFACCLINK = $(CFACCLOCAL) $(if $(test), 2> $(test), ) $($(shell echo "${@}_FLAGSLD" | sed 's/-\|\//_/g'))
 
 PRETTY_PATH=mkdir -p $(dir $(abspath ${@})) && cd ${srcdir} &&
@@ -110,5 +113,6 @@
 % : %.cfa $(CFACCBIN)
 	$(CFACOMPILETEST) -c -o $(abspath ${@}).o
-	$(CFACCLOCAL) $($(shell echo "${@}_FLAGSLD" | sed 's/-\|\//_/g')) $(abspath ${@}).o -o $(abspath ${@})
+	$(CFACCLINK) ${@}.o -o $(abspath ${@})
+	rm $(abspath ${@}).o
 
 # implicit rule for c++ test
@@ -137,4 +141,8 @@
 # CUSTOM TARGET
 #------------------------------------------------------------------------------
+# tests that just validate syntax
+expression : expression.cfa $(CFACCBIN)
+	$(CFACOMPILETEST) -c -fsyntax-only 2> $(abspath ${@})
+
 # expected failures
 # use custom target since they require a custom define and custom dependencies
@@ -170,4 +178,11 @@
 	$(CFACCLOCAL) $($(shell echo "${@}_FLAGSLD" | sed 's/-\|\//_/g')) $(abspath ${@}).o -o $(abspath ${@})
 
+# Linking tests
+# Meta tests to make sure we see linking errors (can't compile with -O2 since it may multiply number of calls)
+linking/linkerror : linking/linkerror.cfa $(CFACCBIN)
+	$(CFACOMPILETEST) -O0 -c -o $(abspath ${@}).o
+	$(CFACCLINK)  -O0 ${@}.o -o $(abspath ${@})
+	rm $(abspath ${@}).o
+
 #------------------------------------------------------------------------------
 # Other targets
Index: tests/concurrent/.expect/join.txt
===================================================================
--- tests/concurrent/.expect/join.txt	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
+++ tests/concurrent/.expect/join.txt	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -0,0 +1,2 @@
+All workers got 42
+All other workers got 27
Index: tests/concurrent/join.cfa
===================================================================
--- tests/concurrent/join.cfa	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
+++ tests/concurrent/join.cfa	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -0,0 +1,52 @@
+#include <fstream.hfa>
+#include <thread.hfa>
+
+thread Worker { volatile int result; };
+
+void main(Worker & this) {
+	this.result = -10;
+	for(50) {
+		yield();
+	}
+	this.result = 42;
+}
+
+thread OtherWorker { volatile int result; };
+
+void main(OtherWorker & this) {
+	this.result = -10;
+	LOOP: for() {
+		waitfor( ^?{} : this) {
+			break LOOP;
+		}
+		or else {
+			yield();
+		}
+	}
+	this.result = 27;
+}
+
+
+int main(int argc, char* argv[]) {
+	{
+		Worker workers[17];
+		for(i; 17) {
+			int res = join( workers[i] ).result;
+			if( res != 42 ) {
+				sout | "Worker" | i | "got incorrect result:" | res;
+			}
+		}
+	}
+	sout | "All workers got 42";
+
+	{
+		OtherWorker workers[17];
+		for(i; 17) {
+			int res = join( workers[i] ).result;
+			if( res != 27 ) {
+				sout | "Other Worker" | i | "got incorrect result:" | res;
+			}
+		}
+	}
+	sout | "All other workers got 27";
+}
Index: tests/concurrent/joinerror.cfa
===================================================================
--- tests/concurrent/joinerror.cfa	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
+++ tests/concurrent/joinerror.cfa	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -0,0 +1,61 @@
+#include <fstream.hfa>
+#include <thread.hfa>
+
+thread Worker { volatile int result; }; // thread type; 'result' is written by main(Worker &) and read through get_result
+
+void main(Worker & this) { // routine taking the Worker by reference; NOTE(review): presumably the body each Worker thread runs — confirm
+	this.result = -10; // placeholder value, distinct from the 42 the program main checks for
+	for(50) { // NOTE(review): presumably CFA loop shorthand for 50 iterations — confirm
+		yield();
+	}
+	this.result = 42; // final value; the program main reports any other value as incorrect
+}
+
+int get_result( Worker & mutex this ) { // accessor whose parameter carries the 'mutex' qualifier
+	return this.result; // returns the current value of the worker's 'result' field
+}
+
+thread OtherWorker { volatile int result; }; // thread type; 'result' is written by main(OtherWorker &) and read through get_result
+
+void main(OtherWorker & this) { // routine taking the OtherWorker by reference; NOTE(review): presumably the thread body — confirm
+	this.result = -10; // placeholder value, distinct from the 27 the program main checks for
+	LOOP: for() { // no loop condition: the only exit is the labelled break below
+		waitfor( ^?{} : this) { // NOTE(review): presumably accepts a pending destructor-style call (such as the join in the program main) on this thread — confirm
+			break LOOP;
+		}
+		or else { // alternative branch of the waitfor; NOTE(review): presumably taken when no such call is pending — confirm
+			yield();
+		}
+	}
+	this.result = 27; // final value, assigned only after the loop has been left
+}
+
+int get_result( OtherWorker & mutex this ) { // accessor whose parameter carries the 'mutex' qualifier
+	return this.result; // returns the current value of the worker's 'result' field
+}
+
+int main(int argc, char* argv[]) { // argc and argv are not used in the body
+	{ // inner scope bounds the lifetime of the Worker array
+		Worker workers[17];
+		for(i; 17) { // NOTE(review): presumably i runs over 0..16, one pass per array element — confirm
+			join( workers[i] ); // value returned by join is discarded here
+			int res = get_result( workers[i] ); // mutex routine called after the join; NOTE(review): presumably the failure this test is named for — confirm against its expected output
+			if( res != 42 ) { // diagnostic is printed only on a mismatch
+				sout | "Worker" | i | "got incorrect result:" | res;
+			}
+		}
+	}
+	sout | "All workers got 42"; // not guarded by the check above
+
+	{ // same sequence for the second thread type, which expects 27
+		OtherWorker workers[17];
+		for(i; 17) { // NOTE(review): presumably i runs over 0..16 — confirm
+			join( workers[i] ); // value returned by join is discarded here
+			int res = get_result( workers[i] ); // mutex routine called after the join, as in the first block
+			if( res != 27 ) { // diagnostic is printed only on a mismatch
+				sout | "Other Worker" | i | "got incorrect result:" | res;
+			}
+		}
+	}
+	sout | "All other workers got 27"; // not guarded by the check above
+}
Index: tests/linking/.expect/linkerror.txt
===================================================================
--- tests/linking/.expect/linkerror.txt	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
+++ tests/linking/.expect/linkerror.txt	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -0,0 +1,3 @@
+linking/linkerror.o: In function `_X4mainFi___1':
+linking/linkerror.cfa:6: undefined reference to `_X18this_doesnot_existFv_i__1'
+collect2: error: ld returned 1 exit status
Index: tests/linking/linkerror.cfa
===================================================================
--- tests/linking/linkerror.cfa	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
+++ tests/linking/linkerror.cfa	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -0,0 +1,7 @@
+// This is more of a meta test, to confirm the test suite handles link errors correctly.
+
+extern void this_doesnot_exist(int); // declared but defined nowhere in this file, so the call below is left for the linker to resolve
+
+int main() {
+	this_doesnot_exist( 6 ); // keep this call on line 6: the expected output names linking/linkerror.cfa:6
+}
Index: tests/pybin/tools.py
===================================================================
--- tests/pybin/tools.py	(revision 31a6f38ca7d2e1a23150072813a3cdf4f57f44f6)
+++ tests/pybin/tools.py	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -120,5 +120,5 @@
 		return None
 
-	file = open(file, mode)
+	file = open(file, mode, encoding="latin-1") # use latin-1 so all chars mean something.
 	exitstack.push(file)
 	return file
Index: tests/test.py
===================================================================
--- tests/test.py	(revision 31a6f38ca7d2e1a23150072813a3cdf4f57f44f6)
+++ tests/test.py	(revision 08f3ad3fbb7f014c910d4d066d7c81832e707b6c)
@@ -207,5 +207,5 @@
 		else:
 			if os.stat(out_file).st_size < 1048576:
-				with open (out_file, "r") as myfile:
+				with open (out_file, "r", encoding='latin-1') as myfile:  # use latin-1 so all chars mean something.
 					error = myfile.read()
 			else:
