Index: benchmark/io/readv.cfa
===================================================================
--- benchmark/io/readv.cfa	(revision c680a4ba5e93338ec036f1c5135c217e61c20ac1)
+++ benchmark/io/readv.cfa	(revision 44aad8f4ae26b1fe3eef138bd44cc12610ad6e9a)
@@ -18,5 +18,5 @@
 
 extern bool traceHeapOn();
-extern ssize_t async_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
+extern ssize_t cfa_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
 
 int fd;
@@ -34,5 +34,5 @@
 
 	while(__atomic_load_n(&run, __ATOMIC_RELAXED)) {
-		async_preadv2(fd, &iov, 1, 0, 0);
+		cfa_preadv2(fd, &iov, 1, 0, 0);
 		__atomic_fetch_add( &count, 1, __ATOMIC_SEQ_CST );
 	}
@@ -40,4 +40,8 @@
 
 int main(int argc, char * argv[]) {
+	#if !defined(__CFA_NO_STATISTICS__)
+		print_stats_at_exit( *active_cluster() );
+	#endif
+
 	double duration   = 5.0;
 	unsigned long int nthreads = 2;
@@ -46,5 +50,4 @@
 	printf("Setting local\n");
 	setlocale(LC_NUMERIC, "");
-
 
 	arg_loop:
Index: examples/io/simple/server.cfa
===================================================================
--- examples/io/simple/server.cfa	(revision c680a4ba5e93338ec036f1c5135c217e61c20ac1)
+++ examples/io/simple/server.cfa	(revision 44aad8f4ae26b1fe3eef138bd44cc12610ad6e9a)
@@ -51,7 +51,7 @@
 
 //----------
-extern ssize_t async_recvmsg(int sockfd, struct msghdr *msg, int flags);
-extern int async_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
-extern int async_close(int fd);
+extern ssize_t cfa_recvmsg(int sockfd, struct msghdr *msg, int flags);
+extern int cfa_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
+extern int cfa_close(int fd);
 
 //----------
@@ -88,5 +88,5 @@
 	struct sockaddr_in cli_addr;
      	__socklen_t clilen = sizeof(cli_addr);
-	int newsock = async_accept4(sock, (struct sockaddr *) &cli_addr, &clilen, 0);
+	int newsock = cfa_accept4(sock, (struct sockaddr *) &cli_addr, &clilen, 0);
      	if (newsock < 0) {
 		error( printer, "accept", -newsock);
@@ -97,5 +97,5 @@
 
 	while(1) {
-		int res = async_recvmsg(newsock, &msg, 0);
+		int res = cfa_recvmsg(newsock, &msg, 0);
 		if(res == 0) break;
 		if(res < 0) {
@@ -107,5 +107,5 @@
 	}
 
-	ret = async_close(newsock);
+	ret = cfa_close(newsock);
       if(ret < 0) {
             error( printer, "close new", -ret);
@@ -113,5 +113,5 @@
       }
 
-	ret = async_close(sock);
+	ret = cfa_close(sock);
       if(ret < 0) {
             error( printer, "close old", -ret);
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision c680a4ba5e93338ec036f1c5135c217e61c20ac1)
+++ libcfa/src/concurrency/io.cfa	(revision 44aad8f4ae26b1fe3eef138bd44cc12610ad6e9a)
@@ -1,2 +1,17 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2020 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// io.cfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Thu Apr 23 17:31:00 2020
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
 #include "kernel.hfa"
 
@@ -8,8 +23,4 @@
 	void __kernel_io_shutdown( cluster & this ) {
 		// Nothing to do without io_uring
-	}
-
-	bool is_async( void (*)() ) {
-		return false;
 	}
 
@@ -90,8 +101,10 @@
 
 		// Requires features
-		// // adjust the size according to the parameters
-		// if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
-		// 	cq->ring_sz = sq->ring_sz = max(cq->ring_sz, sq->ring_sz);
-		// }
+		#if defined(IORING_FEAT_SINGLE_MMAP)
+			// adjust the size according to the parameters
+			if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
+				cq->ring_sz = sq->ring_sz = max(cq->ring_sz, sq->ring_sz);
+			}
+		#endif
 
 		// mmap the Submit Queue into existence
@@ -101,10 +114,13 @@
 		}
 
-		// mmap the Completion Queue into existence (may or may not be needed)
 		// Requires features
-		// if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
-		// 	cq->ring_ptr = sq->ring_ptr;
-		// }
-		// else {
+		#if defined(IORING_FEAT_SINGLE_MMAP)
+			// mmap the Completion Queue into existence (may or may not be needed)
+			if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
+				cq->ring_ptr = sq->ring_ptr;
+			}
+			else
+		#endif
+		{
 			// We need multiple call to MMAP
 			cq.ring_ptr = mmap(0, cq.ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
@@ -113,5 +129,5 @@
 				abort("KERNEL ERROR: IO_URING MMAP2 - %s\n", strerror(errno));
 			}
-		// }
+		}
 
 		// mmap the submit queue entries
@@ -160,4 +176,12 @@
 		(this.io.submit){ min(*sq.num, *cq.num) };
 
+		// Initialize statistics
+		#if !defined(__CFA_NO_STATISTICS__)
+			this.io.submit_q.stats.submit_avg.val = 0;
+			this.io.submit_q.stats.submit_avg.cnt = 0;
+			this.io.completion_q.stats.completed_avg.val = 0;
+			this.io.completion_q.stats.completed_avg.cnt = 0;
+		#endif
+
 		// Create the poller thread
 		this.io.stack = __create_pthread( &this.io.poller, __io_poller, &this );
@@ -174,4 +198,21 @@
 		pthread_join( this.io.poller, 0p );
 		free( this.io.stack );
+
+		// print statistics
+		#if !defined(__CFA_NO_STATISTICS__)
+			if(this.print_stats) {
+				__cfaabi_bits_print_safe( STDERR_FILENO,
+					"----- I/O uRing Stats -----\n"
+					"- total submit calls  : %llu\n"
+					"- avg submit          : %lf\n"
+					"- total wait calls    : %llu\n"
+					"- avg completion/wait : %lf\n",
+					this.io.submit_q.stats.submit_avg.cnt,
+					((double)this.io.submit_q.stats.submit_avg.val) / this.io.submit_q.stats.submit_avg.cnt,
+					this.io.completion_q.stats.completed_avg.cnt,
+					((double)this.io.completion_q.stats.completed_avg.val) / this.io.completion_q.stats.completed_avg.cnt
+				);
+			}
+		#endif
 
 		// Shutdown the io rings
@@ -204,29 +245,56 @@
 	// Process a single completion message from the io_uring
 	// This is NOT thread-safe
-	static bool __io_process(struct io_ring & ring) {
+	int __drain_io( struct io_ring & ring, sigset_t & mask, int waitcnt ) {
+		int ret = syscall( __NR_io_uring_enter, ring.fd, 0, waitcnt, IORING_ENTER_GETEVENTS, &mask, _NSIG / 8);
+		if( ret < 0 ) {
+			switch((int)errno) {
+			case EAGAIN:
+			case EINTR:
+				return -EAGAIN;
+			default:
+				abort( "KERNEL ERROR: IO_URING WAIT - %s\n", strerror(errno) );
+			}
+		}
+
+		// Drain the queue
 		unsigned head = *ring.completion_q.head;
 		unsigned tail = __atomic_load_n(ring.completion_q.tail, __ATOMIC_ACQUIRE);
 
-		if (head == tail) return false;
-
-		unsigned idx = head & (*ring.completion_q.mask);
-		struct io_uring_cqe & cqe = ring.completion_q.cqes[idx];
-
-		/* paranoid */ verify(&cqe);
-
-		struct io_user_data * data = (struct io_user_data *)cqe.user_data;
-		// __cfaabi_bits_print_safe( STDERR_FILENO, "Performed reading io cqe %p, result %d for %p\n", data, cqe.res, data->thrd );
-
-		data->result = cqe.res;
-		__unpark( data->thrd __cfaabi_dbg_ctx2 );
+		// Nothing was new return 0
+		if (head == tail) {
+			#if !defined(__CFA_NO_STATISTICS__)
+				ring.completion_q.stats.completed_avg.cnt += 1;
+			#endif
+			return 0;
+		}
+
+		uint32_t count = tail - head;
+		for(i; count) {
+			unsigned idx = (head + i) & (*ring.completion_q.mask);
+			struct io_uring_cqe & cqe = ring.completion_q.cqes[idx];
+
+			/* paranoid */ verify(&cqe);
+
+			struct io_user_data * data = (struct io_user_data *)cqe.user_data;
+			// __cfaabi_bits_print_safe( STDERR_FILENO, "Performed reading io cqe %p, result %d for %p\n", data, cqe.res, data->thrd );
+
+			data->result = cqe.res;
+			__unpark( data->thrd __cfaabi_dbg_ctx2 );
+		}
 
 		// Allow new submissions to happen
-		V(ring.submit);
+		V(ring.submit, count);
 
 		// Mark to the kernel that the cqe has been seen
 		// Ensure that the kernel only sees the new value of the head index after the CQEs have been read.
-		__atomic_fetch_add( ring.completion_q.head, 1, __ATOMIC_RELAXED );
-
-		return true;
+		__atomic_fetch_add( ring.completion_q.head, count, __ATOMIC_RELEASE ); // release: CQE reads above must not be reordered after the head update
+
+		// Update statistics
+		#if !defined(__CFA_NO_STATISTICS__)
+			ring.completion_q.stats.completed_avg.val += count;
+			ring.completion_q.stats.completed_avg.cnt += 1;
+		#endif
+
+		return count;
 	}
 
@@ -246,18 +314,6 @@
 		verify( (*ring.completion_q.head) == (*ring.completion_q.tail) );
 
-		LOOP: while(!__atomic_load_n(&ring.done, __ATOMIC_SEQ_CST)) {
-			int ret = syscall( __NR_io_uring_enter, ring.fd, 0, 1, IORING_ENTER_GETEVENTS, &mask, _NSIG / 8);
-			if( ret < 0 ) {
-				switch((int)errno) {
-				case EAGAIN:
-				case EINTR:
-					continue LOOP;
-				default:
-					abort( "KERNEL ERROR: IO_URING WAIT - %s\n", strerror(errno) );
-				}
-			}
-
-			// Drain the queue
-			while(__io_process(ring)) {}
+		while(!__atomic_load_n(&ring.done, __ATOMIC_SEQ_CST)) {
+			__drain_io( ring, mask, 1 );
 		}
 
@@ -293,83 +349,82 @@
 //
 
-static inline [* struct io_uring_sqe, uint32_t] __submit_alloc( struct io_ring & ring ) {
-	// Wait for a spot to be available
-	P(ring.submit);
-
-	// Allocate the sqe
-	uint32_t idx = __atomic_fetch_add(&ring.submit_q.alloc, 1ul32, __ATOMIC_SEQ_CST);
-
-	// Validate that we didn't overflow anything
-	// Check that nothing overflowed
-	/* paranoid */ verify( true );
-
-	// Check that it goes head -> tail -> alloc and never head -> alloc -> tail
-	/* paranoid */ verify( true );
-
-	// Return the sqe
-	return [&ring.submit_q.sqes[ idx & (*ring.submit_q.mask)], idx];
-}
-
-static inline void __submit( struct io_ring & ring, uint32_t idx ) {
-	// get mutual exclusion
-	lock(ring.submit_q.lock __cfaabi_dbg_ctx2);
-
-	// Append to the list of ready entries
-	uint32_t * tail = ring.submit_q.tail;
-	const uint32_t mask = *ring.submit_q.mask;
-
-	ring.submit_q.array[ (*tail) & mask ] = idx & mask;
-	__atomic_fetch_add(tail, 1ul32, __ATOMIC_SEQ_CST);
-
-	// Submit however, many entries need to be submitted
-	int ret = syscall( __NR_io_uring_enter, ring.fd, 1, 0, 0, 0p, 0);
-	// __cfaabi_bits_print_safe( STDERR_FILENO, "Performed io_submit, returned %d\n", ret );
-	if( ret < 0 ) {
-		switch((int)errno) {
-		default:
-			abort( "KERNEL ERROR: IO_URING SUBMIT - %s\n", strerror(errno) );
-		}
-	}
-
-	unlock(ring.submit_q.lock);
-	// Make sure that idx was submitted
-	// Be careful to not get false positive if we cycled the entire list or that someone else submitted for us
-}
-
-static inline void ?{}(struct io_uring_sqe & this, uint8_t opcode, int fd) {
-	this.opcode = opcode;
-	#if !defined(IOSQE_ASYNC)
-		this.flags = 0;
-	#else
-		this.flags = IOSQE_ASYNC;
-	#endif
-	this.ioprio = 0;
-	this.fd = fd;
-	this.off = 0;
-	this.addr = 0;
-	this.len = 0;
-	this.rw_flags = 0;
-	this.__pad2[0] = this.__pad2[1] = this.__pad2[2] = 0;
-}
-
-static inline void ?{}(struct io_uring_sqe & this, uint8_t opcode, int fd, void * addr, uint32_t len, uint64_t off ) {
-	(this){ opcode, fd };
-	this.off = off;
-	this.addr = (uint64_t)addr;
-	this.len = len;
-}
+	static inline [* struct io_uring_sqe, uint32_t] __submit_alloc( struct io_ring & ring ) {
+		// Wait for a spot to be available
+		P(ring.submit);
+
+		// Allocate the sqe
+		uint32_t idx = __atomic_fetch_add(&ring.submit_q.alloc, 1ul32, __ATOMIC_SEQ_CST);
+
+		// Validate that we didn't overflow anything
+		// Check that nothing overflowed
+		/* paranoid */ verify( true );
+
+		// Check that it goes head -> tail -> alloc and never head -> alloc -> tail
+		/* paranoid */ verify( true );
+
+		// Return the sqe
+		return [&ring.submit_q.sqes[ idx & (*ring.submit_q.mask)], idx];
+	}
+
+	static inline void __submit( struct io_ring & ring, uint32_t idx ) {
+		// get mutual exclusion
+		lock(ring.submit_q.lock __cfaabi_dbg_ctx2);
+
+		// Append to the list of ready entries
+		uint32_t * tail = ring.submit_q.tail;
+		const uint32_t mask = *ring.submit_q.mask;
+
+		ring.submit_q.array[ (*tail) & mask ] = idx & mask;
+		__atomic_fetch_add(tail, 1ul32, __ATOMIC_SEQ_CST);
+
+		// Submit however, many entries need to be submitted
+		int ret = syscall( __NR_io_uring_enter, ring.fd, 1, 0, 0, 0p, 0);
+		// __cfaabi_bits_print_safe( STDERR_FILENO, "Performed io_submit, returned %d\n", ret );
+		if( ret < 0 ) {
+			switch((int)errno) {
+			default:
+				abort( "KERNEL ERROR: IO_URING SUBMIT - %s\n", strerror(errno) );
+			}
+		}
+
+		// update statistics
+		#if !defined(__CFA_NO_STATISTICS__)
+			ring.submit_q.stats.submit_avg.val += 1;
+			ring.submit_q.stats.submit_avg.cnt += 1;
+		#endif
+
+		unlock(ring.submit_q.lock);
+		// Make sure that idx was submitted
+		// Be careful to not get false positive if we cycled the entire list or that someone else submitted for us
+	}
+
+	static inline void ?{}(struct io_uring_sqe & this, uint8_t opcode, int fd) {
+		this.opcode = opcode;
+		#if !defined(IOSQE_ASYNC)
+			this.flags = 0;
+		#else
+			this.flags = IOSQE_ASYNC;
+		#endif
+		this.ioprio = 0;
+		this.fd = fd;
+		this.off = 0;
+		this.addr = 0;
+		this.len = 0;
+		this.rw_flags = 0;
+		this.__pad2[0] = this.__pad2[1] = this.__pad2[2] = 0;
+	}
+
+	static inline void ?{}(struct io_uring_sqe & this, uint8_t opcode, int fd, void * addr, uint32_t len, uint64_t off ) {
+		(this){ opcode, fd };
+		this.off = off;
+		this.addr = (uint64_t)addr;
+		this.len = len;
+	}
+#endif
 
 //=============================================================================================
 // I/O Interface
 //=============================================================================================
-	extern "C" {
-		#define __USE_GNU
-		#define _GNU_SOURCE
-		#include <fcntl.h>
-		#include <sys/uio.h>
-		#include <sys/socket.h>
-		#include <sys/stat.h>
-	}
-
+#if defined(HAVE_LINUX_IO_URING_H)
 	#define __submit_prelude \
 		struct io_ring & ring = active_cluster()->io; \
@@ -385,261 +440,290 @@
 		park( __cfaabi_dbg_ctx ); \
 		return data.result;
+#endif
+
+// Some forward declarations
+extern "C" {
+	#include <sys/types.h>
+	struct iovec;
+	extern ssize_t preadv2 (int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
+	extern ssize_t pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
+
+	extern int fsync(int fd);
+	extern int sync_file_range(int fd, int64_t offset, int64_t nbytes, unsigned int flags);
+
+	struct msghdr;
+	struct sockaddr;
+	extern ssize_t sendmsg(int sockfd, const struct msghdr *msg, int flags);
+	extern ssize_t recvmsg(int sockfd, struct msghdr *msg, int flags);
+	extern ssize_t send(int sockfd, const void *buf, size_t len, int flags);
+	extern ssize_t recv(int sockfd, void *buf, size_t len, int flags);
+	extern int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
+	extern int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
+
+	extern int fallocate(int fd, int mode, uint64_t offset, uint64_t len);
+	extern int posix_fadvise(int fd, uint64_t offset, uint64_t len, int advice);
+	extern int madvise(void *addr, size_t length, int advice);
+
+	extern int openat(int dirfd, const char *pathname, int flags, mode_t mode);
+	extern int close(int fd);
+
+	struct statx;
+	extern int statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf);
+
+	extern ssize_t read (int fd, void *buf, size_t count);
+}
 
 //-----------------------------------------------------------------------------
 // Asynchronous operations
-	ssize_t async_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags) {
-		#if !defined(IORING_OP_READV)
-			return preadv2(fd, iov, iovcnt, offset, flags);
-		#else
-			__submit_prelude
-
-			(*sqe){ IORING_OP_READV, fd, iov, iovcnt, offset };
-
-			__submit_wait
-		#endif
-	}
-
-	ssize_t async_pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags) {
-		#if !defined(IORING_OP_WRITEV)
-			return pwritev2(fd, iov, iovcnt, offset, flags);
-		#else
-			__submit_prelude
-
-			(*sqe){ IORING_OP_WRITEV, fd, iov, iovcnt, offset };
-
-			__submit_wait
-		#endif
-	}
-
-	int async_fsync(int fd) {
-		#if !defined(IORING_OP_FSYNC)
-			return fsync(fd);
-		#else
-			__submit_prelude
-
-			(*sqe){ IORING_OP_FSYNC, fd };
-
-			__submit_wait
-		#endif
-	}
-
-	int async_sync_file_range(int fd, int64_t offset, int64_t nbytes, unsigned int flags) {
-		#if !defined(IORING_OP_SYNC_FILE_RANGE)
-			return sync_file_range(fd, offset, nbytes, flags);
-		#else
-			__submit_prelude
-
-			(*sqe){ IORING_OP_SYNC_FILE_RANGE, fd };
-			sqe->off = offset;
-			sqe->len = nbytes;
-			sqe->sync_range_flags = flags;
-
-			__submit_wait
-		#endif
-	}
-
-
-	ssize_t async_sendmsg(int sockfd, const struct msghdr *msg, int flags) {
-		#if !defined(IORING_OP_SENDMSG)
-			return recv(sockfd, msg, flags);
-		#else
-			__submit_prelude
-
-			(*sqe){ IORING_OP_SENDMSG, sockfd, msg, 1, 0 };
-			sqe->msg_flags = flags;
-
-			__submit_wait
-		#endif
-	}
-
-	ssize_t async_recvmsg(int sockfd, struct msghdr *msg, int flags) {
-		#if !defined(IORING_OP_RECVMSG)
-			return recv(sockfd, msg, flags);
-		#else
-			__submit_prelude
-
-			(*sqe){ IORING_OP_RECVMSG, sockfd, msg, 1, 0 };
-			sqe->msg_flags = flags;
-
-			__submit_wait
-		#endif
-	}
-
-	ssize_t async_send(int sockfd, const void *buf, size_t len, int flags) {
-		#if !defined(IORING_OP_SEND)
-			return send( sockfd, buf, len, flags );
-		#else
-			__submit_prelude
-
-			(*sqe){ IORING_OP_SEND, sockfd };
-			sqe->addr = (uint64_t)buf;
-			sqe->len = len;
-			sqe->msg_flags = flags;
-
-			__submit_wait
-		#endif
-	}
-
-	ssize_t async_recv(int sockfd, void *buf, size_t len, int flags) {
-		#if !defined(IORING_OP_RECV)
-			return recv( sockfd, buf, len, flags );
-		#else
-			__submit_prelude
-
-			(*sqe){ IORING_OP_RECV, sockfd };
-			sqe->addr = (uint64_t)buf;
-			sqe->len = len;
-			sqe->msg_flags = flags;
-
-			__submit_wait
-		#endif
-	}
-
-	int async_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags) {
-		#if !defined(IORING_OP_ACCEPT)
-			__SOCKADDR_ARG _addr;
-			_addr.__sockaddr__ = addr;
-			return accept4( sockfd, _addr, addrlen, flags );
-		#else
-			__submit_prelude
-
-			(*sqe){ IORING_OP_ACCEPT, sockfd };
-			sqe->addr = addr;
-			sqe->addr2 = addrlen;
-			sqe->accept_flags = flags;
-
-			__submit_wait
-		#endif
-	}
-
-	int async_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen) {
-		#if !defined(IORING_OP_CONNECT)
-			__CONST_SOCKADDR_ARG _addr;
-			_addr.__sockaddr__ = addr;
-			return connect( sockfd, _addr, addrlen );
-		#else
-			__submit_prelude
-
-			(*sqe){ IORING_OP_CONNECT, sockfd };
-			sqe->addr = (uint64_t)addr;
-			sqe->off = addrlen;
-
-			__submit_wait
-		#endif
-	}
-
-	int async_fallocate(int fd, int mode, uint64_t offset, uint64_t len) {
-		#if !defined(IORING_OP_FALLOCATE)
-			return fallocate( fd, mode, offset, len );
-		#else
-			__submit_prelude
-
-			(*sqe){ IORING_OP_FALLOCATE, fd };
-			sqe->off = offset;
-			sqe->len = length;
-			sqe->mode = mode;
-
-			__submit_wait
-		#endif
-	}
-
-	int async_fadvise(int fd, uint64_t offset, uint64_t len, int advice) {
-		#if !defined(IORING_OP_FADVISE)
-			return posix_fadvise( fd, offset, len, advice );
-		#else
-			__submit_prelude
-
-			(*sqe){ IORING_OP_FADVISE, fd };
-			sqe->off = (uint64_t)offset;
-			sqe->len = length;
-			sqe->fadvise_advice = advice;
-
-			__submit_wait
-		#endif
-	}
-
-	int async_madvise(void *addr, size_t length, int advice) {
-		#if !defined(IORING_OP_MADVISE)
-			return madvise( addr, length, advice );
-		#else
-			__submit_prelude
-
-			(*sqe){ IORING_OP_MADVISE, 0 };
-			sqe->addr = (uint64_t)addr;
-			sqe->len = length;
-			sqe->fadvise_advice = advice;
-
-			__submit_wait
-		#endif
-	}
-
-	int async_openat(int dirfd, const char *pathname, int flags, mode_t mode) {
-		#if !defined(IORING_OP_OPENAT)
-			return openat( dirfd, pathname, flags, mode );
-		#else
-			__submit_prelude
-
-			(*sqe){ IORING_OP_OPENAT, dirfd };
-			sqe->addr = (uint64_t)pathname;
-			sqe->open_flags = flags;
-			sqe->mode = mode;
-
-			__submit_wait
-		#endif
-	}
-
-	int async_close(int fd) {
-		#if !defined(IORING_OP_CLOSE)
-			return close( fd );
-		#else
-			__submit_prelude
-
-			(*sqe){ IORING_OP_CLOSE, fd };
-
-			__submit_wait
-		#endif
-	}
-
-	int async_statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf) {
-		#if !defined(IORING_OP_STATX)
-			//return statx( dirfd, pathname, flags, mask, statxbuf );
-			return syscall( __NR_io_uring_setup, dirfd, pathname, flags, mask, statxbuf );
-		#else
-			__submit_prelude
-
-			(*sqe){ IORING_OP_STATX, dirfd };
-			sqe->addr = (uint64_t)pathname;
-			sqe->statx_flags = flags;
-			sqe->len = mask;
-			sqe->off = (uint64_t)statxbuf;
-
-			__submit_wait
-		#endif
-	}
-
-
-	ssize_t async_read(int fd, void *buf, size_t count) {
-		#if !defined(IORING_OP_READ)
-			return read( fd, buf, count );
-		#else
-			__submit_prelude
-
-			(*sqe){ IORING_OP_READ, fd, buf, count, 0 };
-
-			__submit_wait
-		#endif
-	}
-
-	ssize_t async_write(int fd, void *buf, size_t count) {
-		#if !defined(IORING_OP_WRITE)
-			return read( fd, buf, count );
-		#else
-			__submit_prelude
-
-			(*sqe){ IORING_OP_WRITE, fd, buf, count, 0 };
-
-			__submit_wait
-		#endif
-	}
+ssize_t cfa_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags) {
+	#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_READV)
+		return preadv2(fd, iov, iovcnt, offset, flags);
+	#else
+		__submit_prelude
+
+		(*sqe){ IORING_OP_READV, fd, iov, iovcnt, offset };
+
+		__submit_wait
+	#endif
+}
+
+ssize_t cfa_pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags) {
+	#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_WRITEV)
+		return pwritev2(fd, iov, iovcnt, offset, flags);
+	#else
+		__submit_prelude
+
+		(*sqe){ IORING_OP_WRITEV, fd, iov, iovcnt, offset };
+
+		__submit_wait
+	#endif
+}
+
+int cfa_fsync(int fd) {
+	#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_FSYNC)
+		return fsync(fd);
+	#else
+		__submit_prelude
+
+		(*sqe){ IORING_OP_FSYNC, fd };
+
+		__submit_wait
+	#endif
+}
+
+int cfa_sync_file_range(int fd, int64_t offset, int64_t nbytes, unsigned int flags) {
+	#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_SYNC_FILE_RANGE)
+		return sync_file_range(fd, offset, nbytes, flags);
+	#else
+		__submit_prelude
+
+		(*sqe){ IORING_OP_SYNC_FILE_RANGE, fd };
+		sqe->off = offset;
+		sqe->len = nbytes;
+		sqe->sync_range_flags = flags;
+
+		__submit_wait
+	#endif
+}
+
+
+ssize_t cfa_sendmsg(int sockfd, const struct msghdr *msg, int flags) {
+	#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_SENDMSG)
+		return sendmsg(sockfd, msg, flags); // blocking fallback: must send the message, not recv() it
+	#else
+		__submit_prelude
+
+		(*sqe){ IORING_OP_SENDMSG, sockfd, msg, 1, 0 };
+		sqe->msg_flags = flags;
+
+		__submit_wait
+	#endif
+}
+
+ssize_t cfa_recvmsg(int sockfd, struct msghdr *msg, int flags) {
+	#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_RECVMSG)
+		return recvmsg(sockfd, msg, flags); // blocking fallback: recvmsg() takes a msghdr, recv() does not
+	#else
+		__submit_prelude
+
+		(*sqe){ IORING_OP_RECVMSG, sockfd, msg, 1, 0 };
+		sqe->msg_flags = flags;
+
+		__submit_wait
+	#endif
+}
+
+ssize_t cfa_send(int sockfd, const void *buf, size_t len, int flags) {
+	#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_SEND)
+		return send( sockfd, buf, len, flags );
+	#else
+		__submit_prelude
+
+		(*sqe){ IORING_OP_SEND, sockfd };
+		sqe->addr = (uint64_t)buf;
+		sqe->len = len;
+		sqe->msg_flags = flags;
+
+		__submit_wait
+	#endif
+}
+
+ssize_t cfa_recv(int sockfd, void *buf, size_t len, int flags) {
+	#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_RECV)
+		return recv( sockfd, buf, len, flags );
+	#else
+		__submit_prelude
+
+		(*sqe){ IORING_OP_RECV, sockfd };
+		sqe->addr = (uint64_t)buf;
+		sqe->len = len;
+		sqe->msg_flags = flags;
+
+		__submit_wait
+	#endif
+}
+
+int cfa_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags) {
+	#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_ACCEPT)
+		return accept4( sockfd, addr, addrlen, flags );
+	#else
+		__submit_prelude
+
+		(*sqe){ IORING_OP_ACCEPT, sockfd };
+		sqe->addr = (uint64_t)addr; // sqe fields hold user pointers as 64-bit integers, as in cfa_connect
+		sqe->addr2 = (uint64_t)addrlen;
+		sqe->accept_flags = flags;
+
+		__submit_wait
+	#endif
+}
+
+int cfa_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen) {
+	#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_CONNECT)
+		return connect( sockfd, addr, addrlen );
+	#else
+		__submit_prelude
+
+		(*sqe){ IORING_OP_CONNECT, sockfd };
+		sqe->addr = (uint64_t)addr;
+		sqe->off = addrlen;
+
+		__submit_wait
+	#endif
+}
+
+int cfa_fallocate(int fd, int mode, uint64_t offset, uint64_t len) {
+	#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_FALLOCATE)
+		return fallocate( fd, mode, offset, len );
+	#else
+		__submit_prelude
+
+		(*sqe){ IORING_OP_FALLOCATE, fd };
+		sqe->off = offset;
+		sqe->addr = len; // the 64-bit length travels in addr; the 32-bit len field cannot hold it
+		sqe->len = mode; // io_uring carries the fallocate mode in the len field
+
+		__submit_wait
+	#endif
+}
+
+int cfa_fadvise(int fd, uint64_t offset, uint64_t len, int advice) {
+	#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_FADVISE)
+		return posix_fadvise( fd, offset, len, advice );
+	#else
+		__submit_prelude
+
+		(*sqe){ IORING_OP_FADVISE, fd };
+		sqe->off = (uint64_t)offset;
+		sqe->len = len; // the parameter is named len; there is no 'length' in this function
+		sqe->fadvise_advice = advice;
+
+		__submit_wait
+	#endif
+}
+
+int cfa_madvise(void *addr, size_t length, int advice) {
+	#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_MADVISE)
+		return madvise( addr, length, advice );
+	#else
+		__submit_prelude
+
+		(*sqe){ IORING_OP_MADVISE, 0 };
+		sqe->addr = (uint64_t)addr;
+		sqe->len = length;
+		sqe->fadvise_advice = advice;
+
+		__submit_wait
+	#endif
+}
+
+int cfa_openat(int dirfd, const char *pathname, int flags, mode_t mode) {
+	#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_OPENAT)
+		return openat( dirfd, pathname, flags, mode );
+	#else
+		__submit_prelude
+
+		(*sqe){ IORING_OP_OPENAT, dirfd };
+		sqe->addr = (uint64_t)pathname;
+		sqe->open_flags = flags;
+		sqe->len = mode; // io_uring carries the creation mode in the len field
+
+		__submit_wait
+	#endif
+}
+
+int cfa_close(int fd) {
+	#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_CLOSE)
+		return close( fd );
+	#else
+		__submit_prelude
+
+		(*sqe){ IORING_OP_CLOSE, fd };
+
+		__submit_wait
+	#endif
+}
+
+int cfa_statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf) {
+	#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_STATX)
+		// Blocking fallback: issue the statx system call itself (not io_uring_setup)
+		return syscall( __NR_statx, dirfd, pathname, flags, mask, statxbuf );
+	#else
+		__submit_prelude
+
+		(*sqe){ IORING_OP_STATX, dirfd };
+		sqe->addr = (uint64_t)pathname;
+		sqe->statx_flags = flags;
+		sqe->len = mask;
+		sqe->off = (uint64_t)statxbuf;
+
+		__submit_wait
+	#endif
+}
+
+
+ssize_t cfa_read(int fd, void *buf, size_t count) {
+	#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_READ)
+		return read( fd, buf, count );
+	#else
+		__submit_prelude
+
+		(*sqe){ IORING_OP_READ, fd, buf, count, 0 };
+
+		__submit_wait
+	#endif
+}
+
+extern "C" { extern ssize_t write(int fd, const void *buf, size_t count); } // write() is missing from the forward declarations above
+ssize_t cfa_write(int fd, void *buf, size_t count) {
+	#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_WRITE)
+		return write( fd, buf, count ); // blocking fallback: must write the buffer, not read() into it
+	#else
+		__submit_prelude
+		(*sqe){ IORING_OP_WRITE, fd, buf, count, 0 };
+
+		__submit_wait
+	#endif
+}
 
 //-----------------------------------------------------------------------------
@@ -647,103 +731,102 @@
 
 // Macro magic to reduce the size of the following switch case
-	#define IS_DEFINED_APPLY(f, ...) f(__VA_ARGS__)
-	#define IS_DEFINED_SECOND(first, second, ...) second
-	#define IS_DEFINED_TEST(expansion) _CFA_IO_FEATURE_##expansion
-	#define IS_DEFINED(macro) IS_DEFINED_APPLY( IS_DEFINED_SECOND,IS_DEFINED_TEST(macro) false, true)
-
-	bool is_async( fptr_t func ) {
-
+#define IS_DEFINED_APPLY(f, ...) f(__VA_ARGS__)
+#define IS_DEFINED_SECOND(first, second, ...) second
+#define IS_DEFINED_TEST(expansion) _CFA_IO_FEATURE_##expansion
+#define IS_DEFINED(macro) IS_DEFINED_APPLY( IS_DEFINED_SECOND,IS_DEFINED_TEST(macro) false, true)
+
+bool has_user_level_blocking( fptr_t func ) {
+	#if defined(HAVE_LINUX_IO_URING_H)
 		if( /*func == (fptr_t)preadv2 || */
-			func == (fptr_t)async_preadv2 )
+			func == (fptr_t)cfa_preadv2 )
 			#define _CFA_IO_FEATURE_IORING_OP_READV ,
 			return IS_DEFINED(IORING_OP_READV);
 
 		if( /*func == (fptr_t)pwritev2 || */
-		      func == (fptr_t)async_pwritev2 )
+			func == (fptr_t)cfa_pwritev2 )
 			#define _CFA_IO_FEATURE_IORING_OP_WRITEV ,
 			return IS_DEFINED(IORING_OP_WRITEV);
 
 		if( /*func == (fptr_t)fsync || */
-		      func == (fptr_t)async_fsync )
+			func == (fptr_t)cfa_fsync )
 			#define _CFA_IO_FEATURE_IORING_OP_FSYNC ,
 			return IS_DEFINED(IORING_OP_FSYNC);
 
 		if( /*func == (fptr_t)ync_file_range || */
-		      func == (fptr_t)async_sync_file_range )
+			func == (fptr_t)cfa_sync_file_range )
 			#define _CFA_IO_FEATURE_IORING_OP_SYNC_FILE_RANGE ,
 			return IS_DEFINED(IORING_OP_SYNC_FILE_RANGE);
 
 		if( /*func == (fptr_t)sendmsg || */
-		      func == (fptr_t)async_sendmsg )
+			func == (fptr_t)cfa_sendmsg )
 			#define _CFA_IO_FEATURE_IORING_OP_SENDMSG ,
 			return IS_DEFINED(IORING_OP_SENDMSG);
 
 		if( /*func == (fptr_t)recvmsg || */
-		      func == (fptr_t)async_recvmsg )
+			func == (fptr_t)cfa_recvmsg )
 			#define _CFA_IO_FEATURE_IORING_OP_RECVMSG ,
 			return IS_DEFINED(IORING_OP_RECVMSG);
 
 		if( /*func == (fptr_t)send || */
-			func == (fptr_t)async_send )
+			func == (fptr_t)cfa_send )
 			#define _CFA_IO_FEATURE_IORING_OP_SEND ,
 			return IS_DEFINED(IORING_OP_SEND);
 
 		if( /*func == (fptr_t)recv || */
-			func == (fptr_t)async_recv )
+			func == (fptr_t)cfa_recv )
 			#define _CFA_IO_FEATURE_IORING_OP_RECV ,
 			return IS_DEFINED(IORING_OP_RECV);
 
 		if( /*func == (fptr_t)accept4 || */
-			func == (fptr_t)async_accept4 )
+			func == (fptr_t)cfa_accept4 )
 			#define _CFA_IO_FEATURE_IORING_OP_ACCEPT ,
 			return IS_DEFINED(IORING_OP_ACCEPT);
 
 		if( /*func == (fptr_t)connect || */
-			func == (fptr_t)async_connect )
+			func == (fptr_t)cfa_connect )
 			#define _CFA_IO_FEATURE_IORING_OP_CONNECT ,
 			return IS_DEFINED(IORING_OP_CONNECT);
 
 		if( /*func == (fptr_t)fallocate || */
-			func == (fptr_t)async_fallocate )
+			func == (fptr_t)cfa_fallocate )
 			#define _CFA_IO_FEATURE_IORING_OP_FALLOCATE ,
 			return IS_DEFINED(IORING_OP_FALLOCATE);
 
-		if( /*func == (fptr_t)fadvise || */
-			func == (fptr_t)async_fadvise )
+		if( /*func == (fptr_t)posix_fadvise || */
+			func == (fptr_t)cfa_fadvise )
 			#define _CFA_IO_FEATURE_IORING_OP_FADVISE ,
 			return IS_DEFINED(IORING_OP_FADVISE);
 
 		if( /*func == (fptr_t)madvise || */
-			func == (fptr_t)async_madvise )
+			func == (fptr_t)cfa_madvise )
 			#define _CFA_IO_FEATURE_IORING_OP_MADVISE ,
 			return IS_DEFINED(IORING_OP_MADVISE);
 
 		if( /*func == (fptr_t)openat || */
-			func == (fptr_t)async_openat )
+			func == (fptr_t)cfa_openat )
 			#define _CFA_IO_FEATURE_IORING_OP_OPENAT ,
 			return IS_DEFINED(IORING_OP_OPENAT);
 
 		if( /*func == (fptr_t)close || */
-			func == (fptr_t)async_close )
+			func == (fptr_t)cfa_close )
 			#define _CFA_IO_FEATURE_IORING_OP_CLOSE ,
 			return IS_DEFINED(IORING_OP_CLOSE);
 
 		if( /*func == (fptr_t)statx || */
-			func == (fptr_t)async_statx )
+			func == (fptr_t)cfa_statx )
 			#define _CFA_IO_FEATURE_IORING_OP_STATX ,
 			return IS_DEFINED(IORING_OP_STATX);
 
 		if( /*func == (fptr_t)read || */
-		      func == (fptr_t)async_read )
+			func == (fptr_t)cfa_read )
 			#define _CFA_IO_FEATURE_IORING_OP_READ ,
 			return IS_DEFINED(IORING_OP_READ);
 
 		if( /*func == (fptr_t)write || */
-		      func == (fptr_t)async_write )
+			func == (fptr_t)cfa_write )
 			#define _CFA_IO_FEATURE_IORING_OP_WRITE ,
 			return IS_DEFINED(IORING_OP_WRITE);
-
-		return false;
-	}
-
-#endif
+	#endif
+
+	return false;
+}
Index: libcfa/src/concurrency/iofwd.hfa
===================================================================
--- libcfa/src/concurrency/iofwd.hfa	(revision 44aad8f4ae26b1fe3eef138bd44cc12610ad6e9a)
+++ libcfa/src/concurrency/iofwd.hfa	(revision 44aad8f4ae26b1fe3eef138bd44cc12610ad6e9a)
@@ -0,0 +1,39 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2020 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// iofwd.hfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Thu Apr 23 17:31:00 2020
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#pragma once
+// Forward declarations of the cfa_* I/O routines.
+ssize_t cfa_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
+ssize_t cfa_pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
+int cfa_fsync(int fd);
+int cfa_sync_file_range(int fd, int64_t offset, int64_t nbytes, unsigned int flags);
+ssize_t cfa_sendmsg(int sockfd, const struct msghdr *msg, int flags);
+ssize_t cfa_recvmsg(int sockfd, struct msghdr *msg, int flags);
+ssize_t cfa_send(int sockfd, const void *buf, size_t len, int flags);
+ssize_t cfa_recv(int sockfd, void *buf, size_t len, int flags);
+int cfa_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
+int cfa_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
+int cfa_fallocate(int fd, int mode, uint64_t offset, uint64_t len);
+int cfa_fadvise(int fd, uint64_t offset, uint64_t len, int advice);
+int cfa_madvise(void *addr, size_t length, int advice);
+int cfa_openat(int dirfd, const char *pathname, int flags, mode_t mode);
+int cfa_close(int fd);
+int cfa_statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf);
+ssize_t cfa_read(int fd, void *buf, size_t count);
+ssize_t cfa_write(int fd, void *buf, size_t count);
+
+//-----------------------------------------------------------------------------
+// Check if a function blocks only the user thread
+bool has_user_level_blocking( fptr_t func );
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision c680a4ba5e93338ec036f1c5135c217e61c20ac1)
+++ libcfa/src/concurrency/kernel.cfa	(revision 44aad8f4ae26b1fe3eef138bd44cc12610ad6e9a)
@@ -257,4 +257,8 @@
 	ready_queue{};
 	ready_queue_lock{};
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		print_stats = false;
+	#endif
 
 	procs{ __get };
@@ -1004,4 +1008,18 @@
 }
 
+// Raise the semaphore by `diff`, unparking waiting threads; returns true if any thread was released.
+bool V(semaphore & this, unsigned diff) with( this ) {
+	lock( lock __cfaabi_dbg_ctx2 );
+	// Wake at most `diff` threads, capped at the queued ones (presumably -count when count < 0 — confirm against P()).
+	int blocked = count < 0 ? -count : 0;
+	int release = (unsigned)blocked < diff ? blocked : (int)diff;
+	count += diff;
+	for(release) {
+		unpark( pop_head( waiting ) __cfaabi_dbg_ctx2 );
+	}
+	unlock( lock );
+	return release > 0;
+}
+
 //-----------------------------------------------------------------------------
 // Global Queues
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision c680a4ba5e93338ec036f1c5135c217e61c20ac1)
+++ libcfa/src/concurrency/kernel.hfa	(revision 44aad8f4ae26b1fe3eef138bd44cc12610ad6e9a)
@@ -40,4 +40,5 @@
 void   P (semaphore & this);
 bool   V (semaphore & this);
+bool   V (semaphore & this, unsigned count); // raises the semaphore by `count` and unparks waiting threads
 
 
@@ -144,4 +145,14 @@
 	void * ring_ptr;
 	size_t ring_sz;
+
+	// Statistics
+	#if !defined(__CFA_NO_STATISTICS__)
+		struct {
+			struct {
+				unsigned long long int val;
+				unsigned long long int cnt;
+			} submit_avg;
+		} stats;
+	#endif
 };
 
@@ -164,4 +175,14 @@
 	void * ring_ptr;
 	size_t ring_sz;
+
+	// Statistics
+	#if !defined(__CFA_NO_STATISTICS__)
+		struct {
+			struct {
+				unsigned long long int val;
+				unsigned long long int cnt;
+			} completed_avg;
+		} stats;
+	#endif
 };
 
@@ -213,4 +234,8 @@
 		struct io_ring io;
 	#endif
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		bool print_stats;
+	#endif
 };
 extern Duration default_preemption();
@@ -227,4 +252,10 @@
 static inline struct processor * active_processor() { return TL_GET( this_processor ); } // UNSAFE
 static inline struct cluster   * active_cluster  () { return TL_GET( this_processor )->cltr; }
+
+#if !defined(__CFA_NO_STATISTICS__)
+	// Set the print_stats flag of cluster `this`; only compiled when
+	// statistics are enabled (__CFA_NO_STATISTICS__ is not defined).
+	static inline void print_stats_at_exit( cluster & this ) { this.print_stats = true; }
+#endif
 
 // Local Variables: //
