Index: benchmark/io/batch-readv.c
===================================================================
--- benchmark/io/batch-readv.c	(revision fc9bb791282f9039edf2fe373d71ff9c77f57c60)
+++ benchmark/io/batch-readv.c	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -1,25 +1,9 @@
 // Program to test the optimial batchsize in a single threaded process
 extern "C" {
-	#ifndef _GNU_SOURCE         /* See feature_test_macros(7) */
-	#define _GNU_SOURCE         /* See feature_test_macros(7) */
-	#endif
-	#include <errno.h>
-	#include <stdio.h>
-	#include <stdint.h>
-	#include <stdlib.h>
-	#include <string.h>
+	#include <getopt.h>
 	#include <locale.h>
-	#include <getopt.h>
-	#include <unistd.h>
-	#include <sys/mman.h>
-	#include <sys/syscall.h>
-	#include <sys/uio.h>
-	#include <fcntl.h>
 	#include <time.h>										// timespec
 	#include <sys/time.h>									// timeval
-
-	#include <linux/io_uring.h>
 }
-
 
 enum { TIMEGRAN = 1000000000LL };					// nanosecond granularity, except for timeval
@@ -27,76 +11,7 @@
 #include <omp.h>
 
-# ifndef __NR_io_uring_setup
-#  define __NR_io_uring_setup		425
-# endif
-# ifndef __NR_io_uring_enter
-#  define __NR_io_uring_enter		426
-# endif
-# ifndef __NR_io_uring_register
-#  define __NR_io_uring_register	427
-# endif
+#include "io_uring.h"
 
-struct io_uring_sq {
-	// Head and tail of the ring (associated with array)
-	volatile uint32_t * head;
-	volatile uint32_t * tail;
 
-	// The actual kernel ring which uses head/tail
-	// indexes into the sqes arrays
-	uint32_t * array;
-
-	// number of entries and mask to go with it
-	const uint32_t * num;
-	const uint32_t * mask;
-
-	// Submission flags (Not sure what for)
-	uint32_t * flags;
-
-	// number of sqes not submitted (whatever that means)
-	uint32_t * dropped;
-
-	// Like head/tail but not seen by the kernel
-	volatile uint32_t alloc;
-
-	// A buffer of sqes (not the actual ring)
-	struct io_uring_sqe * sqes;
-
-	// The location and size of the mmaped area
-	void * ring_ptr;
-	size_t ring_sz;
-};
-
-struct io_uring_cq {
-	// Head and tail of the ring
-	volatile uint32_t * head;
-	volatile uint32_t * tail;
-
-	// number of entries and mask to go with it
-	const uint32_t * mask;
-	const uint32_t * num;
-
-	// number of cqes not submitted (whatever that means)
-	uint32_t * overflow;
-
-	// the kernel ring
-	struct io_uring_cqe * cqes;
-
-	// The location and size of the mmaped area
-	void * ring_ptr;
-	size_t ring_sz;
-};
-
-struct io_ring {
-	struct io_uring_sq submit_q;
-	struct io_uring_cq completion_q;
-	uint32_t flags;
-	int fd;
-};
-
-struct fred {
-	io_ring io;
-};
-
-fred self;
 int myfd;
 
@@ -217,83 +132,5 @@
 	myfd = open(__FILE__, 0);
 
-	// Step 1 : call to setup
-	struct io_uring_params params;
-	memset(&params, 0, sizeof(params));
-
-	uint32_t nentries = 2048;
-
-	int fd = syscall(__NR_io_uring_setup, nentries, &params );
-	if(fd < 0) {
-		fprintf(stderr, "KERNEL ERROR: IO_URING SETUP - %s\n", strerror(errno));
-		abort();
-	}
-
-	// Step 2 : mmap result
-	memset(&self.io, 0, sizeof(struct io_ring));
-	struct io_uring_sq & sq = self.io.submit_q;
-	struct io_uring_cq & cq = self.io.completion_q;
-
-	// calculate the right ring size
-	sq.ring_sz = params.sq_off.array + (params.sq_entries * sizeof(unsigned)           );
-	cq.ring_sz = params.cq_off.cqes  + (params.cq_entries * sizeof(struct io_uring_cqe));
-
-	// Requires features
-	// // adjust the size according to the parameters
-	// if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
-	// 	cq->ring_sz = sq->ring_sz = max(cq->ring_sz, sq->ring_sz);
-	// }
-
-	// mmap the Submit Queue into existence
-	sq.ring_ptr = mmap(0, sq.ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
-	if (sq.ring_ptr == (void*)MAP_FAILED) {
-		fprintf(stderr, "KERNEL ERROR: IO_URING MMAP1 - %s\n", strerror(errno));
-		abort();
-	}
-
-	// mmap the Completion Queue into existence (may or may not be needed)
-	// Requires features
-	// if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
-	// 	cq->ring_ptr = sq->ring_ptr;
-	// }
-	// else {
-		// We need multiple call to MMAP
-		cq.ring_ptr = mmap(0, cq.ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
-		if (cq.ring_ptr == (void*)MAP_FAILED) {
-			munmap(sq.ring_ptr, sq.ring_sz);
-			fprintf(stderr, "KERNEL ERROR: IO_URING MMAP2 - %s\n", strerror(errno));
-			abort();
-		}
-	// }
-
-	// mmap the submit queue entries
-	size_t size = params.sq_entries * sizeof(struct io_uring_sqe);
-	sq.sqes = (struct io_uring_sqe *)mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
-	if (sq.sqes == (struct io_uring_sqe *)MAP_FAILED) {
-		munmap(sq.ring_ptr, sq.ring_sz);
-		if (cq.ring_ptr != sq.ring_ptr) munmap(cq.ring_ptr, cq.ring_sz);
-		fprintf(stderr, "KERNEL ERROR: IO_URING MMAP3 - %s\n", strerror(errno));
-		abort();
-	}
-
-	// Get the pointers from the kernel to fill the structure
-	// submit queue
-	sq.head    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.head);
-	sq.tail    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.tail);
-	sq.mask    = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_mask);
-	sq.num     = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_entries);
-	sq.flags   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.flags);
-	sq.dropped = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.dropped);
-	sq.array   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);
-	sq.alloc = *sq.tail;
-
-	// completion queue
-	cq.head     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.head);
-	cq.tail     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.tail);
-	cq.mask     = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_mask);
-	cq.num      = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_entries);
-	cq.overflow = (         uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.overflow);
-	cq.cqes   = (struct io_uring_cqe *)(((intptr_t)cq.ring_ptr) + params.cq_off.cqes);
-
-	self.io.fd = fd;
+	init_uring(2048);
 
 	// Allocate the sqe
@@ -344,7 +181,9 @@
 
 	printf("Took %'ld ms\n", to_miliseconds(end - start));
-	printf("Submitted       %'llu\n", submits);
-	printf("Completed       %'llu\n", completes);
-	printf("Submitted / sec %'.f\n", submits   / to_fseconds(end - start));
-	printf("Completed / sec %'.f\n", completes / to_fseconds(end - start));
+	printf("Submitted        %'llu\n", submits);
+	printf("Completed        %'llu\n", completes);
+	printf("Submitted / sec  %'.f\n", submits   / to_fseconds(end - start));
+	printf("Completed / sec  %'.f\n", completes / to_fseconds(end - start));
+	printf("ns per Submitted %'.f\n", 1000000000.0 * to_fseconds(end - start) / (submits   / batch) );
+	printf("ns per Completed %'.f\n", 1000000000.0 * to_fseconds(end - start) / (completes / batch) );
 }
Index: benchmark/io/io_uring.h
===================================================================
--- benchmark/io/io_uring.h	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
+++ benchmark/io/io_uring.h	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -0,0 +1,171 @@
+extern "C" {
+	#ifndef _GNU_SOURCE         /* See feature_test_macros(7) */
+	#define _GNU_SOURCE         /* See feature_test_macros(7) */
+	#endif
+	#include <errno.h>
+	#include <stdio.h>
+	#include <stdint.h>
+	#include <stdlib.h>
+	#include <string.h>
+	#include <unistd.h>
+	#include <sys/mman.h>
+	#include <sys/syscall.h>
+	#include <sys/uio.h>
+	#include <fcntl.h>
+
+	#include <linux/io_uring.h>
+}
+
+# ifndef __NR_io_uring_setup
+#  define __NR_io_uring_setup		425
+# endif
+# ifndef __NR_io_uring_enter
+#  define __NR_io_uring_enter		426
+# endif
+# ifndef __NR_io_uring_register
+#  define __NR_io_uring_register	427
+# endif
+
+struct io_uring_sq {	// User-space view of the submission queue; pointer fields are resolved by init_uring from the mmaped ring and params.sq_off
+	// Head and tail indices of the ring (they index `array`), located at sq_off.head / sq_off.tail
+	volatile uint32_t * head;
+	volatile uint32_t * tail;
+
+	// The actual kernel ring which uses head/tail;
+	// each slot holds an index into the `sqes` buffer
+	uint32_t * array;
+
+	// number of ring entries and the mask to go with it (sq_off.ring_entries / sq_off.ring_mask)
+	const uint32_t * num;
+	const uint32_t * mask;
+
+	// Flags word shared with the kernel (sq_off.flags), e.g. IORING_SQ_NEED_WAKEUP when SQPOLL is in use
+	uint32_t * flags;
+
+	// Kernel counter at sq_off.dropped; per the io_uring docs, submissions dropped for carrying an invalid sqe index
+	uint32_t * dropped;
+
+	// Like head/tail but private to user space (not seen by the kernel); initialised from *tail
+	volatile uint32_t alloc;
+
+	// The buffer of sqes (its own mmap, not part of the ring mapping)
+	struct io_uring_sqe * sqes;
+
+	// The location and size of the mmaped ring area
+	void * ring_ptr;
+	size_t ring_sz;
+};
+
+struct io_uring_cq {	// User-space view of the completion queue; pointer fields are resolved by init_uring from the mmaped ring and params.cq_off
+	// Head and tail indices of the ring, located at cq_off.head / cq_off.tail
+	volatile uint32_t * head;
+	volatile uint32_t * tail;
+
+	// mask and number of ring entries (cq_off.ring_mask / cq_off.ring_entries)
+	const uint32_t * mask;
+	const uint32_t * num;
+
+	// Kernel counter at cq_off.overflow; per the io_uring docs, completions lost because the ring was full
+	uint32_t * overflow;
+
+	// the kernel ring of completion entries, located at cq_off.cqes
+	struct io_uring_cqe * cqes;
+
+	// The location and size of the mmaped ring area
+	void * ring_ptr;
+	size_t ring_sz;
+};
+
+struct io_ring {	// One io_uring instance: both queues plus the descriptor they were mapped from
+	struct io_uring_sq submit_q;
+	struct io_uring_cq completion_q;
+	uint32_t flags;	// zeroed by init_uring's memset and not assigned there afterwards
+	int fd;	// descriptor returned by the io_uring_setup syscall
+};
+
+struct IO_singleton {	// Wrapper holding the single ring used by the benchmark
+	io_ring io;
+};
+
+IO_singleton self;	// NOTE(review): object defined in a header without include guards — safe only while a single translation unit includes it once
+
+void init_uring(uint32_t nentries) {	// Creates an io_uring with `nentries` entries and fills the global `self.io`; aborts the process on any failure
+      // Step 1 : io_uring_setup creates the ring and fills `params` with the queue sizes and offsets
+	struct io_uring_params params;
+	memset(&params, 0, sizeof(params));
+	// params.flags = IORING_SETUP_SQPOLL;
+
+	int fd = syscall(__NR_io_uring_setup, nentries, &params );
+	if(fd < 0) {
+		fprintf(stderr, "KERNEL ERROR: IO_URING SETUP - %s\n", strerror(errno));
+		abort();
+	}
+
+	// Step 2 : map the kernel rings into this process and record them in `self.io`
+	memset(&self.io, 0, sizeof(struct io_ring));
+	struct io_uring_sq & sq = self.io.submit_q;
+	struct io_uring_cq & cq = self.io.completion_q;
+
+	// ring sizes: offset of the trailing array plus the space taken by its entries
+	sq.ring_sz = params.sq_off.array + (params.sq_entries * sizeof(unsigned)           );
+	cq.ring_sz = params.cq_off.cqes  + (params.cq_entries * sizeof(struct io_uring_cqe));
+
+	// Disabled: requires `params.features` (presumably not available in every <linux/io_uring.h>)
+	// // adjust the size according to the parameters
+	// if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
+	// 	cq.ring_sz = sq.ring_sz = max(cq.ring_sz, sq.ring_sz);
+	// }
+
+	// mmap the Submit Queue into existence
+	sq.ring_ptr = mmap(0, sq.ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
+	if (sq.ring_ptr == (void*)MAP_FAILED) {
+		fprintf(stderr, "KERNEL ERROR: IO_URING MMAP1 - %s\n", strerror(errno));
+		abort();
+	}
+
+	// mmap the Completion Queue into existence (may or may not be needed)
+	// Disabled: requires `params.features`, so the separate mapping below is always performed
+	// if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
+	// 	cq.ring_ptr = sq.ring_ptr;
+	// }
+	// else {
+		// The completion ring gets its own mmap call
+		cq.ring_ptr = mmap(0, cq.ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
+		if (cq.ring_ptr == (void*)MAP_FAILED) {
+			munmap(sq.ring_ptr, sq.ring_sz);
+			fprintf(stderr, "KERNEL ERROR: IO_URING MMAP2 - %s\n", strerror(errno));
+			abort();
+		}
+	// }
+
+	// mmap the array of submit queue entries (sqes), a region separate from the two rings
+	size_t size = params.sq_entries * sizeof(struct io_uring_sqe);
+	sq.sqes = (struct io_uring_sqe *)mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
+	if (sq.sqes == (struct io_uring_sqe *)MAP_FAILED) {
+		munmap(sq.ring_ptr, sq.ring_sz);
+		if (cq.ring_ptr != sq.ring_ptr) munmap(cq.ring_ptr, cq.ring_sz);
+		fprintf(stderr, "KERNEL ERROR: IO_URING MMAP3 - %s\n", strerror(errno));
+		abort();
+	}
+
+	// Resolve the kernel-provided offsets into pointers inside the mappings
+	// submit queue
+	sq.head    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.head);
+	sq.tail    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.tail);
+	sq.mask    = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_mask);
+	sq.num     = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_entries);
+	sq.flags   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.flags);
+	sq.dropped = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.dropped);
+	sq.array   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);
+	sq.alloc = *sq.tail;	// the user-space cursor starts at the ring's current tail
+
+	// completion queue
+	cq.head     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.head);
+	cq.tail     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.tail);
+	cq.mask     = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_mask);
+	cq.num      = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_entries);
+	cq.overflow = (         uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.overflow);
+	cq.cqes   = (struct io_uring_cqe *)(((intptr_t)cq.ring_ptr) + params.cq_off.cqes);
+
+	self.io.fd = fd;	// keep the ring descriptor for later use; it is never closed in this function
+}
Index: benchmark/io/readv.cfa
===================================================================
--- benchmark/io/readv.cfa	(revision fc9bb791282f9039edf2fe373d71ff9c77f57c60)
+++ benchmark/io/readv.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -1,2 +1,4 @@
+#define _GNU_SOURCE
+
 #include <stdlib.h>
 #include <stdio.h>
@@ -22,4 +24,6 @@
 extern bool traceHeapOn();
 extern ssize_t cfa_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
+extern ssize_t cfa_preadv2_fixed(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
+extern void register_fixed_files( cluster &, int *, unsigned count );
 
 int fd;
@@ -28,8 +32,18 @@
 
 unsigned long int buflen = 50;
+bool fixed_file = false;
 
 thread __attribute__((aligned(128))) Reader {};
 void ?{}( Reader & this ) {
 	((thread&)this){ "Reader Thread", *the_benchmark_cluster };
+}
+
+int do_read(int fd, struct iovec * iov) {	// Issues one single-iovec read at offset 0 with flags 0 and returns the callee's result
+	if(fixed_file) {	// global switch, set by the -k option
+		return cfa_preadv2_fixed(fd, iov, 1, 0, 0);	// presumably `fd` is an index into the files given to register_fixed_files — confirm
+	}
+	else {
+		return cfa_preadv2(fd, iov, 1, 0, 0);
+	}
 }
 
@@ -42,5 +56,5 @@
 
 	while(__atomic_load_n(&run, __ATOMIC_RELAXED)) {
-		int r = cfa_preadv2(fd, &iov, 1, 0, 0);
+		int r = do_read(fd, &iov);
 		if(r < 0) abort("%s\n", strerror(-r));
 
@@ -52,4 +66,5 @@
 	BENCH_DECL
 	unsigned flags = 0;
+	int file_flags = 0;
 	unsigned sublen = 16;
 
@@ -58,13 +73,16 @@
 		static struct option options[] = {
 			BENCH_OPT_LONG
-			{"bufsize",      required_argument, 0, 'b'},
-			{"userthread",   no_argument      , 0, 'u'},
-			{"submitthread", no_argument      , 0, 's'},
-			{"submitlength", required_argument, 0, 'l'},
+			{"bufsize",       required_argument, 0, 'b'},
+			{"userthread",    no_argument      , 0, 'u'},
+			{"submitthread",  no_argument      , 0, 's'},
+			{"eagersubmit",   no_argument      , 0, 'e'},
+			{"kpollsubmit",   no_argument      , 0, 'k'},
+			{"kpollcomplete", no_argument      , 0, 'i'},
+			{"submitlength",  required_argument, 0, 'l'},
 			{0, 0, 0, 0}
 		};
 
 		int idx = 0;
-		int opt = getopt_long(argc, argv, BENCH_OPT_SHORT "b:usl:", options, &idx);
+		int opt = getopt_long(argc, argv, BENCH_OPT_SHORT "b:usekil:", options, &idx);
 
 		const char * arg = optarg ? optarg : "";
@@ -88,4 +106,15 @@
 				flags |= CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS;
 				break;
+			case 'e':
+				flags |= CFA_CLUSTER_IO_EAGER_SUBMITS;
+				break;
+			case 'k':
+				flags |= CFA_CLUSTER_IO_KERNEL_POLL_SUBMITS;
+				fixed_file = true;
+				break;
+			case 'i':
+				flags |= CFA_CLUSTER_IO_KERNEL_POLL_COMPLETES;
+				file_flags |= O_DIRECT;
+				break;
 			case 'l':
 				sublen = strtoul(arg, &end, 10);
@@ -103,10 +132,14 @@
 				fprintf( stderr, "  -u, --userthread         If set, cluster uses user-thread to poll I/O\n" );
 				fprintf( stderr, "  -s, --submitthread       If set, cluster uses polling thread to submit I/O\n" );
+				fprintf( stderr, "  -e, --eagersubmit        If set, cluster submits I/O eagerly but still aggregates submits\n" );
+				fprintf( stderr, "  -k, --kpollsubmit        If set, cluster uses IORING_SETUP_SQPOLL\n" );
+				fprintf( stderr, "  -i, --kpollcomplete      If set, cluster uses IORING_SETUP_IOPOLL\n" );
+				fprintf( stderr, "  -l, --submitlength=LEN   Max number of submitions that can be submitted together\n" );
 				exit(EXIT_FAILURE);
 		}
 	}
 
-	fd = open(__FILE__, 0);
-	if(fd < 0) {
+	int lfd = open(__FILE__, file_flags);
+	if(lfd < 0) {
 		fprintf(stderr, "Could not open source file\n");
 		exit(EXIT_FAILURE);
@@ -118,4 +151,13 @@
 		Time start, end;
 		BenchCluster cl = { flags, CFA_STATS_READY_Q | CFA_STATS_IO };
+
+		if(fixed_file) {
+			fd = 0;
+			register_fixed_files( cl.self, &lfd, 1 );
+		}
+		else {
+			fd = lfd;
+		}
+
 		{
 			BenchProc procs[nprocs];
@@ -145,4 +187,4 @@
 	}
 
-	close(fd);
+	close(lfd);
 }
Index: libcfa/src/Makefile.am
===================================================================
--- libcfa/src/Makefile.am	(revision fc9bb791282f9039edf2fe373d71ff9c77f57c60)
+++ libcfa/src/Makefile.am	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -40,5 +40,7 @@
 if BUILDLIB
 headers_nosrc = bitmanip.hfa exception.hfa math.hfa gmp.hfa time_t.hfa clock.hfa \
-		bits/align.hfa bits/containers.hfa bits/defs.hfa bits/debug.hfa bits/locks.hfa containers/list.hfa containers/stackLockFree.hfa
+		bits/align.hfa bits/containers.hfa bits/defs.hfa bits/debug.hfa bits/locks.hfa \
+		containers/list.hfa containers/stackLockFree.hfa concurrency/iofwd.hfa
+
 headers = common.hfa fstream.hfa heap.hfa iostream.hfa iterator.hfa limits.hfa rational.hfa \
 		time.hfa stdlib.hfa memory.hfa \
Index: libcfa/src/Makefile.in
===================================================================
--- libcfa/src/Makefile.in	(revision fc9bb791282f9039edf2fe373d71ff9c77f57c60)
+++ libcfa/src/Makefile.in	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -246,8 +246,8 @@
 	bits/align.hfa bits/containers.hfa bits/defs.hfa \
 	bits/debug.hfa bits/locks.hfa containers/list.hfa \
-	containers/stackLockFree.hfa concurrency/coroutine.hfa \
-	concurrency/thread.hfa concurrency/kernel.hfa \
-	concurrency/monitor.hfa concurrency/mutex.hfa \
-	concurrency/invoke.h
+	containers/stackLockFree.hfa concurrency/iofwd.hfa \
+	concurrency/coroutine.hfa concurrency/thread.hfa \
+	concurrency/kernel.hfa concurrency/monitor.hfa \
+	concurrency/mutex.hfa concurrency/invoke.h
 HEADERS = $(nobase_cfa_include_HEADERS)
 am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
@@ -470,5 +470,6 @@
 #----------------------------------------------------------------------------------------------------------------
 @BUILDLIB_TRUE@headers_nosrc = bitmanip.hfa exception.hfa math.hfa gmp.hfa time_t.hfa clock.hfa \
-@BUILDLIB_TRUE@		bits/align.hfa bits/containers.hfa bits/defs.hfa bits/debug.hfa bits/locks.hfa containers/list.hfa containers/stackLockFree.hfa
+@BUILDLIB_TRUE@		bits/align.hfa bits/containers.hfa bits/defs.hfa bits/debug.hfa bits/locks.hfa \
+@BUILDLIB_TRUE@		containers/list.hfa containers/stackLockFree.hfa concurrency/iofwd.hfa
 
 @BUILDLIB_FALSE@headers = 
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision fc9bb791282f9039edf2fe373d71ff9c77f57c60)
+++ libcfa/src/concurrency/io.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -14,6 +14,8 @@
 //
 
-// #define __CFA_DEBUG_PRINT_IO__
-// #define __CFA_DEBUG_PRINT_IO_CORE__
+#if defined(__CFA_DEBUG__)
+	// #define __CFA_DEBUG_PRINT_IO__
+	#define __CFA_DEBUG_PRINT_IO_CORE__
+#endif
 
 #include "kernel.hfa"
@@ -109,4 +111,5 @@
 		volatile uint32_t * head;
 		volatile uint32_t * tail;
+		volatile uint32_t prev_head;
 
 		// The actual kernel ring which uses head/tail
@@ -129,4 +132,5 @@
 
 		__spinlock_t lock;
+		__spinlock_t release_lock;
 
 		// A buffer of sqes (not the actual ring)
@@ -182,4 +186,8 @@
 //=============================================================================================
 	void __kernel_io_startup( cluster & this, unsigned io_flags, bool main_cluster ) {
+		if( (io_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS) && (io_flags & CFA_CLUSTER_IO_EAGER_SUBMITS) ) {
+			abort("CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS and CFA_CLUSTER_IO_EAGER_SUBMITS cannot be mixed\n");
+		}
+
 		this.io = malloc();
 
@@ -187,4 +195,6 @@
 		struct io_uring_params params;
 		memset(&params, 0, sizeof(params));
+		if( io_flags & CFA_CLUSTER_IO_KERNEL_POLL_SUBMITS   ) params.flags |= IORING_SETUP_SQPOLL;
+		if( io_flags & CFA_CLUSTER_IO_KERNEL_POLL_COMPLETES ) params.flags |= IORING_SETUP_IOPOLL;
 
 		uint32_t nentries = entries_per_cluster();
@@ -208,5 +218,5 @@
 			// adjust the size according to the parameters
 			if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
-				cq->ring_sz = sq->ring_sz = max(cq->ring_sz, sq->ring_sz);
+				cq.ring_sz = sq.ring_sz = max(cq.ring_sz, sq.ring_sz);
 			}
 		#endif
@@ -222,5 +232,5 @@
 			// mmap the Completion Queue into existence (may or may not be needed)
 			if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
-				cq->ring_ptr = sq->ring_ptr;
+				cq.ring_ptr = sq.ring_ptr;
 			}
 			else
@@ -253,4 +263,5 @@
 		sq.dropped = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.dropped);
 		sq.array   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);
+		sq.prev_head = *sq.head;
 
 		{
@@ -261,5 +272,8 @@
 		}
 
-		if( io_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
+		(sq.lock){};
+		(sq.release_lock){};
+
+		if( io_flags & ( CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS | CFA_CLUSTER_IO_EAGER_SUBMITS ) ) {
 			/* paranoid */ verify( is_pow2( io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET ) || ((io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET) < 8)  );
 			sq.ready_cnt = max(io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET, 8);
@@ -313,5 +327,5 @@
 
 		// Create the poller thread
-		__cfadbg_print_safe(io_core, "Kernel I/O : Creating slow poller for cluter %p\n", &this);
+		__cfadbg_print_safe(io_core, "Kernel I/O : Creating slow poller for cluster %p\n", &this);
 		this.io->poller.slow.blocked = false;
 		this.io->poller.slow.stack = __create_pthread( &this.io->poller.slow.kthrd, __io_poller_slow, &this );
@@ -418,66 +432,91 @@
 	}
 
+	int __io_uring_enter( struct __io_data & ring, unsigned to_submit, bool get, sigset_t * mask ) {	// Calls io_uring_enter only when needed; returns 0 if skipped, the syscall result on success, -1 on EAGAIN/EINTR, and aborts on any other error
+		bool need_sys_to_submit = false;
+		bool need_sys_to_complete = false;
+		unsigned min_complete = 0;
+		unsigned flags = 0;
+
+
+		TO_SUBMIT:
+		if( to_submit > 0 ) {
+			if( !(ring.ring_flags & IORING_SETUP_SQPOLL) ) {	// ring not set up with SQPOLL: pending entries always need the syscall
+				need_sys_to_submit = true;
+				break TO_SUBMIT;	// labelled break: leave the TO_SUBMIT statement
+			}
+			if( (*ring.submit_q.flags) & IORING_SQ_NEED_WAKEUP ) {	// SQPOLL ring flagged NEED_WAKEUP: enter with the wakeup flag
+				need_sys_to_submit = true;
+				flags |= IORING_ENTER_SQ_WAKEUP;
+			}
+		}
+
+		TO_COMPLETE:
+		if( get && !(ring.ring_flags & IORING_SETUP_SQPOLL) ) {	// NOTE(review): completion handling is skipped entirely for SQPOLL rings — confirm this is intended
+			flags |= IORING_ENTER_GETEVENTS;
+			if( mask ) {	// a signal mask was supplied: enter the kernel and ask for at least one completion
+				need_sys_to_complete = true;
+				min_complete = 1;
+				break TO_COMPLETE;
+			}
+			if( (ring.ring_flags & IORING_SETUP_IOPOLL) ) {	// IOPOLL ring: the syscall is made even with min_complete == 0
+				need_sys_to_complete = true;
+			}
+		}
+
+		int ret = 0;
+		if( need_sys_to_submit || need_sys_to_complete ) {
+			ret = syscall( __NR_io_uring_enter, ring.fd, to_submit, min_complete, flags, mask, _NSIG / 8);
+			if( ret < 0 ) {
+				switch((int)errno) {
+				case EAGAIN:
+				case EINTR:
+					ret = -1;	// transient failure: reported to the caller instead of aborting
+					break;
+				default:
+					abort( "KERNEL ERROR: IO_URING SYSCALL - (%d) %s\n", (int)errno, strerror(errno) );
+				}
+			}
+		}
+
+		// Memory barrier (sequentially consistent fence, executed whether or not the syscall was made)
+		__atomic_thread_fence( __ATOMIC_SEQ_CST );
+		return ret;
+	}
+
 //=============================================================================================
 // I/O Polling
 //=============================================================================================
+	static unsigned __collect_submitions( struct __io_data & ring );
+	static uint32_t __release_consumed_submission( struct __io_data & ring );
+
 	// Process a single completion message from the io_uring
 	// This is NOT thread-safe
-	static [int, bool] __drain_io( & struct __io_data ring, * sigset_t mask, int waitcnt, bool in_kernel ) {
+	static [int, bool] __drain_io( & struct __io_data ring, * sigset_t mask ) {
+		/* paranoid */ verify( !kernelTLS.preemption_state.enabled );
+
 		unsigned to_submit = 0;
 		if( ring.cltr_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
-
 			// If the poller thread also submits, then we need to aggregate the submissions which are ready
-			uint32_t tail = *ring.submit_q.tail;
-			const uint32_t mask = *ring.submit_q.mask;
-
-			// Go through the list of ready submissions
-			for( i; ring.submit_q.ready_cnt ) {
-				// replace any submission with the sentinel, to consume it.
-				uint32_t idx = __atomic_exchange_n( &ring.submit_q.ready[i], -1ul32, __ATOMIC_RELAXED);
-
-				// If it was already the sentinel, then we are done
-				if( idx == -1ul32 ) continue;
-
-				// If we got a real submission, append it to the list
-				ring.submit_q.array[ (tail + to_submit) & mask ] = idx & mask;
-				to_submit++;
-			}
-
-			// Increment the tail based on how many we are ready to submit
-			__atomic_fetch_add(ring.submit_q.tail, to_submit, __ATOMIC_SEQ_CST);
-		}
-
-		const uint32_t smask = *ring.submit_q.mask;
-		uint32_t shead = *ring.submit_q.head;
-		int ret = syscall( __NR_io_uring_enter, ring.fd, to_submit, waitcnt, IORING_ENTER_GETEVENTS, mask, _NSIG / 8);
+			to_submit = __collect_submitions( ring );
+		}
+
+		int ret = __io_uring_enter(ring, to_submit, true, mask);
 		if( ret < 0 ) {
-			switch((int)errno) {
-			case EAGAIN:
-			case EINTR:
-				return -EAGAIN;
-			default:
-				abort( "KERNEL ERROR: IO_URING WAIT - %s\n", strerror(errno) );
-			}
+			return [0, true];
+		}
+
+		// update statistics
+		if (to_submit > 0) {
+			__STATS__( true,
+				if( to_submit > 0 ) {
+					io.submit_q.submit_avg.rdy += to_submit;
+					io.submit_q.submit_avg.csm += ret;
+					io.submit_q.submit_avg.cnt += 1;
+				}
+			)
 		}
 
 		// Release the consumed SQEs
-		for( i; ret ) {
-			uint32_t idx = ring.submit_q.array[ (i + shead) & smask ];
-			ring.submit_q.sqes[ idx ].user_data = 0;
-		}
-
-		uint32_t avail = 0;
-		uint32_t sqe_num = *ring.submit_q.num;
-		for(i; sqe_num) {
-			if( ring.submit_q.sqes[ i ].user_data == 0 ) avail++;
-		}
-
-		// update statistics
-		#if !defined(__CFA_NO_STATISTICS__)
-			__tls_stats()->io.submit_q.submit_avg.rdy += to_submit;
-			__tls_stats()->io.submit_q.submit_avg.csm += ret;
-			__tls_stats()->io.submit_q.submit_avg.avl += avail;
-			__tls_stats()->io.submit_q.submit_avg.cnt += 1;
-		#endif
+		__release_consumed_submission( ring );
 
 		// Drain the queue
@@ -486,13 +525,11 @@
 		const uint32_t mask = *ring.completion_q.mask;
 
-		// Memory barrier
-		__atomic_thread_fence( __ATOMIC_SEQ_CST );
-
 		// Nothing was new return 0
 		if (head == tail) {
-			return 0;
+			return [0, to_submit > 0];
 		}
 
 		uint32_t count = tail - head;
+		/* paranoid */ verify( count != 0 );
 		for(i; count) {
 			unsigned idx = (head + i) & mask;
@@ -505,6 +542,6 @@
 
 			data->result = cqe.res;
-			if(!in_kernel) { unpark( data->thrd __cfaabi_dbg_ctx2 ); }
-			else         { __unpark( &ring.poller.slow.id, data->thrd __cfaabi_dbg_ctx2 ); }
+			if(!mask) { unpark( data->thrd __cfaabi_dbg_ctx2 ); }
+			else      { __unpark( &ring.poller.slow.id, data->thrd __cfaabi_dbg_ctx2 ); }
 		}
 
@@ -554,13 +591,13 @@
 				int count;
 				bool again;
-				[count, again] = __drain_io( ring, &mask, 1, true );
+				[count, again] = __drain_io( ring, &mask );
 
 				__atomic_store_n( &ring.poller.slow.blocked, false, __ATOMIC_SEQ_CST );
 
 				// Update statistics
-				#if !defined(__CFA_NO_STATISTICS__)
-					__tls_stats()->io.complete_q.completed_avg.val += count;
-					__tls_stats()->io.complete_q.completed_avg.slow_cnt += 1;
-				#endif
+				__STATS__( true,
+					io.complete_q.completed_avg.val += count;
+					io.complete_q.completed_avg.slow_cnt += 1;
+				)
 
 				if(again) {
@@ -576,11 +613,11 @@
 				int count;
 				bool again;
-				[count, again] = __drain_io( ring, &mask, 0, true );
+				[count, again] = __drain_io( ring, &mask );
 
 				// Update statistics
-				#if !defined(__CFA_NO_STATISTICS__)
-					__tls_stats()->io.complete_q.completed_avg.val += count;
-					__tls_stats()->io.complete_q.completed_avg.slow_cnt += 1;
-				#endif
+				__STATS__( true,
+					io.complete_q.completed_avg.val += count;
+					io.complete_q.completed_avg.slow_cnt += 1;
+				)
 			}
 		}
@@ -614,13 +651,13 @@
 			bool again;
 			disable_interrupts();
-				[count, again] = __drain_io( *this.ring, 0p, 0, false );
+				[count, again] = __drain_io( *this.ring, 0p );
 
 				if(!again) reset++;
 
 				// Update statistics
-				#if !defined(__CFA_NO_STATISTICS__)
-					__tls_stats()->io.complete_q.completed_avg.val += count;
-					__tls_stats()->io.complete_q.completed_avg.fast_cnt += 1;
-				#endif
+				__STATS__( true,
+					io.complete_q.completed_avg.val += count;
+					io.complete_q.completed_avg.fast_cnt += 1;
+				)
 			enable_interrupts( __cfaabi_dbg_ctx );
 
@@ -658,20 +695,12 @@
 
 // Submition steps :
-// 1 - We need to make sure we don't overflow any of the buffer, P(ring.submit) to make sure
-//     entries are available. The semaphore make sure that there is no more operations in
-//     progress then the number of entries in the buffer. This probably limits concurrency
-//     more than necessary since submitted but not completed operations don't need any
-//     entries in user space. However, I don't know what happens if we overflow the buffers
-//     because too many requests completed at once. This is a safe approach in all cases.
-//     Furthermore, with hundreds of entries, this may be okay.
-//
-// 2 - Allocate a queue entry. The ring already has memory for all entries but only the ones
+// 1 - Allocate a queue entry. The ring already has memory for all entries but only the ones
 //     listed in sq.array are visible by the kernel. For those not listed, the kernel does not
 //     offer any assurance that an entry is not being filled by multiple flags. Therefore, we
 //     need to write an allocator that allows allocating concurrently.
 //
-// 3 - Actually fill the submit entry, this is the only simple and straightforward step.
+// 2 - Actually fill the submit entry, this is the only simple and straightforward step.
 //
-// 4 - Append the entry index to the array and adjust the tail accordingly. This operation
+// 3 - Append the entry index to the array and adjust the tail accordingly. This operation
 //     needs to arrive to two concensus at the same time:
 //     A - The order in which entries are listed in the array: no two threads must pick the
@@ -682,6 +711,5 @@
 
 	[* struct io_uring_sqe, uint32_t] __submit_alloc( struct __io_data & ring, uint64_t data ) {
-		verify( data != 0 );
-
+		/* paranoid */ verify( data != 0 );
 
 		// Prepare the data we need
@@ -708,11 +736,9 @@
 				{
 					// update statistics
-					#if !defined(__CFA_NO_STATISTICS__)
-						disable_interrupts();
-							__tls_stats()->io.submit_q.alloc_avg.val   += len;
-							__tls_stats()->io.submit_q.alloc_avg.block += block;
-							__tls_stats()->io.submit_q.alloc_avg.cnt   += 1;
-						enable_interrupts( __cfaabi_dbg_ctx );
-					#endif
+					__STATS__( false,
+						io.submit_q.alloc_avg.val   += len;
+						io.submit_q.alloc_avg.block += block;
+						io.submit_q.alloc_avg.cnt   += 1;
+					)
 
 
@@ -757,15 +783,19 @@
 
 			block++;
-			yield();
+			if( try_lock(ring.submit_q.lock __cfaabi_dbg_ctx2) ) {
+				__release_consumed_submission( ring );
+				unlock( ring.submit_q.lock );
+			}
+			else {
+				yield();
+			}
 		}
 
 		// update statistics
-		#if !defined(__CFA_NO_STATISTICS__)
-		disable_interrupts();
-			__tls_stats()->io.submit_q.look_avg.val   += len;
-			__tls_stats()->io.submit_q.look_avg.block += block;
-			__tls_stats()->io.submit_q.look_avg.cnt   += 1;
-		enable_interrupts( __cfaabi_dbg_ctx );
-		#endif
+		__STATS__( false,
+			io.submit_q.look_avg.val   += len;
+			io.submit_q.look_avg.block += block;
+			io.submit_q.look_avg.cnt   += 1;
+		)
 
 		return picked;
@@ -780,5 +810,4 @@
 		if( ring.cltr_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
 			// If the poller thread submits, then we just need to add this to the ready array
-
 			__submit_to_ready_array( ring, idx, mask );
 
@@ -786,4 +815,53 @@
 
 			__cfadbg_print_safe( io, "Kernel I/O : Added %u to ready for %p\n", idx, active_thread() );
+		}
+		else if( ring.cltr_flags & CFA_CLUSTER_IO_EAGER_SUBMITS ) {
+			uint32_t picked = __submit_to_ready_array( ring, idx, mask );
+
+			for() {
+				yield();
+
+				// If some one else collected our index, we are done
+				#warning ABA problem
+				if( ring.submit_q.ready[picked] != idx ) {
+					__STATS__( false,
+						io.submit_q.helped += 1;
+					)
+					return;
+				}
+
+				if( try_lock(ring.submit_q.lock __cfaabi_dbg_ctx2) ) {
+					__STATS__( false,
+						io.submit_q.leader += 1;
+					)
+					break;
+				}
+
+				__STATS__( false,
+					io.submit_q.busy += 1;
+				)
+			}
+
+			// We got the lock
+			unsigned to_submit = __collect_submitions( ring );
+			int ret = __io_uring_enter( ring, to_submit, false, 0p );
+			if( ret < 0 ) {
+				unlock(ring.submit_q.lock);
+				return;
+			}
+
+			/* paranoid */ verify( ret > 0 || (ring.ring_flags & IORING_SETUP_SQPOLL) );
+
+			// Release the consumed SQEs
+			__release_consumed_submission( ring );
+
+			// update statistics
+			__STATS__( true,
+				io.submit_q.submit_avg.rdy += to_submit;
+				io.submit_q.submit_avg.csm += ret;
+				io.submit_q.submit_avg.cnt += 1;
+			)
+
+			unlock(ring.submit_q.lock);
 		}
 		else {
@@ -791,13 +869,24 @@
 			lock(ring.submit_q.lock __cfaabi_dbg_ctx2);
 
+			/* paranoid */ verifyf( ring.submit_q.sqes[ idx ].user_data != 0,
+			/* paranoid */ 	"index %u already reclaimed\n"
+			/* paranoid */ 	"head %u, prev %u, tail %u\n"
+			/* paranoid */ 	"[-0: %u,-1: %u,-2: %u,-3: %u]\n",
+			/* paranoid */ 	idx,
+			/* paranoid */ 	*ring.submit_q.head, ring.submit_q.prev_head, *tail
+			/* paranoid */ 	,ring.submit_q.array[ ((*ring.submit_q.head) - 0) & (*ring.submit_q.mask) ]
+			/* paranoid */ 	,ring.submit_q.array[ ((*ring.submit_q.head) - 1) & (*ring.submit_q.mask) ]
+			/* paranoid */ 	,ring.submit_q.array[ ((*ring.submit_q.head) - 2) & (*ring.submit_q.mask) ]
+			/* paranoid */ 	,ring.submit_q.array[ ((*ring.submit_q.head) - 3) & (*ring.submit_q.mask) ]
+			/* paranoid */ );
+
 			// Append to the list of ready entries
 
 			/* paranoid */ verify( idx <= mask );
-
-			ring.submit_q.array[ (*tail) & mask ] = idx & mask;
+			ring.submit_q.array[ (*tail) & mask ] = idx;
 			__atomic_fetch_add(tail, 1ul32, __ATOMIC_SEQ_CST);
 
 			// Submit however, many entries need to be submitted
-			int ret = syscall( __NR_io_uring_enter, ring.fd, 1, 0, 0, 0p, 0);
+			int ret = __io_uring_enter( ring, 1, false, 0p );
 			if( ret < 0 ) {
 				switch((int)errno) {
@@ -808,10 +897,11 @@
 
 			// update statistics
-			#if !defined(__CFA_NO_STATISTICS__)
-				__tls_stats()->io.submit_q.submit_avg.csm += 1;
-				__tls_stats()->io.submit_q.submit_avg.cnt += 1;
-			#endif
-
-			ring.submit_q.sqes[ idx & mask ].user_data = 0;
+			__STATS__( false,
+				io.submit_q.submit_avg.csm += 1;
+				io.submit_q.submit_avg.cnt += 1;
+			)
+
+			// Release the consumed SQEs
+			__release_consumed_submission( ring );
 
 			unlock(ring.submit_q.lock);
@@ -820,3 +910,60 @@
 		}
 	}
+
+	static unsigned __collect_submitions( struct __io_data & ring ) { // Move every pending index from submit_q.ready into submit_q.array and advance the tail; returns how many were moved
+		/* paranoid */ verify( ring.submit_q.ready != 0p );
+		/* paranoid */ verify( ring.submit_q.ready_cnt > 0 );
+
+		unsigned to_submit = 0;
+		uint32_t tail = *ring.submit_q.tail; // plain snapshot of the tail; NOTE(review): the visible caller holds submit_q.lock, presumably this read relies on it - confirm
+		const uint32_t mask = *ring.submit_q.mask;
+
+		// Scan every slot of the ready array
+		for( i; ring.submit_q.ready_cnt ) {
+			// Swap the slot's content with the sentinel (-1), which reads and consumes it in one step.
+			uint32_t idx = __atomic_exchange_n( &ring.submit_q.ready[i], -1ul32, __ATOMIC_RELAXED);
+
+			// If the slot already held the sentinel, there is nothing to collect from it: skip to the next slot
+			if( idx == -1ul32 ) continue;
+
+			// Otherwise we took a real submission, append it right after the ones collected so far
+			ring.submit_q.array[ (tail + to_submit) & mask ] = idx & mask;
+			to_submit++;
+		}
+
+		// Advance the shared tail once, by the number of entries appended above
+		__atomic_fetch_add(ring.submit_q.tail, to_submit, __ATOMIC_SEQ_CST);
+
+		return to_submit;
+	}
+
+	static uint32_t __release_consumed_submission( struct __io_data & ring ) { // Zero user_data of every sqe listed between prev_head and the current head; returns how many were released
+		const uint32_t smask = *ring.submit_q.mask;
+
+		if( !try_lock(ring.submit_q.release_lock __cfaabi_dbg_ctx2) ) return 0; // non-blocking: if the release lock is already taken, give up and report nothing released
+		uint32_t chead = *ring.submit_q.head; // current head
+		uint32_t phead = ring.submit_q.prev_head; // head value recorded by the previous release
+		ring.submit_q.prev_head = chead; // claim the range [phead, chead) before dropping the lock
+		unlock(ring.submit_q.release_lock);
+
+		uint32_t count = chead - phead; // unsigned difference, still correct if head wrapped around
+		for( i; count ) {
+			uint32_t idx = ring.submit_q.array[ (phead + i) & smask ]; // sqe index stored in that ring slot
+			ring.submit_q.sqes[ idx ].user_data = 0; // user_data == 0 is what the paranoid checks in this file report as "already reclaimed"
+		}
+		return count;
+	}
+
+//=============================================================================================
+// I/O Submissions
+//=============================================================================================
+
+	void register_fixed_files( cluster & cl, int * files, unsigned count ) { // Issue IORING_REGISTER_FILES for `count` entries of `files` on the io_uring fd of cluster `cl`; aborts on failure
+		int ret = syscall( __NR_io_uring_register, cl.io->fd, IORING_REGISTER_FILES, files, count ); // raw syscall, uses the ring fd stored in cl.io
+		if( ret < 0 ) {
+			abort( "KERNEL ERROR: IO_URING SYSCALL - (%d) %s\n", (int)errno, strerror(errno) ); // every negative return is treated as fatal
+		}
+
+		__cfadbg_print_safe( io_core, "Kernel I/O : Performed io_register for %p, returned %d\n", active_thread(), ret );
+	}
 #endif
Index: libcfa/src/concurrency/iocall.cfa
===================================================================
--- libcfa/src/concurrency/iocall.cfa	(revision fc9bb791282f9039edf2fe373d71ff9c77f57c60)
+++ libcfa/src/concurrency/iocall.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -108,4 +108,7 @@
 
 	extern ssize_t read (int fd, void *buf, size_t count);
+
+	extern ssize_t splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags);
+	extern ssize_t tee(int fd_in, int fd_out, size_t len, unsigned int flags);
 }
 
@@ -128,4 +131,17 @@
 		#endif
 	}
+
+	ssize_t cfa_preadv2_fixed(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags) { // preadv2 variant that sets IOSQE_FIXED_FILE on the request when io_uring READV is available
+		#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_READV)
+			return preadv2(fd, iov, iovcnt, offset, flags); // no io_uring READV support at build time: plain system call
+		#else
+			__submit_prelude // macro defined elsewhere; presumably allocates the `sqe` used below - confirm
+
+			(*sqe){ IORING_OP_READV, fd, iov, iovcnt, offset }; // NOTE(review): `flags` is not forwarded on this path - confirm this is intended
+			sqe->flags |= IOSQE_FIXED_FILE; // NOTE(review): presumably makes `fd` an index into the registered files, while the fallback branch passes it as a regular descriptor - confirm
+
+			__submit_wait // macro defined elsewhere; presumably submits the request and yields the function's result - confirm
+		#endif
+	}
 #endif
 
@@ -329,5 +345,4 @@
 }
 
-
 ssize_t cfa_read(int fd, void *buf, size_t count) {
 	#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_READ)
@@ -349,4 +364,33 @@
 
 		(*sqe){ IORING_OP_WRITE, fd, buf, count, 0 };
+
+		__submit_wait
+	#endif
+}
+
+ssize_t cfa_splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags) { // splice() wrapper: uses IORING_OP_SPLICE when available at build time, the plain system call otherwise
+	#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_SPLICE)
+		return splice( fd_in, off_in, fd_out, off_out, len, flags ); // no io_uring SPLICE support at build time
+	#else
+		__submit_prelude // macro defined elsewhere; presumably allocates the `sqe` used below - confirm
+
+		(*sqe){ IORING_OP_SPLICE, fd_out, 0p, len, off_out }; // the output side goes through the generic fd/len/offset fields
+		sqe->splice_fd_in  = fd_in; // the input side uses the splice-specific fields
+		sqe->splice_off_in = off_in; // NOTE(review): off_in/off_out are pointers here; verify the sqe offset fields are meant to receive pointers rather than offset values
+		sqe->splice_flags  = flags;
+
+		__submit_wait // macro defined elsewhere; presumably submits the request and yields the function's result - confirm
+	#endif
+}
+
+ssize_t cfa_tee(int fd_in, int fd_out, size_t len, unsigned int flags) {
+	#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_TEE)
+		return tee( fd_in, fd_out, len, flags );
+	#else
+		__submit_prelude
+
+		(*sqe){ IORING_OP_TEE, fd_out, 0p, len, 0 };
+		sqe->splice_fd_in = fd_in;
+		sqe->splice_flags = flags;
 
 		__submit_wait
@@ -453,4 +497,14 @@
 			#define _CFA_IO_FEATURE_IORING_OP_WRITE ,
 			return IS_DEFINED(IORING_OP_WRITE);
+
+		if( /*func == (fptr_t)splice || */
+			func == (fptr_t)cfa_splice )
+			#define _CFA_IO_FEATURE_IORING_OP_SPLICE ,
+			return IS_DEFINED(IORING_OP_SPLICE);
+
+		if( /*func == (fptr_t)tee || */
+			func == (fptr_t)cfa_tee )
+			#define _CFA_IO_FEATURE_IORING_OP_TEE ,
+			return IS_DEFINED(IORING_OP_TEE);
 	#endif
 
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision fc9bb791282f9039edf2fe373d71ff9c77f57c60)
+++ libcfa/src/concurrency/kernel.hfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -129,7 +129,9 @@
 struct __io_data;
 
-#define CFA_CLUSTER_IO_POLLER_USER_THREAD    1 << 0 // 0x1
-#define CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS 1 << 1 // 0x2
-// #define CFA_CLUSTER_IO_POLLER_KERNEL_SIDE 1 << 2 // 0x4
+#define CFA_CLUSTER_IO_POLLER_USER_THREAD    (1 << 0) // 0x01
+#define CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS (1 << 1) // 0x02
+#define CFA_CLUSTER_IO_EAGER_SUBMITS         (1 << 2) // 0x04
+#define CFA_CLUSTER_IO_KERNEL_POLL_SUBMITS   (1 << 3) // 0x08
+#define CFA_CLUSTER_IO_KERNEL_POLL_COMPLETES (1 << 4) // 0x10
 #define CFA_CLUSTER_IO_BUFFLEN_OFFSET        16
 
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision fc9bb791282f9039edf2fe373d71ff9c77f57c60)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -286,9 +286,19 @@
 // Statics call at the end of each thread to register statistics
 #if !defined(__CFA_NO_STATISTICS__)
-static inline struct __stats_t * __tls_stats() {
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-	/* paranoid */ verify( kernelTLS.this_stats );
-	return kernelTLS.this_stats;
-}
+	static inline struct __stats_t * __tls_stats() {
+		/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+		/* paranoid */ verify( kernelTLS.this_stats );
+		return kernelTLS.this_stats;
+	}
+
+	#define __STATS__(in_kernel, ...) { /* run the given statements on this thread's statistics; expands to a braced block, call sites add no ';' */ \
+		if( !(in_kernel) ) disable_interrupts(); /* __tls_stats() verifies that preemption is disabled */ \
+		with( *__tls_stats() ) { /* opens the stats structure so the statements can name fields directly, e.g. io.submit_q.busy */ \
+			__VA_ARGS__ \
+		} \
+		if( !(in_kernel) ) enable_interrupts( __cfaabi_dbg_ctx ); /* only undo what this macro disabled itself */ \
+	}
+#else
+	#define __STATS__(in_kernel, ...)
 #endif
 
Index: libcfa/src/concurrency/preemption.cfa
===================================================================
--- libcfa/src/concurrency/preemption.cfa	(revision fc9bb791282f9039edf2fe373d71ff9c77f57c60)
+++ libcfa/src/concurrency/preemption.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -186,4 +186,5 @@
 	void enable_interrupts( __cfaabi_dbg_ctx_param ) {
 		processor   * proc = kernelTLS.this_processor; // Cache the processor now since interrupts can start happening after the atomic store
+		/* paranoid */ verify( proc );
 
 		with( kernelTLS.preemption_state ){
Index: libcfa/src/concurrency/stats.cfa
===================================================================
--- libcfa/src/concurrency/stats.cfa	(revision fc9bb791282f9039edf2fe373d71ff9c77f57c60)
+++ libcfa/src/concurrency/stats.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -27,5 +27,4 @@
 			stats->io.submit_q.submit_avg.rdy = 0;
 			stats->io.submit_q.submit_avg.csm = 0;
-			stats->io.submit_q.submit_avg.avl = 0;
 			stats->io.submit_q.submit_avg.cnt = 0;
 			stats->io.submit_q.look_avg.val   = 0;
@@ -35,4 +34,7 @@
 			stats->io.submit_q.alloc_avg.cnt   = 0;
 			stats->io.submit_q.alloc_avg.block = 0;
+			stats->io.submit_q.helped = 0;
+			stats->io.submit_q.leader = 0;
+			stats->io.submit_q.busy   = 0;
 			stats->io.complete_q.completed_avg.val = 0;
 			stats->io.complete_q.completed_avg.slow_cnt = 0;
@@ -68,4 +70,7 @@
 			__atomic_fetch_add( &cltr->io.submit_q.alloc_avg.cnt           , proc->io.submit_q.alloc_avg.cnt           , __ATOMIC_SEQ_CST );
 			__atomic_fetch_add( &cltr->io.submit_q.alloc_avg.block         , proc->io.submit_q.alloc_avg.block         , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.helped                  , proc->io.submit_q.helped                  , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.leader                  , proc->io.submit_q.leader                  , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.busy                    , proc->io.submit_q.busy                    , __ATOMIC_SEQ_CST );
 			__atomic_fetch_add( &cltr->io.complete_q.completed_avg.val     , proc->io.complete_q.completed_avg.val     , __ATOMIC_SEQ_CST );
 			__atomic_fetch_add( &cltr->io.complete_q.completed_avg.slow_cnt, proc->io.complete_q.completed_avg.slow_cnt, __ATOMIC_SEQ_CST );
@@ -120,5 +125,4 @@
 				double avgrdy = ((double)io.submit_q.submit_avg.rdy) / io.submit_q.submit_avg.cnt;
 				double avgcsm = ((double)io.submit_q.submit_avg.csm) / io.submit_q.submit_avg.cnt;
-				double avgavl = ((double)io.submit_q.submit_avg.avl) / io.submit_q.submit_avg.cnt;
 
 				double lavgv = 0;
@@ -141,5 +145,7 @@
 					"- avg ready entries      : %'18.2lf\n"
 					"- avg submitted entries  : %'18.2lf\n"
-					"- avg available entries  : %'18.2lf\n"
+					"- total helped entries   : %'15" PRIu64 "\n"
+					"- total leader entries   : %'15" PRIu64 "\n"
+					"- total busy submit      : %'15" PRIu64 "\n"
 					"- total ready search     : %'15" PRIu64 "\n"
 					"- avg ready search len   : %'18.2lf\n"
@@ -153,5 +159,6 @@
 					, cluster ? "Cluster" : "Processor",  name, id
 					, io.submit_q.submit_avg.cnt
-					, avgrdy, avgcsm, avgavl
+					, avgrdy, avgcsm
+					, io.submit_q.helped, io.submit_q.leader, io.submit_q.busy
 					, io.submit_q.look_avg.cnt
 					, lavgv, lavgb
Index: libcfa/src/concurrency/stats.hfa
===================================================================
--- libcfa/src/concurrency/stats.hfa	(revision fc9bb791282f9039edf2fe373d71ff9c77f57c60)
+++ libcfa/src/concurrency/stats.hfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -83,4 +83,7 @@
 					volatile uint64_t block;
 				} alloc_avg;
+				volatile uint64_t helped;
+				volatile uint64_t leader;
+				volatile uint64_t busy;
 			} submit_q;
 			struct {
Index: src/Makefile.in
===================================================================
--- src/Makefile.in	(revision fc9bb791282f9039edf2fe373d71ff9c77f57c60)
+++ src/Makefile.in	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -626,5 +626,5 @@
 
 BUILT_SOURCES = Parser/parser.hh
-AM_YFLAGS = -d -t -v
+AM_YFLAGS = -d -t -v -Wno-yacc
 SRC_RESOLVEXPR = \
       ResolvExpr/AdjustExprType.cc \
Index: src/Parser/module.mk
===================================================================
--- src/Parser/module.mk	(revision fc9bb791282f9039edf2fe373d71ff9c77f57c60)
+++ src/Parser/module.mk	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -17,5 +17,5 @@
 BUILT_SOURCES = Parser/parser.hh
 
-AM_YFLAGS = -d -t -v
+AM_YFLAGS = -d -t -v -Wno-yacc
 
 SRC += \
Index: tests/bugs/.expect/10.txt
===================================================================
--- tests/bugs/.expect/10.txt	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
+++ tests/bugs/.expect/10.txt	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -0,0 +1,1 @@
+bugs/10.cfa:10:53: error: ‘_sizeof_Y1T’ undeclared here (not in a function)
Index: tests/bugs/.expect/194.txt
===================================================================
--- tests/bugs/.expect/194.txt	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
+++ tests/bugs/.expect/194.txt	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -0,0 +1,40 @@
+bugs/194.cfa:13:1 error: Cannot choose between 2 alternatives for expression
+Applying untyped:
+  Name: foo
+...to:
+ Alternatives are:
+Cost ( 0, 0, 0, 0, 1, 0, 0 ): Application of
+      Variable Expression: foo: forall
+        T: sized data type
+        function
+      ... returning
+        _retval_foo: reference to instance of type T (not function type)
+        ... with attributes:
+          Attribute with name: unused
+
+
+
+  (types:
+    reference to instance of type _79_5_T (not function type)
+  )
+  Environment: -> pointer to signed int
+
+
+Cost ( 0, 0, 0, 0, 1, 0, 0 ): Application of
+      Variable Expression: foo: forall
+        T: sized data type
+        function
+      ... returning
+        _retval_foo: pointer to instance of type T (not function type)
+        ... with attributes:
+          Attribute with name: unused
+
+
+
+  (types:
+    pointer to instance of type _79_6_T (not function type)
+  )
+  Environment: -> signed int (no widening)
+
+
+
Index: tests/bugs/.expect/20.txt
===================================================================
--- tests/bugs/.expect/20.txt	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
+++ tests/bugs/.expect/20.txt	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -0,0 +1,6 @@
+error: No reasonable alternatives for expression Untyped Init Expression
+  constant expression (0 0: zero_t)  InitAlternative: instance of struct foo with body 1
+bugs/20.cfa:15:1 error: No reasonable alternatives for expression Applying untyped:
+  Name: bar
+...to:
+
Index: tests/bugs/.expect/44.txt
===================================================================
--- tests/bugs/.expect/44.txt	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
+++ tests/bugs/.expect/44.txt	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -0,0 +1,6 @@
+bugs/44.cfa:9:1 error: No reasonable alternatives for expression Applying untyped:
+  Name: ?=?
+...to:
+  Name: x
+  Name: f
+
Index: tests/bugs/.expect/46.txt
===================================================================
--- tests/bugs/.expect/46.txt	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
+++ tests/bugs/.expect/46.txt	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -0,0 +1,4 @@
+bugs/46.cfa: In function ‘_X4mainFi___1’:
+bugs/46.cfa:10:100: error: lvalue required as unary ‘&’ operand
+   3 ? x[2] : val;
+                                                                                                    ^
Index: tests/bugs/.expect/5.txt
===================================================================
--- tests/bugs/.expect/5.txt	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
+++ tests/bugs/.expect/5.txt	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -0,0 +1,8 @@
+bugs/5.cfa: In function ‘_X4mainFi___1’:
+bugs/5.cfa:8:43: error: ‘_unq0_finished_’ undeclared (first use in this function)
+   foo(bar());
+                                           ^              
+bugs/5.cfa:8:43: note: each undeclared identifier is reported only once for each function it appears in
+bugs/5.cfa:8:225: warning: left-hand operand of comma expression has no effect [-Wunused-value]
+   foo(bar());
+                                                                                                                                                                                                                                 ^
Index: tests/bugs/.expect/66.txt
===================================================================
--- tests/bugs/.expect/66.txt	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
+++ tests/bugs/.expect/66.txt	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -0,0 +1,69 @@
+bugs/66.cfa:11:1 error: Cannot choose between 2 alternatives for expression
+Applying untyped:
+  Name: ?!=?
+...to:
+  Name: next
+  constant expression (0 0: zero_t)
+ Alternatives are:
+Cost ( 0, 2, 17, 0, 1, -2, 0 ): Application of
+      Variable Expression: ?!=?: forall
+        FT: function type
+        function
+      ... with parameters
+        intrinsic pointer to instance of type FT ( function type)
+        intrinsic pointer to instance of type FT ( function type)
+      ... returning
+        _retval__operator_notequal: signed int
+        ... with attributes:
+          Attribute with name: unused
+
+
+    ... to arguments
+      Variable Expression: next: function
+        accepting unspecified arguments
+      ... returning nothing
+
+      Generated Cast of:
+        constant expression (0 0: zero_t)
+      ... to:
+        pointer to function
+          accepting unspecified arguments
+        ... returning nothing
+
+
+  (types:
+    signed int
+  )
+  Environment: -> function
+        accepting unspecified arguments
+      ... returning nothing
+ (no widening)
+
+
+Cost ( 0, 2, 17, 0, 1, -2, 0 ): Application of
+      Variable Expression: ?!=?: forall
+        DT: data type
+        function
+      ... with parameters
+        intrinsic pointer to instance of type DT (not function type)
+        intrinsic pointer to instance of type DT (not function type)
+      ... returning
+        _retval__operator_notequal: signed int
+        ... with attributes:
+          Attribute with name: unused
+
+
+    ... to arguments
+      Variable Expression: next: pointer to signed int
+      Generated Cast of:
+        constant expression (0 0: zero_t)
+      ... to:
+        pointer to signed int
+
+  (types:
+    signed int
+  )
+  Environment: -> signed int (no widening)
+
+
+
Index: tests/bugs/.expect/91.txt
===================================================================
--- tests/bugs/.expect/91.txt	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
+++ tests/bugs/.expect/91.txt	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -0,0 +1,2 @@
+error: No reasonable alternatives for expression Untyped Init Expression
+  Name: f  InitAlternative: _Bool
Index: tests/bugs/.expect/92.txt
===================================================================
--- tests/bugs/.expect/92.txt	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
+++ tests/bugs/.expect/92.txt	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -0,0 +1,16 @@
+bugs/92.cfa:6:1 error: No reasonable alternatives for expression Applying untyped:
+  Name: ?{}
+...to:
+  Generated Cast of:
+    Variable Expression: _ret: instance of struct superFred with body 1
+  ... to:
+    reference to instance of struct superFred with body 1
+  Generated Cast of:
+    Variable Expression: _dst: reference to instance of struct superFred with body 1
+  ... to:
+    instance of struct superFred with body 1
+  ... with environment:
+    Types:
+    Non-types:
+
+
Index: tests/bugs/.expect/95.txt
===================================================================
--- tests/bugs/.expect/95.txt	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
+++ tests/bugs/.expect/95.txt	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -0,0 +1,1 @@
+bugs/95.cfa:8:1 error: List of mutex member is currently unimplemented.
Index: tests/bugs/10.cfa
===================================================================
--- tests/bugs/10.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
+++ tests/bugs/10.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -0,0 +1,9 @@
+// Trac ticket
+// https://cforall.uwaterloo.ca/trac/ticket/10
+
+forall(otype T)
+struct result {
+      union {
+            T value;
+      };
+};
Index: tests/bugs/104.cfa
===================================================================
--- tests/bugs/104.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
+++ tests/bugs/104.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -0,0 +1,11 @@
+// Trac ticket
+// https://cforall.uwaterloo.ca/trac/ticket/104
+
+[ float, float ] modf_( float x );
+
+forall(otype T | { [T, T] modf_(T); })
+void modf(T);
+
+int main() {
+    modf(7.0f);
+}
Index: tests/bugs/194.cfa
===================================================================
--- tests/bugs/194.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
+++ tests/bugs/194.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -0,0 +1,14 @@
+// Trac ticket
+// https://cforall.uwaterloo.ca/trac/ticket/194
+
+forall( dtype T | sized(T) ) T * foo( void ) {
+      printf( "foo1\n" );
+	return (T *)0;
+}
+forall( dtype T | sized(T) ) T & foo( void ) {
+	printf( "foo2\n" );
+	return (T &)*(T *)0;
+}
+int main( void ) {
+      int * i = foo();
+}
Index: tests/bugs/20.cfa
===================================================================
--- tests/bugs/20.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
+++ tests/bugs/20.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -0,0 +1,16 @@
+// Trac ticket
+// https://cforall.uwaterloo.ca/trac/ticket/20
+
+struct foo {
+	int i;
+};
+
+void ?{}( foo & this, zero_t zero ) {
+	this.i = zero;
+}
+
+extern void bar( foo this = 0 );
+
+int main() {
+	bar();
+}
Index: tests/bugs/44.cfa
===================================================================
--- tests/bugs/44.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
+++ tests/bugs/44.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -0,0 +1,10 @@
+// Trac ticket
+// https://cforall.uwaterloo.ca/trac/ticket/44
+
+typedef void (*fptr_t)();
+void f(int);
+
+int main() {
+  fptr_t x;
+  x = f;
+}
Index: tests/bugs/46.cfa
===================================================================
--- tests/bugs/46.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
+++ tests/bugs/46.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -0,0 +1,11 @@
+// Trac ticket
+// https://cforall.uwaterloo.ca/trac/ticket/46
+
+enum E {
+  val
+};
+
+int main() {
+  E x[3];
+  3 ? x[2] : val;
+}
Index: tests/bugs/5.cfa
===================================================================
--- tests/bugs/5.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
+++ tests/bugs/5.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -0,0 +1,9 @@
+// Trac ticket
+// https://cforall.uwaterloo.ca/trac/ticket/5
+
+[int, int] bar() { return [3, 4]; }
+[void] foo([int] x, [int] y) {}
+
+int main() {
+  foo(bar());
+}
Index: tests/bugs/66.cfa
===================================================================
--- tests/bugs/66.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
+++ tests/bugs/66.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -0,0 +1,12 @@
+// Trac ticket
+// https://cforall.uwaterloo.ca/trac/ticket/66
+
+void next() {}
+
+int main() {
+	int * next = (void*)0;
+	if( next ) {
+		return 1;
+	}
+	return 0;
+}
Index: tests/bugs/7.cfa
===================================================================
--- tests/bugs/7.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
+++ tests/bugs/7.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -0,0 +1,41 @@
+// Trac ticket
+// https://cforall.uwaterloo.ca/trac/ticket/7
+
+#include <stdlib.hfa>
+extern "C" {
+#include <stdbool.h>
+}
+
+// (Bug 1 unresolved as of this test.)
+forall(otype T)
+struct stack_node;
+
+forall(otype T)
+struct stack_node {
+    stack_node(T) * next;
+    T item;
+};
+
+forall(otype T)
+struct stack {
+    stack_node(T) * head;
+};
+
+trait stack_errors(otype T) {
+    T emptyStackHandler (stack(T) * this);
+};
+
+forall(otype T | stack_errors(T))
+T pop (stack(T) * this) {
+    return (T){};
+}
+
+int emptyStackHandler (stack(int) * this) {
+    return 0;
+}
+
+int main (int argc, char * argv[]) {
+    stack(int) stackOfInts;
+    pop(&stackOfInts);
+    return 0;
+}
Index: tests/bugs/91.cfa
===================================================================
--- tests/bugs/91.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
+++ tests/bugs/91.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -0,0 +1,13 @@
+// Trac ticket
+// https://cforall.uwaterloo.ca/trac/ticket/91
+
+#include <stdbool.h>
+
+struct fred {};
+bool ?!=?(const fred & this, zero_t) { return true; }
+
+void foo() {
+	fred f;
+	if(f) {}
+	bool test = f;
+}
Index: tests/bugs/92.cfa
===================================================================
--- tests/bugs/92.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
+++ tests/bugs/92.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -0,0 +1,17 @@
+// Trac ticket
+// https://cforall.uwaterloo.ca/trac/ticket/92
+
+struct fred{};
+
+void  ?{}(fred & this, int count = 1);
+void  ?{}(fred & this, fred & other) = void;
+void ^?{}(fred & this);
+fred ?=?(fred & this, const fred & other) = void;
+
+struct superFred {
+	fred f;
+};
+
+void foo() {
+	superFred f;
+}
Index: tests/bugs/95.cfa
===================================================================
--- tests/bugs/95.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
+++ tests/bugs/95.cfa	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -0,0 +1,9 @@
+// Trac ticket
+// https://cforall.uwaterloo.ca/trac/ticket/95
+
+#include <monitor.hfa>
+monitor M { condition e; } m;
+void rtn( M & mutex m );
+void bar( M & mutex m ) {
+ 	waitfor( rtn, m ); // not ambiguous, select parameter
+}
Index: tests/pybin/test_run.py
===================================================================
--- tests/pybin/test_run.py	(revision fc9bb791282f9039edf2fe373d71ff9c77f57c60)
+++ tests/pybin/test_run.py	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -69,13 +69,4 @@
 			else :						text = "FAILED with code %d" % retcode
 
-		text += "    C%s - R%s" % (cls.fmtDur(duration[0]), cls.fmtDur(duration[1]))
+		text += "    C%s - R%s" % (fmtDur(duration[0]), fmtDur(duration[1]))
 		return text
-
-	@staticmethod
-	def fmtDur( duration ):
-		if duration :
-			hours, rem = divmod(duration, 3600)
-			minutes, rem = divmod(rem, 60)
-			seconds, millis = divmod(rem, 1)
-			return "%2d:%02d.%03d" % (minutes, seconds, millis * 1000)
-		return " n/a"
Index: tests/pybin/tools.py
===================================================================
--- tests/pybin/tools.py	(revision fc9bb791282f9039edf2fe373d71ff9c77f57c60)
+++ tests/pybin/tools.py	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -387,2 +387,10 @@
 		while True:
 			yield i.next(max(expire - time.time(), 0))
+
+def fmtDur( duration ):
+	if duration :
+		hours, rem = divmod(duration, 3600)
+		minutes, rem = divmod(rem, 60)
+		seconds, millis = divmod(rem, 1)
+		return "%2d:%02d.%03d" % (minutes, seconds, millis * 1000)
+	return " n/a"
Index: tests/test.py
===================================================================
--- tests/test.py	(revision fc9bb791282f9039edf2fe373d71ff9c77f57c60)
+++ tests/test.py	(revision d34575bd6570f3b8d33ebcfeefd4138a053de439)
@@ -361,34 +361,35 @@
 
 		# for each build configurations, run the test
-		for arch, debug, install in itertools.product(settings.all_arch, settings.all_debug, settings.all_install):
-			settings.arch    = arch
-			settings.debug   = debug
-			settings.install = install
-
-			# filter out the tests for a different architecture
-			# tests are the same across debug/install
-			local_tests = settings.arch.filter( tests )
-			options.jobs, forceJobs = job_count( options, local_tests )
-			settings.update_make_cmd(forceJobs, options.jobs)
-
-			# check the build configuration works
-			settings.validate()
-
-			# print configuration
-			print('%s %i tests on %i cores (%s:%s)' % (
-				'Regenerating' if settings.generating else 'Running',
-				len(local_tests),
-				options.jobs,
-				settings.arch.string,
-				settings.debug.string
-			))
-
-			# otherwise run all tests and make sure to return the correct error code
-			failed = run_tests(local_tests, options.jobs)
-			if failed:
-				result = 1
-				if not settings.continue_:
-					break
-
-
+		with Timed() as total_dur:	# total_dur.duration is reported after the loop (Timed is defined elsewhere)
+			for arch, debug, install in itertools.product(settings.all_arch, settings.all_debug, settings.all_install):
+				settings.arch    = arch
+				settings.debug   = debug
+				settings.install = install
+
+				# filter out the tests for a different architecture
+				# tests are the same across debug/install
+				local_tests = settings.arch.filter( tests )
+				options.jobs, forceJobs = job_count( options, local_tests )
+				settings.update_make_cmd(forceJobs, options.jobs)
+
+				# check the build configuration works
+				settings.validate()
+
+				# print configuration
+				print('%s %i tests on %i cores (%s:%s)' % (
+					'Regenerating' if settings.generating else 'Running',
+					len(local_tests),
+					options.jobs,
+					settings.arch.string,
+					settings.debug.string
+				))
+
+				# otherwise run all tests and make sure to return the correct error code
+				failed = run_tests(local_tests, options.jobs)
+				if failed:
+					result = 1
+					if not settings.continue_:
+						break	# skip the remaining configurations after a failure unless continue_ is set
+
+		print('Tests took %s' % fmtDur( total_dur.duration ))
 		sys.exit( failed )
