Index: benchmark/benchcltr.hfa
===================================================================
--- benchmark/benchcltr.hfa	(revision dab09ad17d681d164e04da583cd9bde8fb8e96ff)
+++ benchmark/benchcltr.hfa	(revision cd021085d1b1fd9053f0376b7616c1aaaf7600d8)
@@ -1,8 +1,16 @@
 #pragma once
+#include <assert.h>
+#include <stdint.h>
 
-#include <assert.h>
-#include <kernel.hfa>
-#include <thread.hfa>
-#include <stats.hfa>
+#ifdef __cforall
+	#include <kernel.hfa>
+	#include <thread.hfa>
+	#include <stats.hfa>
+#else
+#include <time.h>										// timespec
+#include <sys/time.h>									// timeval
+
+enum { TIMEGRAN = 1000000000LL };					// nanosecond granularity, except for timeval
+#endif
 
 #define BENCH_OPT_SHORT "d:p:t:SPV"
@@ -14,9 +22,4 @@
 	{"procstat",     no_argument      , 0, 'P'}, \
 	{"viewhalts",    no_argument      , 0, 'V'},
-
-#define BENCH_DECL \
-	double duration = 5; \
-	int nprocs = 1; \
-	int nthreads = 1;
 
 #define BENCH_OPT_CASE \
@@ -52,7 +55,24 @@
 		break;
 
+double duration = 5;
+int nprocs = 1;
+int nthreads = 1;
 bool silent = false;
+bool continuous = false;
 bool procstats = false;
 bool viewhalts = false;
+
+#define BENCH_OPT_CFA /* option-table entries: short flag, long name, help text, target variable, optional parse routine */ \
+	{'d', "duration",  "Duration of the experiments in seconds", duration }, \
+	{'t', "nthreads",  "Number of threads to use", nthreads }, \
+	{'p', "nprocs",    "Number of processors to use", nprocs }, \
+	{'S', "nostats",   "Don't print statistics", silent, parse_settrue }, \
+	{'C', "constats",  "Regularly print statistics", continuous, parse_settrue }, \
+	{'P', "procstat",  "Print statistics for each processors", procstats, parse_settrue }, \
+	{'V', "viewhalts", "Visualize halts, prints timestamp and Processor id for each halt.", viewhalts, parse_settrue },
+
+#ifdef __cforall
+#include <parseargs.hfa>
+
 struct cluster * the_benchmark_cluster = 0p;
 struct BenchCluster {
@@ -60,6 +80,6 @@
 };
 
-void ?{}( BenchCluster & this, int flags, int stats ) {
-	(this.self){ "Benchmark Cluster", flags };
+void ?{}( BenchCluster & this, int num_io, const io_context_params & io_params, int stats ) {
+	(this.self){ "Benchmark Cluster", num_io, io_params };
 
 	assert( the_benchmark_cluster == 0p );
@@ -105,4 +125,32 @@
 	}
 }
+#else
+uint64_t getTimeNsec() {	// CLOCK_REALTIME reading expressed in TIMEGRAN (nanosecond) units
+	timespec now;
+	clock_gettime( CLOCK_REALTIME, &now );
+	return (int64_t)now.tv_sec * TIMEGRAN + now.tv_nsec;
+}
+
+uint64_t to_miliseconds( uint64_t durtn ) { return durtn / (TIMEGRAN / 1000LL); }	// nanoseconds -> whole milliseconds (integer division)
+double to_fseconds(uint64_t durtn ) { return durtn / (double)TIMEGRAN; }	// nanoseconds -> seconds as a double
+uint64_t from_fseconds(double sec) { return sec * TIMEGRAN; }	// seconds -> nanoseconds; the double product is converted to uint64_t on return
+
+
+// Blocks until at least `duration` seconds separate `start` from the current time, polling every 100ms;
+// `end` receives the last timestamp sampled and the elapsed seconds are echoed to stdout on every poll.
+void wait_duration(double duration, uint64_t & start, uint64_t & end, bool is_tty) {
+	uint64_t elapsed;
+	do {
+		usleep(100000);
+		end = getTimeNsec();
+		elapsed = end - start;
+		/*if(is_tty)*/ {
+			printf(" %.1f\r", to_fseconds(elapsed));
+			fflush(stdout);
+		}
+	} while( elapsed < from_fseconds(duration) );
+}
+#endif
+
 
 void bench_usage( char * argv [] ) {
Index: benchmark/io/readv-posix.c
===================================================================
--- benchmark/io/readv-posix.c	(revision cd021085d1b1fd9053f0376b7616c1aaaf7600d8)
+++ benchmark/io/readv-posix.c	(revision cd021085d1b1fd9053f0376b7616c1aaaf7600d8)
@@ -0,0 +1,139 @@
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+extern "C" {
+	#include <locale.h>
+	#include <getopt.h>
+	#include <fcntl.h>
+	#include <sys/uio.h>
+}
+
+#include <unistd.h>
+
+#include <pthread.h>
+
+#include "../benchcltr.hfa"
+
+int fd;
+volatile bool run = false;
+volatile size_t count = 0;
+
+unsigned long int buflen = 50;
+
+// Worker thread: waits on the start barrier passed through arg, then reads buflen bytes from offset 0 of fd in a loop, bumping count once per read, until run is cleared.
+void * reader_main( void * arg ) {
+	pthread_barrier_wait( (pthread_barrier_t*) arg );
+	/* paranoid */ assert( true == __atomic_load_n(&run, __ATOMIC_RELAXED) );
+
+	char data[buflen];
+	struct iovec iov = { data, buflen };
+
+	while(__atomic_load_n(&run, __ATOMIC_RELAXED)) {
+		int r = preadv2(fd, &iov, 1, 0, 0);
+		if(r < 0) {
+			// preadv2 returns -1 and reports the cause through errno, so -r is not an error code; perror reads errno.
+			perror("preadv2");
+			abort();
+		}
+		__atomic_fetch_add( &count, 1, __ATOMIC_SEQ_CST );
+	}
+	return NULL;
+}
+
+// Benchmark driver: parse the options, open this source file, run nthreads reader threads for duration seconds, then print throughput figures.
+int main(int argc, char * argv[]) {
+	// duration, nprocs and nthreads are the globals defined in benchcltr.hfa; that header no longer provides BENCH_DECL.
+	unsigned flags = 0;
+
+	setlocale(LC_ALL, "");	// the ' printf flag used in the report takes its digit grouping from the locale
+
+	for(;;) {
+		static struct option options[] = {
+			BENCH_OPT_LONG
+			{"polled-io",    no_argument      , 0, 'i'},
+			{"bufsize",      required_argument, 0, 'b'},
+			{0, 0, 0, 0}
+		};
+
+		int idx = 0;
+		int opt = getopt_long(argc, argv, BENCH_OPT_SHORT "ib:", options, &idx);
+
+		const char * arg = optarg ? optarg : "";
+		char * end;
+		switch(opt) {
+			// Exit Case
+			case -1:
+				goto arg_loop;
+			BENCH_OPT_CASE
+			case 'i':
+				flags |= O_DIRECT;
+				break;
+			case 'b':
+				buflen = strtoul(arg, &end, 10);
+				if(*end != '\0' || buflen < 10) {
+					fprintf(stderr, "Buffer size must be at least 10, was %s\n", arg);
+					goto usage;
+				}
+				break;
+			default: /* ? */
+				fprintf(stderr, "%d\n", opt);
+			usage:
+				bench_usage( argv );
+				fprintf( stderr, "  -i, --polled-io          If set opens the file with O_DIRECT\n" );
+				fprintf( stderr, "  -b, --bufsize=SIZE       Number of bytes to read per request\n" );
+				exit(EXIT_FAILURE);
+		}
+	}
+	arg_loop:
+
+	fd = open(__FILE__, flags);
+	if(fd < 0) {
+		fprintf(stderr, "Could not open source file\n");
+		exit(EXIT_FAILURE);
+	}
+
+	printf("Running %d threads, reading %lu bytes each, over %d processors for %f seconds\n", nthreads, buflen, nprocs, duration);
+
+	{
+		uint64_t start, end;
+		{
+			pthread_barrier_t barrier;
+			pthread_barrier_init(&barrier, NULL, nthreads + 1);
+			{
+				pthread_t threads[nthreads];
+				for(int i = 0; i < nthreads; i++) {
+					// NULL selects default attributes; a failed create must stop here, otherwise the barrier below never fills.
+					int err = pthread_create( &threads[i], NULL, reader_main, &barrier );
+					if(err != 0) { fprintf(stderr, "pthread_create failed: %s\n", strerror(err)); exit(EXIT_FAILURE); }
+				}
+
+				printf("Starting\n");
+				bool is_tty = isatty(STDOUT_FILENO);
+				start = getTimeNsec();
+				run = true;
+
+				pthread_barrier_wait( &barrier );
+				wait_duration(duration, start, end, is_tty);
+
+				run = false;
+				end = getTimeNsec();
+				printf("\nDone\n");
+
+				for(int i = 0; i < nthreads; i++) {
+					void * ret;
+					pthread_join( threads[i], &ret );
+				}
+			}
+			pthread_barrier_destroy(&barrier);
+		}
+		printf("Took %'lu ms\n", (unsigned long)to_miliseconds(end - start));
+		printf("Total reads      : %'15zu\n", count);
+		printf("Reads per second : %'18.2lf\n", ((double)count) / to_fseconds(end - start));
+		printf("Total read size  : %'15zu\n", buflen * count);
+		printf("Bytes per second : %'18.2lf\n", ((double)count * buflen) / to_fseconds(end - start));
+	}
+
+	close(fd);
+}
Index: benchmark/io/readv.cfa
===================================================================
--- benchmark/io/readv.cfa	(revision dab09ad17d681d164e04da583cd9bde8fb8e96ff)
+++ benchmark/io/readv.cfa	(revision cd021085d1b1fd9053f0376b7616c1aaaf7600d8)
@@ -40,5 +40,11 @@
 int do_read(int fd, struct iovec * iov) {
 	// extern ssize_t cfa_preadv2(int, const struct iovec *, int, off_t, int, int = 0, Duration = -1`s, io_cancellation * = 0p, io_context * = 0p);
-	int sflags = 0;
+	int sflags = 0
+	#if defined(CFA_HAVE_IOSQE_ASYNC)
+		| CFA_IO_ASYNC
+	#else
+	#warning no CFA_IO_ASYNC support
+	#endif
+	;
 	if(fixed_file) {
 		sflags |= CFA_IO_FIXED_FD1;
@@ -130,5 +136,4 @@
 				bench_usage( argv );
 				fprintf( stderr, "  -b, --buflen=SIZE        Number of bytes to read per request\n" );
-				fprintf( stderr, "  -u, --userthread         If set, cluster uses user-thread to poll I/O\n" );
 				fprintf( stderr, "  -s, --submitthread       If set, cluster uses polling thread to submit I/O\n" );
 				fprintf( stderr, "  -e, --eagersubmit        If set, cluster submits I/O eagerly but still aggregates submits\n" );
@@ -139,4 +144,7 @@
 		}
 	}
+
+	if(params.poll_submit  ) fixed_file = true;
+	if(params.poll_complete) file_flags |= O_DIRECT;
 
 	int lfd = open(__FILE__, file_flags);
Index: benchmark/readyQ/yield.cfa
===================================================================
--- benchmark/readyQ/yield.cfa	(revision dab09ad17d681d164e04da583cd9bde8fb8e96ff)
+++ benchmark/readyQ/yield.cfa	(revision cd021085d1b1fd9053f0376b7616c1aaaf7600d8)
@@ -43,29 +43,14 @@
 
 int main(int argc, char * argv[]) {
-	BENCH_DECL
+	unsigned num_io = 1;
+	io_context_params params;
 
-	for(;;) {
-		static struct option options[] = {
-			BENCH_OPT_LONG
-			{0, 0, 0, 0}
-		};
+	cfa_option opt[] = {
+		BENCH_OPT_CFA
+	};
+	int opt_cnt = sizeof(opt) / sizeof(cfa_option);
 
-		int idx = 0;
-		int opt = getopt_long(argc, argv, BENCH_OPT_SHORT, options, &idx);
-
-		const char * arg = optarg ? optarg : "";
-		char * end;
-		switch(opt) {
-			case -1:
-				goto run;
-			BENCH_OPT_CASE
-			default: /* ? */
-				fprintf( stderr, "Unkown option '%c'\n", opt);
-			usage:
-				bench_usage( argv );
-				exit(1);
-		}
-	}
-	run:
+	char **left;
+	parse_args( argc, argv, opt, opt_cnt, "[OPTIONS]...\ncforall yield benchmark", left );
 
 	{
@@ -73,5 +58,5 @@
 
 		Time start, end;
-		BenchCluster cl = { 0, CFA_STATS_READY_Q };
+		BenchCluster cl = { num_io, params, CFA_STATS_READY_Q };
 		{
 			BenchProc procs[nprocs];
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision dab09ad17d681d164e04da583cd9bde8fb8e96ff)
+++ libcfa/src/concurrency/io.cfa	(revision cd021085d1b1fd9053f0376b7616c1aaaf7600d8)
@@ -359,12 +359,13 @@
 
 			// We got the lock
+			// Collect the submissions
 			unsigned to_submit = __collect_submitions( ring );
+
+			// Release the lock now so syscalls can overlap
+			unlock(ring.submit_q.lock);
+
+			// Actually submit
 			int ret = __io_uring_enter( ring, to_submit, false );
-			if( ret < 0 ) {
-				unlock(ring.submit_q.lock);
-				return;
-			}
-
-			/* paranoid */ verify( ret > 0 || to_submit == 0 || (ring.ring_flags & IORING_SETUP_SQPOLL) );
+			if( ret < 0 ) return;
 
 			// Release the consumed SQEs
@@ -372,11 +373,9 @@
 
 			// update statistics
-			__STATS__( true,
+			__STATS__( false,
 				io.submit_q.submit_avg.rdy += to_submit;
 				io.submit_q.submit_avg.csm += ret;
 				io.submit_q.submit_avg.cnt += 1;
 			)
-
-			unlock(ring.submit_q.lock);
 		}
 		else {
Index: libcfa/src/parseargs.cfa
===================================================================
--- libcfa/src/parseargs.cfa	(revision dab09ad17d681d164e04da583cd9bde8fb8e96ff)
+++ libcfa/src/parseargs.cfa	(revision cd021085d1b1fd9053f0376b7616c1aaaf7600d8)
@@ -19,4 +19,5 @@
 	extern          long long int strtoll (const char* str, char** endptr, int base);
 	extern unsigned long long int strtoull(const char* str, char** endptr, int base);
+	extern                 double strtod  (const char* str, char** endptr);
 }
 
@@ -170,4 +171,13 @@
 }
 
+// Parse arg as a base-10 int. Returns false, leaving value untouched, on empty input, trailing characters or a number outside the range of int.
+bool parse(const char * arg, int & value) {
+	char * end;
+	long long int r = strtoll(arg, &end, 10);	// keep the full width so out-of-range input is detectable
+	if(end == arg || *end != '\0' || r != (long long int)(int)r) return false;
+	value = r;
+	return true;
+}
+
 bool parse(const char * arg, unsigned & value) {
 	char * end;
@@ -200,10 +210,10 @@
 }
 
-bool parse(const char * arg, int & value) {
-	char * end;
-	int r = strtoll(arg, &end, 10);
-	if(*end != '\0') return false;
-
-	value = r;
-	return true;
-}
+// Parse arg as a floating-point number (strtod syntax). Returns false, leaving value untouched, on empty input or trailing characters.
+bool parse(const char * arg, double & value) {
+	char * end;
+	double r = strtod(arg, &end);
+	if(end == arg || *end != '\0') return false;	// strtod consumed nothing, or stopped before the end of arg
+	value = r;
+	return true;
+}
Index: libcfa/src/parseargs.hfa
===================================================================
--- libcfa/src/parseargs.hfa	(revision dab09ad17d681d164e04da583cd9bde8fb8e96ff)
+++ libcfa/src/parseargs.hfa	(revision cd021085d1b1fd9053f0376b7616c1aaaf7600d8)
@@ -39,6 +39,7 @@
 
 bool parse(const char *, const char * & );
+bool parse(const char *, int & );
 bool parse(const char *, unsigned & );
 bool parse(const char *, unsigned long & );
 bool parse(const char *, unsigned long long & );
-bool parse(const char *, int & );
+bool parse(const char *, double & );
