Index: benchmark/io/batch-readv.c
===================================================================
--- benchmark/io/batch-readv.c	(revision 69237cd254dcd27cee0f1916f1afb33878b67b55)
+++ benchmark/io/batch-readv.c	(revision 0376da9c4fec964e19fcbdf8f6c4d868220fc60e)
@@ -1,25 +1,9 @@
 // Program to test the optimial batchsize in a single threaded process
 extern "C" {
-	#ifndef _GNU_SOURCE         /* See feature_test_macros(7) */
-	#define _GNU_SOURCE         /* See feature_test_macros(7) */
-	#endif
-	#include <errno.h>
-	#include <stdio.h>
-	#include <stdint.h>
-	#include <stdlib.h>
-	#include <string.h>
+	#include <getopt.h>
 	#include <locale.h>
-	#include <getopt.h>
-	#include <unistd.h>
-	#include <sys/mman.h>
-	#include <sys/syscall.h>
-	#include <sys/uio.h>
-	#include <fcntl.h>
 	#include <time.h>										// timespec
 	#include <sys/time.h>									// timeval
-
-	#include <linux/io_uring.h>
 }
-
 
 enum { TIMEGRAN = 1000000000LL };					// nanosecond granularity, except for timeval
@@ -27,76 +11,7 @@
 #include <omp.h>
 
-# ifndef __NR_io_uring_setup
-#  define __NR_io_uring_setup		425
-# endif
-# ifndef __NR_io_uring_enter
-#  define __NR_io_uring_enter		426
-# endif
-# ifndef __NR_io_uring_register
-#  define __NR_io_uring_register	427
-# endif
+#include "io_uring.h"
 
-struct io_uring_sq {
-	// Head and tail of the ring (associated with array)
-	volatile uint32_t * head;
-	volatile uint32_t * tail;
 
-	// The actual kernel ring which uses head/tail
-	// indexes into the sqes arrays
-	uint32_t * array;
-
-	// number of entries and mask to go with it
-	const uint32_t * num;
-	const uint32_t * mask;
-
-	// Submission flags (Not sure what for)
-	uint32_t * flags;
-
-	// number of sqes not submitted (whatever that means)
-	uint32_t * dropped;
-
-	// Like head/tail but not seen by the kernel
-	volatile uint32_t alloc;
-
-	// A buffer of sqes (not the actual ring)
-	struct io_uring_sqe * sqes;
-
-	// The location and size of the mmaped area
-	void * ring_ptr;
-	size_t ring_sz;
-};
-
-struct io_uring_cq {
-	// Head and tail of the ring
-	volatile uint32_t * head;
-	volatile uint32_t * tail;
-
-	// number of entries and mask to go with it
-	const uint32_t * mask;
-	const uint32_t * num;
-
-	// number of cqes not submitted (whatever that means)
-	uint32_t * overflow;
-
-	// the kernel ring
-	struct io_uring_cqe * cqes;
-
-	// The location and size of the mmaped area
-	void * ring_ptr;
-	size_t ring_sz;
-};
-
-struct io_ring {
-	struct io_uring_sq submit_q;
-	struct io_uring_cq completion_q;
-	uint32_t flags;
-	int fd;
-};
-
-struct fred {
-	io_ring io;
-};
-
-fred self;
 int myfd;
 
@@ -217,83 +132,5 @@
 	myfd = open(__FILE__, 0);
 
-	// Step 1 : call to setup
-	struct io_uring_params params;
-	memset(&params, 0, sizeof(params));
-
-	uint32_t nentries = 2048;
-
-	int fd = syscall(__NR_io_uring_setup, nentries, &params );
-	if(fd < 0) {
-		fprintf(stderr, "KERNEL ERROR: IO_URING SETUP - %s\n", strerror(errno));
-		abort();
-	}
-
-	// Step 2 : mmap result
-	memset(&self.io, 0, sizeof(struct io_ring));
-	struct io_uring_sq & sq = self.io.submit_q;
-	struct io_uring_cq & cq = self.io.completion_q;
-
-	// calculate the right ring size
-	sq.ring_sz = params.sq_off.array + (params.sq_entries * sizeof(unsigned)           );
-	cq.ring_sz = params.cq_off.cqes  + (params.cq_entries * sizeof(struct io_uring_cqe));
-
-	// Requires features
-	// // adjust the size according to the parameters
-	// if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
-	// 	cq->ring_sz = sq->ring_sz = max(cq->ring_sz, sq->ring_sz);
-	// }
-
-	// mmap the Submit Queue into existence
-	sq.ring_ptr = mmap(0, sq.ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
-	if (sq.ring_ptr == (void*)MAP_FAILED) {
-		fprintf(stderr, "KERNEL ERROR: IO_URING MMAP1 - %s\n", strerror(errno));
-		abort();
-	}
-
-	// mmap the Completion Queue into existence (may or may not be needed)
-	// Requires features
-	// if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
-	// 	cq->ring_ptr = sq->ring_ptr;
-	// }
-	// else {
-		// We need multiple call to MMAP
-		cq.ring_ptr = mmap(0, cq.ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
-		if (cq.ring_ptr == (void*)MAP_FAILED) {
-			munmap(sq.ring_ptr, sq.ring_sz);
-			fprintf(stderr, "KERNEL ERROR: IO_URING MMAP2 - %s\n", strerror(errno));
-			abort();
-		}
-	// }
-
-	// mmap the submit queue entries
-	size_t size = params.sq_entries * sizeof(struct io_uring_sqe);
-	sq.sqes = (struct io_uring_sqe *)mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
-	if (sq.sqes == (struct io_uring_sqe *)MAP_FAILED) {
-		munmap(sq.ring_ptr, sq.ring_sz);
-		if (cq.ring_ptr != sq.ring_ptr) munmap(cq.ring_ptr, cq.ring_sz);
-		fprintf(stderr, "KERNEL ERROR: IO_URING MMAP3 - %s\n", strerror(errno));
-		abort();
-	}
-
-	// Get the pointers from the kernel to fill the structure
-	// submit queue
-	sq.head    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.head);
-	sq.tail    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.tail);
-	sq.mask    = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_mask);
-	sq.num     = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_entries);
-	sq.flags   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.flags);
-	sq.dropped = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.dropped);
-	sq.array   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);
-	sq.alloc = *sq.tail;
-
-	// completion queue
-	cq.head     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.head);
-	cq.tail     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.tail);
-	cq.mask     = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_mask);
-	cq.num      = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_entries);
-	cq.overflow = (         uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.overflow);
-	cq.cqes   = (struct io_uring_cqe *)(((intptr_t)cq.ring_ptr) + params.cq_off.cqes);
-
-	self.io.fd = fd;
+	init_uring(2048);
 
 	// Allocate the sqe
@@ -344,7 +181,9 @@
 
 	printf("Took %'ld ms\n", to_miliseconds(end - start));
-	printf("Submitted       %'llu\n", submits);
-	printf("Completed       %'llu\n", completes);
-	printf("Submitted / sec %'.f\n", submits   / to_fseconds(end - start));
-	printf("Completed / sec %'.f\n", completes / to_fseconds(end - start));
+	printf("Submitted        %'llu\n", submits);
+	printf("Completed        %'llu\n", completes);
+	printf("Submitted / sec  %'.f\n", submits   / to_fseconds(end - start));
+	printf("Completed / sec  %'.f\n", completes / to_fseconds(end - start));
+	printf("ns per Submitted %'.f\n", 1000000000.0 * to_fseconds(end - start) / (submits   / batch) );
+	printf("ns per Completed %'.f\n", 1000000000.0 * to_fseconds(end - start) / (completes / batch) );
 }
Index: benchmark/io/io_uring.h
===================================================================
--- benchmark/io/io_uring.h	(revision 0376da9c4fec964e19fcbdf8f6c4d868220fc60e)
+++ benchmark/io/io_uring.h	(revision 0376da9c4fec964e19fcbdf8f6c4d868220fc60e)
@@ -0,0 +1,171 @@
+extern "C" {
+	#ifndef _GNU_SOURCE         /* See feature_test_macros(7) */
+	#define _GNU_SOURCE         /* See feature_test_macros(7) */
+	#endif
+	#include <errno.h>
+	#include <stdio.h>
+	#include <stdint.h>
+	#include <stdlib.h>
+	#include <string.h>
+	#include <unistd.h>
+	#include <sys/mman.h>
+	#include <sys/syscall.h>
+	#include <sys/uio.h>
+	#include <fcntl.h>
+
+	#include <linux/io_uring.h>
+}
+
+# ifndef __NR_io_uring_setup
+#  define __NR_io_uring_setup		425
+# endif
+# ifndef __NR_io_uring_enter
+#  define __NR_io_uring_enter		426
+# endif
+# ifndef __NR_io_uring_register
+#  define __NR_io_uring_register	427
+# endif
+
+struct io_uring_sq {
+	// Head and tail of the ring (associated with array)
+	volatile uint32_t * head;
+	volatile uint32_t * tail;
+
+	// The actual kernel ring which uses head/tail
+	// indexes into the sqes arrays
+	uint32_t * array;
+
+	// number of entries and mask to go with it
+	const uint32_t * num;
+	const uint32_t * mask;
+
+	// Submission-queue ring flags (e.g. IORING_SQ_NEED_WAKEUP when SQPOLL is used)
+	uint32_t * flags;
+
+	// number of sqes the kernel dropped because they were invalid
+	uint32_t * dropped;
+
+	// Like head/tail but not seen by the kernel
+	volatile uint32_t alloc;
+
+	// A buffer of sqes (not the actual ring)
+	struct io_uring_sqe * sqes;
+
+	// The location and size of the mmaped area
+	void * ring_ptr;
+	size_t ring_sz;
+};
+
+struct io_uring_cq {
+	// Head and tail of the ring
+	volatile uint32_t * head;
+	volatile uint32_t * tail;
+
+	// number of entries and mask to go with it
+	const uint32_t * mask;
+	const uint32_t * num;
+
+	// number of cqes lost because the completion queue overflowed
+	uint32_t * overflow;
+
+	// the kernel ring
+	struct io_uring_cqe * cqes;
+
+	// The location and size of the mmaped area
+	void * ring_ptr;
+	size_t ring_sz;
+};
+
+struct io_ring {
+	struct io_uring_sq submit_q;
+	struct io_uring_cq completion_q;
+	uint32_t flags;
+	int fd;
+};
+
+struct IO_singleton {
+	io_ring io;
+};
+
+IO_singleton self;
+
+void init_uring(uint32_t nentries) {
+	// Step 1 : call to setup
+	struct io_uring_params params;
+	memset(&params, 0, sizeof(params));
+	// params.flags = IORING_SETUP_SQPOLL;
+
+	int fd = syscall(__NR_io_uring_setup, nentries, &params );
+	if(fd < 0) {
+		fprintf(stderr, "KERNEL ERROR: IO_URING SETUP - %s\n", strerror(errno));
+		abort();
+	}
+
+	// Step 2 : mmap result
+	memset(&self.io, 0, sizeof(struct io_ring));
+	struct io_uring_sq & sq = self.io.submit_q;
+	struct io_uring_cq & cq = self.io.completion_q;
+
+	// calculate the right ring size
+	sq.ring_sz = params.sq_off.array + (params.sq_entries * sizeof(unsigned)           );
+	cq.ring_sz = params.cq_off.cqes  + (params.cq_entries * sizeof(struct io_uring_cqe));
+
+	// Requires features
+	// // adjust the size according to the parameters
+	// if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
+	// 	cq->ring_sz = sq->ring_sz = max(cq->ring_sz, sq->ring_sz);
+	// }
+
+	// mmap the Submit Queue into existence
+	sq.ring_ptr = mmap(0, sq.ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
+	if (sq.ring_ptr == (void*)MAP_FAILED) {
+		fprintf(stderr, "KERNEL ERROR: IO_URING MMAP1 - %s\n", strerror(errno));
+		abort();
+	}
+
+	// mmap the Completion Queue into existence (may or may not be needed)
+	// Requires features
+	// if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
+	// 	cq->ring_ptr = sq->ring_ptr;
+	// }
+	// else {
+		// We need multiple calls to MMAP
+		cq.ring_ptr = mmap(0, cq.ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
+		if (cq.ring_ptr == (void*)MAP_FAILED) {
+			munmap(sq.ring_ptr, sq.ring_sz);
+			fprintf(stderr, "KERNEL ERROR: IO_URING MMAP2 - %s\n", strerror(errno));
+			abort();
+		}
+	// }
+
+	// mmap the submit queue entries
+	size_t size = params.sq_entries * sizeof(struct io_uring_sqe);
+	sq.sqes = (struct io_uring_sqe *)mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
+	if (sq.sqes == (struct io_uring_sqe *)MAP_FAILED) {
+		munmap(sq.ring_ptr, sq.ring_sz);
+		if (cq.ring_ptr != sq.ring_ptr) munmap(cq.ring_ptr, cq.ring_sz);
+		fprintf(stderr, "KERNEL ERROR: IO_URING MMAP3 - %s\n", strerror(errno));
+		abort();
+	}
+
+	// Get the pointers from the kernel to fill the structure
+	// submit queue
+	sq.head    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.head);
+	sq.tail    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.tail);
+	sq.mask    = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_mask);
+	sq.num     = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_entries);
+	sq.flags   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.flags);
+	sq.dropped = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.dropped);
+	sq.array   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);
+	sq.alloc = *sq.tail;
+
+	// completion queue
+	cq.head     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.head);
+	cq.tail     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.tail);
+	cq.mask     = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_mask);
+	cq.num      = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_entries);
+	cq.overflow = (         uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.overflow);
+	cq.cqes   = (struct io_uring_cqe *)(((intptr_t)cq.ring_ptr) + params.cq_off.cqes);
+
+	self.io.fd = fd;
+}
