// Program to test the optimal batch size in a single-threaded process
extern "C" {
	#include <errno.h>										// errno, ERANGE
	#include <getopt.h>
	#include <limits.h>										// INT_MAX
	#include <locale.h>
	#include <time.h>										// timespec
	#include <sys/time.h>									// timeval
}

// Clock ticks per second. Timestamps and durations in this file are counted
// in nanoseconds (timeval-based values would only have microsecond resolution).
enum { TIMEGRAN = 1000LL * 1000LL * 1000LL };

#include <omp.h>

#include "io_uring.h"


// Descriptor targeted by every read request; main() assigns it before the
// benchmark loop starts.
int myfd;

// Running totals maintained by submit_and_drain(): the sum of the values
// returned by io_uring_enter, and the number of completion entries consumed.
unsigned long long submits   = 0;
unsigned long long completes = 0;

// Submits a batch of `n` identical READV requests and consumes whatever is in
// the completion queue afterwards.
//
//   iov - the single iovec every request reads into (its address is stored in
//         the SQE, so it must stay valid until the requests complete)
//   n   - number of submission entries to publish and number of completions to
//         wait for in the same io_uring_enter call
//
// Side effects: adds the io_uring_enter return value to `submits` and the
// number of completion entries consumed to `completes`. Any io_uring_enter
// failure prints a message to stderr and aborts the process.
void submit_and_drain(struct iovec * iov, int n) {
	// Each iteration rewrites the same slot (index 0) with the same request;
	// no per-iteration slot index is used. main() points every entry of the
	// submission index array at slot 0, so all n published entries resolve to
	// this one SQE.
	for(int i = 0; i < n; i++) {
		struct io_uring_sqe * sqe =  &self.io.submit_q.sqes[ 0 ];

		sqe->opcode = IORING_OP_READV;
		// Use IOSQE_ASYNC when the headers in use define it, no flags otherwise.
		#if !defined(IOSQE_ASYNC)
			sqe->flags = 0;
		#else
			sqe->flags = IOSQE_ASYNC;
		#endif
		sqe->ioprio = 0;
		sqe->fd = myfd;
		sqe->off = 0;								// always read from the start of the file
		sqe->addr = (__u64)iov;						// address of the iovec array ...
		sqe->len = 1;								// ... which holds exactly one iovec
		sqe->rw_flags = 0;
		sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
	}

	// Publish the n entries by advancing the shared submission-queue tail.
	volatile uint32_t * tail = self.io.submit_q.tail;
	__atomic_fetch_add(tail, n, __ATOMIC_SEQ_CST);

	// Arguments after the ring fd are to_submit = n and min_complete = n; with
	// IORING_ENTER_GETEVENTS the call waits for completions (see io_uring_enter(2)).
	int ret = syscall( __NR_io_uring_enter, self.io.fd, n, n, IORING_ENTER_GETEVENTS, nullptr, 0);
	if( ret < 0 ) {
		// EAGAIN and EINTR are listed explicitly but share the default branch:
		// every failure is fatal, nothing is retried.
		switch((int)errno) {
		case EAGAIN:
		case EINTR:
		default:
			fprintf(stderr, "KERNEL ERROR: IO_URING WAIT - %s\n", strerror(errno) );
			abort();
		}
	}

	// Only reached with ret >= 0 (the error path above aborts).
	submits += ret;

	// Snapshot the completion-queue indices.
	uint32_t chead = *self.io.completion_q.head;
	uint32_t ctail = *self.io.completion_q.tail;
	const uint32_t mask = *self.io.completion_q.mask;	// read but not used below

	// Full memory barrier between reading the indices and updating the head.
	__atomic_thread_fence( __ATOMIC_SEQ_CST );

	// Unsigned subtraction, so the difference stays correct when the indices wrap.
	uint32_t count = ctail - chead;
	// Consume every pending completion at once; the individual completion
	// entries (and therefore the results of the reads) are never inspected.
	__atomic_fetch_add( self.io.completion_q.head, count, __ATOMIC_RELAXED );
	completes += count;
}

// Returns a high-resolution timestamp in nanoseconds (TIMEGRAN ticks per second).
//
// The value is read from CLOCK_MONOTONIC, so it is only meaningful as a
// difference between two calls: unlike CLOCK_REALTIME it cannot jump when the
// wall clock is adjusted, which would corrupt elapsed-time measurements.
// A failing clock_gettime is reported on stderr and aborts the process rather
// than returning an indeterminate value.
uint64_t timeHiRes() {
	struct timespec now;
	if( clock_gettime( CLOCK_MONOTONIC, &now ) != 0 ) {
		fprintf(stderr, "clock_gettime failed - %s\n", strerror(errno) );
		abort();
	}
	return (uint64_t)now.tv_sec * TIMEGRAN + (uint64_t)now.tv_nsec;
}

// Converts a duration in nanosecond ticks to whole milliseconds; any
// sub-millisecond remainder is truncated.
uint64_t to_miliseconds( uint64_t durtn ) {
	const uint64_t ticks_per_ms = TIMEGRAN / 1000LL;
	return durtn / ticks_per_ms;
}
// Converts a duration in nanosecond ticks to fractional seconds.
double to_fseconds(uint64_t durtn ) {
	const double ticks_per_second = (double)TIMEGRAN;
	return durtn / ticks_per_second;
}
// Converts fractional seconds to nanosecond ticks, dropping any fraction of a
// tick. `sec` must be non-negative and small enough for the product to fit in
// a uint64_t; the floating-point to unsigned conversion is undefined otherwise.
uint64_t from_fseconds(double sec) {
	const double ticks = sec * TIMEGRAN;
	return (uint64_t)ticks;
}

int main(int argc, char * argv[]) {
	int buflen = 50;
	int batch  = 1;
	double duration = 5;

	setlocale(LC_ALL, "");

	for(;;) {
		static struct option options[] = {
			{"duration",     required_argument, 0, 'd'},
			{"batchsize",   required_argument, 0, 'b'},
			{"buflen",      required_argument, 0, 'l'},
			{0, 0, 0, 0}
		};

		int idx = 0;
		int opt = getopt_long(argc, argv, "d:l:b:", options, &idx);

		const char * arg = optarg ? optarg : "";
		char * end;
		switch(opt) {
			// Exit Case
			case -1:
				goto arg_loop;
			case 'd': \
				duration = strtod(arg, &end); \
				if(*end != '\0') { \
					fprintf(stderr, "Duration must be a valid double, was %s\n", arg); \
					goto usage; \
				} \
				break;
			case 'l':
				buflen = strtoul(arg, &end, 10);
				if(*end != '\0' && buflen < 10) {
					fprintf(stderr, "Buffer size must be at least 10, was %s\n", arg);
					goto usage;
				}
			case 'b':
				batch = strtoul(arg, &end, 10);
				if(*end != '\0' && batch < 0) {
					fprintf(stderr, "Batch size must be at least 1, was %s\n", arg);
					goto usage;
				}
				break;
			default: /* ? */
				fprintf(stderr, "%d\n", opt);
			usage:
				fprintf( stderr, "  -l, --buflen=SIZE        Number of bytes to read per request\n" );
				fprintf( stderr, "  -b, --batchsize=COUNT    Number of request to batch together\n" );
				exit(EXIT_FAILURE);
		}
	}
	arg_loop:

	myfd = open(__FILE__, 0);

	init_uring(2048);

	// Allocate the sqe
	uint32_t idx = 0;

	// Return the sqe
	struct io_uring_sqe * sqe =  &self.io.submit_q.sqes[ idx & (*self.io.submit_q.mask)];

	char data[buflen];
	struct iovec iov = { data, (size_t)buflen };

	sqe->opcode = IORING_OP_READV;
	#if !defined(IOSQE_ASYNC)
		sqe->flags = 0;
	#else
		sqe->flags = IOSQE_ASYNC;
	#endif
	sqe->ioprio = 0;
	sqe->fd = myfd;
	sqe->off = 0;
	sqe->addr = (__u64)&iov;
	sqe->len = 1;
	sqe->rw_flags = 0;
	sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;

	// Append to the list of ready entries
	for(unsigned i = 0; i < *self.io.submit_q.num; i++) {
		self.io.submit_q.array[ i ] = 0;
	}

	printf("Running for %f second, reading %d bytes in batches of %d\n", duration, buflen, batch);
	uint64_t start = timeHiRes();
	uint64_t end   = timeHiRes();
	uint64_t prev  = timeHiRes();
	for(;;) {
		submit_and_drain(&iov, batch);
		end = timeHiRes();
		uint64_t delta = end - start;
		if( to_fseconds(end - prev) > 0.1 ) {
			printf(" %.1f\r", to_fseconds(delta));
			fflush(stdout);
			prev = end;
		}
		if( delta >= from_fseconds(duration) ) {
			break;
		}
	}

	printf("Took %'ld ms\n", to_miliseconds(end - start));
	printf("Submitted        %'llu\n", submits);
	printf("Completed        %'llu\n", completes);
	printf("Submitted / sec  %'.f\n", submits   / to_fseconds(end - start));
	printf("Completed / sec  %'.f\n", completes / to_fseconds(end - start));
	printf("ns per Submitted %'.f\n", 1000000000.0 * to_fseconds(end - start) / (submits   / batch) );
	printf("ns per Completed %'.f\n", 1000000000.0 * to_fseconds(end - start) / (completes / batch) );
}