Context Navigation

io.cfa@ 03045f18

Visit:

ADT arm-eh ast-experimental enum forall-pointer-decay jacob/cs343-translation new-ast new-ast-unique-expr pthread-emulation qualifiedEnum

Last change on this file since 03045f18 was 0e4df2e, checked in by Thierry Delisle <tdelisle@…>, 5 years ago
Merge branch 'master' into relaxed_ready
Property mode set to `100644`
File size: 39.8 KB

Rev	Line
[ecf6b46]	1	//
	2	// Cforall Version 1.0.0 Copyright (C) 2020 University of Waterloo
	3	//
	4	// The contents of this file are covered under the licence agreement in the
	5	// file "LICENCE" distributed with Cforall.
	6	//
	7	// io.cfa --
	8	//
	9	// Author : Thierry Delisle
	10	// Created On : Thu Apr 23 17:31:00 2020
	11	// Last Modified By :
	12	// Last Modified On :
	13	// Update Count :
	14	//
	15
[4069faad]	16	// #define __CFA_DEBUG_PRINT_IO__
[0a805f2]	17	// #define __CFA_DEBUG_PRINT_IO_CORE__
[4069faad]	18
[92976d9]	19	#include "kernel.hfa"
[5c581cc]	20	#include "bitmanip.hfa"
[92976d9]	21
	22	#if !defined(HAVE_LINUX_IO_URING_H)
// Start-up hook for builds without io_uring support (HAVE_LINUX_IO_URING_H undefined).
// All three arguments are ignored.
void __kernel_io_startup( cluster &, unsigned, bool ) {
	// Nothing to do without io_uring
}
	26
// Second start-up phase for builds without io_uring support; the cluster is ignored.
void __kernel_io_finish_start( cluster & ) {
	// Nothing to do without io_uring
}
	30
// First shutdown phase for builds without io_uring support; the cluster is ignored.
void __kernel_io_prepare_stop( cluster & ) {
	// Nothing to do without io_uring
}
	34
// Final shutdown hook for builds without io_uring support; both arguments are ignored.
void __kernel_io_shutdown( cluster &, bool ) {
	// Nothing to do without io_uring
}
	38
	39	#else
	40	extern "C" {
	41	#define _GNU_SOURCE /* See feature_test_macros(7) */
	42	#include <errno.h>
	43	#include <stdint.h>
	44	#include <string.h>
	45	#include <unistd.h>
	46	#include <sys/mman.h>
	47	#include <sys/syscall.h>
	48
	49	#include <linux/io_uring.h>
	50	}
	51
	52	#include "bits/signal.hfa"
	53	#include "kernel_private.hfa"
	54	#include "thread.hfa"
	55
	56	uint32_t entries_per_cluster() {
	57	return 256;
	58	}
	59
[f6660520]	60	static void * __io_poller_slow( void * arg );
	61
	62	// Weirdly, some systems that do support io_uring don't actually define these
	63	#ifdef __alpha__
	64	/*
	65	* alpha is the only exception, all other architectures
	66	* have common numbers for new system calls.
	67	*/
	68	#ifndef __NR_io_uring_setup
	69	#define __NR_io_uring_setup 535
	70	#endif
	71	#ifndef __NR_io_uring_enter
	72	#define __NR_io_uring_enter 536
	73	#endif
	74	#ifndef __NR_io_uring_register
	75	#define __NR_io_uring_register 537
	76	#endif
	77	#else /* !__alpha__ */
	78	#ifndef __NR_io_uring_setup
	79	#define __NR_io_uring_setup 425
	80	#endif
	81	#ifndef __NR_io_uring_enter
	82	#define __NR_io_uring_enter 426
	83	#endif
	84	#ifndef __NR_io_uring_register
	85	#define __NR_io_uring_register 427
	86	#endif
	87	#endif
	88
// Fast poller user-thread
// Not using the "thread" keyword because we want to control
// more carefully when to start/stop it
struct __io_poller_fast {
	// Ring drained by this poller (the owning cluster's `io` field)
	struct __io_data * ring;

	// Thread descriptor embedded by hand instead of being generated by `thread`
	$thread thrd;
};
	96
	97	void ?{}( __io_poller_fast & this, struct cluster & cltr ) {
	98	this.ring = cltr.io;
	99	(this.thrd){ "Fast I/O Poller", cltr };
	100	}
	101	void ^?{}( __io_poller_fast & mutex this );
	102	void main( __io_poller_fast & this );
	103	static inline $thread * get_thread( __io_poller_fast & this ) { return &this.thrd; }
	104	void ^?{}( __io_poller_fast & mutex this ) {}
	105
	106	struct __submition_data {
	107	// Head and tail of the ring (associated with array)
	108	volatile uint32_t * head;
	109	volatile uint32_t * tail;
	110
	111	// The actual kernel ring which uses head/tail
	112	// indexes into the sqes arrays
	113	uint32_t * array;
	114
	115	// number of entries and mask to go with it
	116	const uint32_t * num;
	117	const uint32_t * mask;
	118
	119	// Submission flags (Not sure what for)
	120	uint32_t * flags;
	121
	122	// number of sqes not submitted (whatever that means)
	123	uint32_t * dropped;
	124
	125	// Like head/tail but not seen by the kernel
[5dadc9b7]	126	volatile uint32_t * ready;
	127	uint32_t ready_cnt;
[61dd73d]	128
	129	__spinlock_t lock;
	130
	131	// A buffer of sqes (not the actual ring)
	132	struct io_uring_sqe * sqes;
	133
	134	// The location and size of the mmaped area
	135	void * ring_ptr;
	136	size_t ring_sz;
	137
	138	// Statistics
	139	#if !defined(__CFA_NO_STATISTICS__)
	140	struct {
	141	struct {
[6f121b8]	142	volatile unsigned long long int rdy;
	143	volatile unsigned long long int csm;
	144	volatile unsigned long long int avl;
[05cfa4d]	145	volatile unsigned long long int cnt;
[61dd73d]	146	} submit_avg;
[5dadc9b7]	147	struct {
	148	volatile unsigned long long int val;
	149	volatile unsigned long long int cnt;
	150	volatile unsigned long long int block;
	151	} look_avg;
[6f121b8]	152	struct {
	153	volatile unsigned long long int val;
	154	volatile unsigned long long int cnt;
	155	volatile unsigned long long int block;
	156	} alloc_avg;
[61dd73d]	157	} stats;
	158	#endif
	159	};
	160
	161	struct __completion_data {
	162	// Head and tail of the ring
	163	volatile uint32_t * head;
	164	volatile uint32_t * tail;
	165
	166	// number of entries and mask to go with it
	167	const uint32_t * mask;
	168	const uint32_t * num;
	169
	170	// number of cqes not submitted (whatever that means)
	171	uint32_t * overflow;
	172
	173	// the kernel ring
	174	struct io_uring_cqe * cqes;
	175
	176	// The location and size of the mmaped area
	177	void * ring_ptr;
	178	size_t ring_sz;
	179
	180	// Statistics
	181	#if !defined(__CFA_NO_STATISTICS__)
	182	struct {
	183	struct {
	184	unsigned long long int val;
	185	unsigned long long int slow_cnt;
	186	unsigned long long int fast_cnt;
	187	} completed_avg;
	188	} stats;
	189	#endif
	190	};
	191
	192	struct __io_data {
	193	struct __submition_data submit_q;
	194	struct __completion_data completion_q;
[b6f2b213]	195	uint32_t ring_flags;
	196	int cltr_flags;
[61dd73d]	197	int fd;
	198	semaphore submit;
	199	volatile bool done;
	200	struct {
	201	struct {
	202	void * stack;
	203	pthread_t kthrd;
[5c581cc]	204	volatile bool blocked;
[61dd73d]	205	} slow;
	206	__io_poller_fast fast;
	207	__bin_sem_t sem;
	208	} poller;
	209	};
[185efe6]	210
[92976d9]	211	//=============================================================================================
	212	// I/O Startup / Shutdown logic
	213	//=============================================================================================
// Create the io_uring instance of a cluster and map its rings into memory.
//
// this         : cluster receiving the ring, stored in this.io
// io_flags     : CFA_CLUSTER_IO_* option bits; the bits above
//                CFA_CLUSTER_IO_BUFFLEN_OFFSET give the requested length of the ready array
// main_cluster : when false the pollers are started immediately through
//                __kernel_io_finish_start, when true that call is left to the caller
//
// A failure of io_uring_setup or of any mmap aborts the program.
void __kernel_io_startup( cluster & this, unsigned io_flags, bool main_cluster ) {
	this.io = malloc();

	// Step 1 : call to setup
	struct io_uring_params params;
	memset(&params, 0, sizeof(params));

	uint32_t nentries = entries_per_cluster();

	int fd = syscall(__NR_io_uring_setup, nentries, &params );
	if(fd < 0) {
		abort("KERNEL ERROR: IO_URING SETUP - %s\n", strerror(errno));
	}

	// Step 2 : mmap result
	memset( this.io, 0, sizeof(struct __io_data) );
	struct __submition_data  & sq = this.io->submit_q;
	struct __completion_data & cq = this.io->completion_q;

	// calculate the right ring size
	sq.ring_sz = params.sq_off.array + (params.sq_entries * sizeof(unsigned)           );
	cq.ring_sz = params.cq_off.cqes  + (params.cq_entries * sizeof(struct io_uring_cqe));

	// Requires features
	#if defined(IORING_FEAT_SINGLE_MMAP)
		// adjust the size according to the parameters:
		// with a single mapping, both rings live in one area large enough for either
		if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
			cq.ring_sz = sq.ring_sz = max(cq.ring_sz, sq.ring_sz);
		}
	#endif

	// mmap the Submit Queue into existence
	sq.ring_ptr = mmap(0, sq.ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
	if (sq.ring_ptr == (void*)MAP_FAILED) {
		abort("KERNEL ERROR: IO_URING MMAP1 - %s\n", strerror(errno));
	}

	// Requires features
	#if defined(IORING_FEAT_SINGLE_MMAP)
		// mmap the Completion Queue into existence (may or may not be needed)
		if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
			cq.ring_ptr = sq.ring_ptr;
		}
		else
	#endif
	{
		// We need multiple call to MMAP
		cq.ring_ptr = mmap(0, cq.ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
		if (cq.ring_ptr == (void*)MAP_FAILED) {
			munmap(sq.ring_ptr, sq.ring_sz);
			abort("KERNEL ERROR: IO_URING MMAP2 - %s\n", strerror(errno));
		}
	}

	// mmap the submit queue entries
	size_t size = params.sq_entries * sizeof(struct io_uring_sqe);
	sq.sqes = (struct io_uring_sqe *)mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
	if (sq.sqes == (struct io_uring_sqe *)MAP_FAILED) {
		munmap(sq.ring_ptr, sq.ring_sz);
		if (cq.ring_ptr != sq.ring_ptr) munmap(cq.ring_ptr, cq.ring_sz);
		abort("KERNEL ERROR: IO_URING MMAP3 - %s\n", strerror(errno));
	}

	// Get the pointers from the kernel to fill the structure
	// submit queue
	sq.head    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.head);
	sq.tail    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.tail);
	sq.mask    = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_mask);
	sq.num     = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_entries);
	sq.flags   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.flags);
	sq.dropped = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.dropped);
	sq.array   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);

	// A user_data of 0 marks an sqe as free for __submit_alloc
	{
		const uint32_t num = *sq.num;
		for( i; num ) {
			sq.sqes[i].user_data = 0ul64;
		}
	}

	if( io_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
		// The ready array is indexed with a mask, so its length must be a power of 2 (at least 8)
		/* paranoid */ verify( is_pow2( io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET ) || ((io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET) < 8)  );
		sq.ready_cnt = max(io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET, 8);
		sq.ready = alloc_align( 64, sq.ready_cnt );

		// -1ul32 is the "empty slot" sentinel
		for(i; sq.ready_cnt) {
			sq.ready[i] = -1ul32;
		}
	}
	else {
		sq.ready_cnt = 0;
		sq.ready = 0p;
	}

	// completion queue
	cq.head     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.head);
	cq.tail     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.tail);
	cq.mask     = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_mask);
	cq.num      = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_entries);
	cq.overflow = (         uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.overflow);
	cq.cqes   = (struct io_uring_cqe *)(((intptr_t)cq.ring_ptr) + params.cq_off.cqes);

	// some paranoid checks
	/* paranoid */ verifyf( (*cq.mask) == ((*cq.num) - 1ul32), "IO_URING Expected mask to be %u (%u entries), was %u", (*cq.num) - 1ul32, *cq.num, *cq.mask );
	/* paranoid */ verifyf( (*cq.num)  >= nentries, "IO_URING Expected %u entries, got %u", nentries, *cq.num );
	/* paranoid */ verifyf( (*cq.head) == 0, "IO_URING Expected head to be 0, got %u", *cq.head );
	/* paranoid */ verifyf( (*cq.tail) == 0, "IO_URING Expected tail to be 0, got %u", *cq.tail );

	/* paranoid */ verifyf( (*sq.mask) == ((*sq.num) - 1ul32), "IO_URING Expected mask to be %u (%u entries), was %u", (*sq.num) - 1ul32, *sq.num, *sq.mask );
	/* paranoid */ verifyf( (*sq.num)  >= nentries, "IO_URING Expected %u entries, got %u", nentries, *sq.num );
	/* paranoid */ verifyf( (*sq.head) == 0, "IO_URING Expected head to be 0, got %u", *sq.head );
	/* paranoid */ verifyf( (*sq.tail) == 0, "IO_URING Expected tail to be 0, got %u", *sq.tail );

	// Update the global ring info
	this.io->ring_flags = params.flags;
	this.io->cltr_flags = io_flags;
	this.io->fd         = fd;
	this.io->done       = false;
	(this.io->submit){ min(*sq.num, *cq.num) };

	// Initialize statistics
	#if !defined(__CFA_NO_STATISTICS__)
		this.io->submit_q.stats.submit_avg.rdy = 0;
		this.io->submit_q.stats.submit_avg.csm = 0;
		this.io->submit_q.stats.submit_avg.avl = 0;
		this.io->submit_q.stats.submit_avg.cnt = 0;
		this.io->submit_q.stats.look_avg.val   = 0;
		this.io->submit_q.stats.look_avg.cnt   = 0;
		this.io->submit_q.stats.look_avg.block = 0;
		this.io->submit_q.stats.alloc_avg.val   = 0;
		this.io->submit_q.stats.alloc_avg.cnt   = 0;
		this.io->submit_q.stats.alloc_avg.block = 0;
		this.io->completion_q.stats.completed_avg.val = 0;
		this.io->completion_q.stats.completed_avg.slow_cnt = 0;
		this.io->completion_q.stats.completed_avg.fast_cnt = 0;
	#endif

	if(!main_cluster) {
		__kernel_io_finish_start( this );
	}
}
	354
	355	void __kernel_io_finish_start( cluster & this ) {
[b6f2b213]	356	if( this.io->cltr_flags & CFA_CLUSTER_IO_POLLER_USER_THREAD ) {
	357	__cfadbg_print_safe(io_core, "Kernel I/O : Creating fast poller for cluter %p\n", &this);
	358	(this.io->poller.fast){ this };
	359	__thrd_start( this.io->poller.fast, main );
	360	}
[f6660520]	361
[92976d9]	362	// Create the poller thread
[0a805f2]	363	__cfadbg_print_safe(io_core, "Kernel I/O : Creating slow poller for cluter %p\n", &this);
[5c581cc]	364	this.io->poller.slow.blocked = false;
[61dd73d]	365	this.io->poller.slow.stack = __create_pthread( &this.io->poller.slow.kthrd, __io_poller_slow, &this );
[92976d9]	366	}
	367
// Stop the pollers of a cluster.
// The slow poller is signalled, joined and its stack freed. When the cluster
// uses a fast poller, that thread is brought back to a parked state if needed,
// moved to the active cluster, unparked so it can observe `done`, and destroyed.
void __kernel_io_prepare_stop( cluster & this ) {
	__cfadbg_print_safe(io_core, "Kernel I/O : Stopping pollers for cluster\n", &this);
	// Notify the poller thread of the shutdown
	__atomic_store_n(&this.io->done, true, __ATOMIC_SEQ_CST);

	// Stop the IO Poller
	// The signal interrupts a blocked io_uring_enter, the post releases a poller waiting on the semaphore
	sigval val = { 1 };
	pthread_sigqueue( this.io->poller.slow.kthrd, SIGUSR1, val );
	post( this.io->poller.sem );

	// Wait for the poller thread to finish
	pthread_join( this.io->poller.slow.kthrd, 0p );
	free( this.io->poller.slow.stack );

	__cfadbg_print_safe(io_core, "Kernel I/O : Slow poller stopped for cluster\n", &this);

	if( this.io->cltr_flags & CFA_CLUSTER_IO_POLLER_USER_THREAD ) {
		with( this.io->poller.fast ) {
			/* paranoid */ verify( this.procs.head == 0p || &this == mainCluster );
			/* paranoid */ verify( this.idles.head == 0p || &this == mainCluster );

			// We need to adjust the clean-up based on where the thread is
			if( thrd.state == Ready || thrd.preempted != __NO_PREEMPTION ) {

				// This is the tricky case
				// The thread was preempted and now it is on the ready queue

				/* paranoid */ verify( thrd.next != 0p );                // The thread should be the last on the list
				/* paranoid */ verify( this.ready_queue.head == &thrd ); // The thread should be the only thing on the list

				// Remove the thread from the ready queue of this cluster
				this.ready_queue.head = 1p;
				thrd.next = 0p;
				__cfaabi_dbg_debug_do( thrd.unpark_stale = true );

				// Fixup the thread state
				thrd.state = Blocked;
				thrd.preempted = __NO_PREEMPTION;

				// Pretend like the thread was blocked all along
			}
			// !!! This is not an else if !!!
			// The fix-up above falls through to here on purpose
			if( thrd.state == Blocked ) {

				// This is the "easy case"
				// The thread is parked and can easily be moved to active cluster
				verify( thrd.curr_cluster != active_cluster() || thrd.curr_cluster == mainCluster );
				thrd.curr_cluster = active_cluster();

				// unpark the fast io_poller
				unpark( &thrd __cfaabi_dbg_ctx2 );
			}
			else {

				// The thread is in a weird state
				// I don't know what to do here
				abort("Fast poller thread is in unexpected state, cannot clean-up correctly\n");
			}

		}

		^(this.io->poller.fast){};

		__cfadbg_print_safe(io_core, "Kernel I/O : Fast poller stopped for cluster\n", &this);
	}
}
	434
	435	void __kernel_io_shutdown( cluster & this, bool main_cluster ) {
	436	if(!main_cluster) {
	437	__kernel_io_prepare_stop( this );
	438	}
[92976d9]	439
[d384787]	440	// print statistics
[038be32]	441	#if !defined(__CFA_NO_STATISTICS__)
	442	if(this.print_stats) {
[61dd73d]	443	with(this.io->submit_q.stats, this.io->completion_q.stats) {
[6f121b8]	444	double avgrdy = ((double)submit_avg.rdy) / submit_avg.cnt;
	445	double avgcsm = ((double)submit_avg.csm) / submit_avg.cnt;
	446	double avgavl = ((double)submit_avg.avl) / submit_avg.cnt;
	447
[5dadc9b7]	448	double lavgv = 0;
	449	double lavgb = 0;
	450	if(look_avg.cnt != 0) {
	451	lavgv = ((double)look_avg.val ) / look_avg.cnt;
	452	lavgb = ((double)look_avg.block) / look_avg.cnt;
	453	}
	454
[6f121b8]	455	double aavgv = 0;
	456	double aavgb = 0;
	457	if(alloc_avg.cnt != 0) {
	458	aavgv = ((double)alloc_avg.val ) / alloc_avg.cnt;
	459	aavgb = ((double)alloc_avg.block) / alloc_avg.cnt;
	460	}
	461
[068a202]	462	__cfaabi_bits_print_safe( STDOUT_FILENO,
[61dd73d]	463	"----- I/O uRing Stats -----\n"
[5dadc9b7]	464	"- total submit calls : %'15llu\n"
[6f121b8]	465	"- avg ready entries : %'18.2lf\n"
	466	"- avg submitted entries : %'18.2lf\n"
	467	"- avg available entries : %'18.2lf\n"
[5dadc9b7]	468	"- total ready search : %'15llu\n"
	469	"- avg ready search len : %'18.2lf\n"
	470	"- avg ready search block : %'18.2lf\n"
[6f121b8]	471	"- total alloc search : %'15llu\n"
	472	"- avg alloc search len : %'18.2lf\n"
	473	"- avg alloc search block : %'18.2lf\n"
[5dadc9b7]	474	"- total wait calls : %'15llu (%'llu slow, %'llu fast)\n"
	475	"- avg completion/wait : %'18.2lf\n",
[61dd73d]	476	submit_avg.cnt,
[6f121b8]	477	avgrdy,
	478	avgcsm,
	479	avgavl,
[5dadc9b7]	480	look_avg.cnt,
	481	lavgv,
	482	lavgb,
[6f121b8]	483	alloc_avg.cnt,
	484	aavgv,
	485	aavgb,
[61dd73d]	486	completed_avg.slow_cnt + completed_avg.fast_cnt,
	487	completed_avg.slow_cnt, completed_avg.fast_cnt,
	488	((double)completed_avg.val) / (completed_avg.slow_cnt + completed_avg.fast_cnt)
	489	);
	490	}
[038be32]	491	}
	492	#endif
[d384787]	493
[92976d9]	494	// Shutdown the io rings
[61dd73d]	495	struct __submition_data & sq = this.io->submit_q;
	496	struct __completion_data & cq = this.io->completion_q;
[92976d9]	497
	498	// unmap the submit queue entries
[2d8f7b0]	499	munmap(sq.sqes, (sq.num) sizeof(struct io_uring_sqe));
[92976d9]	500
	501	// unmap the Submit Queue ring
	502	munmap(sq.ring_ptr, sq.ring_sz);
	503
	504	// unmap the Completion Queue ring, if it is different
	505	if (cq.ring_ptr != sq.ring_ptr) {
	506	munmap(cq.ring_ptr, cq.ring_sz);
	507	}
	508
	509	// close the file descriptor
[61dd73d]	510	close(this.io->fd);
	511
[5dadc9b7]	512	free( this.io->submit_q.ready ); // Maybe null, doesn't matter
[61dd73d]	513	free( this.io );
[92976d9]	514	}
	515
	516	//=============================================================================================
	517	// I/O Polling
	518	//=============================================================================================
	519	struct io_user_data {
	520	int32_t result;
	521	$thread * thrd;
	522	};
	523
	524	// Process a single completion message from the io_uring
	525	// This is NOT thread-safe
[5dadc9b7]	526	static [int, bool] __drain_io( & struct __io_data ring, * sigset_t mask, int waitcnt, bool in_kernel ) {
	527	unsigned to_submit = 0;
	528	if( ring.cltr_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
	529
	530	// If the poller thread also submits, then we need to aggregate the submissions which are ready
[6f121b8]	531	uint32_t tail = *ring.submit_q.tail;
[5dadc9b7]	532	const uint32_t mask = *ring.submit_q.mask;
	533
	534	// Go through the list of ready submissions
	535	for( i; ring.submit_q.ready_cnt ) {
	536	// replace any submission with the sentinel, to consume it.
	537	uint32_t idx = __atomic_exchange_n( &ring.submit_q.ready[i], -1ul32, __ATOMIC_RELAXED);
	538
	539	// If it was already the sentinel, then we are done
	540	if( idx == -1ul32 ) continue;
	541
	542	// If we got a real submission, append it to the list
[6f121b8]	543	ring.submit_q.array[ (tail + to_submit) & mask ] = idx & mask;
[5dadc9b7]	544	to_submit++;
	545	}
	546
	547	// Increment the tail based on how many we are ready to submit
[6f121b8]	548	__atomic_fetch_add(ring.submit_q.tail, to_submit, __ATOMIC_SEQ_CST);
[5dadc9b7]	549	}
	550
[6f121b8]	551	const uint32_t smask = *ring.submit_q.mask;
	552	uint32_t shead = *ring.submit_q.head;
[5dadc9b7]	553	int ret = syscall( __NR_io_uring_enter, ring.fd, to_submit, waitcnt, IORING_ENTER_GETEVENTS, mask, _NSIG / 8);
[d384787]	554	if( ret < 0 ) {
	555	switch((int)errno) {
	556	case EAGAIN:
	557	case EINTR:
	558	return -EAGAIN;
	559	default:
	560	abort( "KERNEL ERROR: IO_URING WAIT - %s\n", strerror(errno) );
	561	}
	562	}
	563
[6f121b8]	564	verify( (shead + ret) == *ring.submit_q.head );
	565
	566	// Release the consumed SQEs
	567	for( i; ret ) {
	568	uint32_t idx = ring.submit_q.array[ (i + shead) & smask ];
	569	ring.submit_q.sqes[ idx ].user_data = 0;
	570	}
	571
	572	uint32_t avail = 0;
	573	uint32_t sqe_num = *ring.submit_q.num;
	574	for(i; sqe_num) {
	575	if( ring.submit_q.sqes[ i ].user_data == 0 ) avail++;
	576	}
	577
	578	// update statistics
	579	#if !defined(__CFA_NO_STATISTICS__)
	580	ring.submit_q.stats.submit_avg.rdy += to_submit;
	581	ring.submit_q.stats.submit_avg.csm += ret;
	582	ring.submit_q.stats.submit_avg.avl += avail;
	583	ring.submit_q.stats.submit_avg.cnt += 1;
	584	#endif
	585
[d384787]	586	// Drain the queue
[92976d9]	587	unsigned head = *ring.completion_q.head;
[6f121b8]	588	unsigned tail = *ring.completion_q.tail;
	589	const uint32_t mask = *ring.completion_q.mask;
	590
	591	// Memory barrier
	592	__atomic_thread_fence( __ATOMIC_SEQ_CST );
[92976d9]	593
[d384787]	594	// Nothing was new return 0
	595	if (head == tail) {
	596	return 0;
	597	}
[92976d9]	598
[d384787]	599	uint32_t count = tail - head;
	600	for(i; count) {
[6f121b8]	601	unsigned idx = (head + i) & mask;
[d384787]	602	struct io_uring_cqe & cqe = ring.completion_q.cqes[idx];
[92976d9]	603
[d384787]	604	/* paranoid */ verify(&cqe);
[92976d9]	605
[d384787]	606	struct io_user_data * data = (struct io_user_data *)cqe.user_data;
[4069faad]	607	__cfadbg_print_safe( io, "Kernel I/O : Performed reading io cqe %p, result %d for %p\n", data, cqe.res, data->thrd );
[2d8f7b0]	608
[d384787]	609	data->result = cqe.res;
[f6660520]	610	if(!in_kernel) { unpark( data->thrd __cfaabi_dbg_ctx2 ); }
	611	else { __unpark( data->thrd __cfaabi_dbg_ctx2 ); }
[d384787]	612	}
[2d8f7b0]	613
	614	// Allow new submissions to happen
[6f121b8]	615	// V(ring.submit, count);
[92976d9]	616
	617	// Mark to the kernel that the cqe has been seen
	618	// Ensure that the kernel only sees the new value of the head index after the CQEs have been read.
[6f121b8]	619	__atomic_thread_fence( __ATOMIC_SEQ_CST );
[d384787]	620	__atomic_fetch_add( ring.completion_q.head, count, __ATOMIC_RELAXED );
[92976d9]	621
[5dadc9b7]	622	return [count, count > 0 \|\| to_submit > 0];
[92976d9]	623	}
	624
// Body of the slow poller pthread.
//
// arg : the cluster whose ring is polled, passed as `cluster *`
//
// Blocks in io_uring_enter (through __drain_io, waiting for at least one
// completion) until the `done` flag of the ring is set. With
// CFA_CLUSTER_IO_POLLER_USER_THREAD, every productive drain hands the ring over
// to the fast poller and waits on the poller semaphore before polling again.
// Always returns 0p.
static void * __io_poller_slow( void * arg ) {
	cluster * cltr = (cluster *)arg;
	struct __io_data & ring = *cltr->io;

	// Block every signal in this thread ...
	// (pthread_sigmask returns 0 on success and an error number on failure, never -1)
	sigset_t mask;
	sigfillset(&mask);
	if ( pthread_sigmask( SIG_BLOCK, &mask, 0p ) != 0 ) {
		abort( "KERNEL ERROR: IO_URING - pthread_sigmask" );
	}

	// ... but let SIGUSR1 through while waiting in io_uring_enter, it is used to wake this thread
	sigdelset( &mask, SIGUSR1 );

	verify( (*ring.submit_q.head) == (*ring.submit_q.tail) );
	verify( (*ring.completion_q.head) == (*ring.completion_q.tail) );

	__cfadbg_print_safe(io_core, "Kernel I/O : Slow poller for ring %p ready\n", &ring);

	if( ring.cltr_flags & CFA_CLUSTER_IO_POLLER_USER_THREAD ) {
		while(!__atomic_load_n(&ring.done, __ATOMIC_SEQ_CST)) {

			// Tell __wake_poller that a signal is needed to get our attention
			__atomic_store_n( &ring.poller.slow.blocked, true, __ATOMIC_SEQ_CST );

			// In the user-thread approach drain and if anything was drained,
			// baton pass to the user-thread
			int count;
			bool again;
			[count, again] = __drain_io( ring, &mask, 1, true );

			__atomic_store_n( &ring.poller.slow.blocked, false, __ATOMIC_SEQ_CST );

			// Update statistics
			#if !defined(__CFA_NO_STATISTICS__)
				ring.completion_q.stats.completed_avg.val += count;
				ring.completion_q.stats.completed_avg.slow_cnt += 1;
			#endif

			if(again) {
				__cfadbg_print_safe(io_core, "Kernel I/O : Moving to ring %p to fast poller\n", &ring);
				__unpark( &ring.poller.fast.thrd __cfaabi_dbg_ctx2 );

				// Sleep until the fast poller (or the shutdown) posts the semaphore
				wait( ring.poller.sem );
			}
		}
	}
	else {
		while(!__atomic_load_n(&ring.done, __ATOMIC_SEQ_CST)) {
			//In the naive approach, just poll the io completion queue directly
			int count;
			bool again;
			[count, again] = __drain_io( ring, &mask, 1, true );

			// Update statistics
			#if !defined(__CFA_NO_STATISTICS__)
				ring.completion_q.stats.completed_avg.val += count;
				ring.completion_q.stats.completed_avg.slow_cnt += 1;
			#endif
		}
	}

	__cfadbg_print_safe(io_core, "Kernel I/O : Slow poller for ring %p stopping\n", &ring);

	return 0p;
}
	687
// Body of the fast poller user-thread.
// Starts parked. Once unparked, it drains the ring without blocking
// (waitcnt of 0) and yields between polls. After 5 polls that did no work it
// wakes the slow poller through the semaphore and parks itself again.
// The loop ends once the `done` flag of the ring is set.
void main( __io_poller_fast & this ) {
	verify( this.ring->cltr_flags & CFA_CLUSTER_IO_POLLER_USER_THREAD );

	// Start parked
	park( __cfaabi_dbg_ctx );

	__cfadbg_print_safe(io_core, "Kernel I/O : Fast poller for ring %p ready\n", this.ring);

	// Number of polls that did no work since the last hand-over to the slow poller
	int reset = 0;

	// Then loop until we need to start
	while(!__atomic_load_n(&this.ring->done, __ATOMIC_SEQ_CST)) {

		// Drain the io
		int count;
		bool again;
		[count, again] = __drain_io( *this.ring, 0p, 0, false );

		if(!again) reset++;

		// Update statistics
		#if !defined(__CFA_NO_STATISTICS__)
			this.ring->completion_q.stats.completed_avg.val += count;
			this.ring->completion_q.stats.completed_avg.fast_cnt += 1;
		#endif

		// As long as fewer than 5 polls came back empty, just yield and check again
		if(reset < 5) {
			yield();
		}
		// We didn't get anything baton pass to the slow poller
		else {
			__cfadbg_print_safe(io_core, "Kernel I/O : Moving to ring %p to slow poller\n", this.ring);
			reset = 0;

			// wake up the slow poller
			post( this.ring->poller.sem );

			// park this thread
			park( __cfaabi_dbg_ctx );
		}
	}

	__cfadbg_print_safe(io_core, "Kernel I/O : Fast poller for ring %p stopping\n", this.ring);
}
[f6660520]	733
// Interrupt the slow poller with SIGUSR1, but only when it has flagged itself
// as blocked (i.e. it is inside __drain_io); otherwise do nothing.
static inline void __wake_poller( struct __io_data & ring ) __attribute__((artificial));
static inline void __wake_poller( struct __io_data & ring ) {
	if(!__atomic_load_n( &ring.poller.slow.blocked, __ATOMIC_SEQ_CST)) return;

	sigval val = { 1 };
	pthread_sigqueue( ring.poller.slow.kthrd, SIGUSR1, val );
}
	741
[92976d9]	742	//=============================================================================================
	743	// I/O Submissions
	744	//=============================================================================================
	745
// Submission steps :
// 1 - We need to make sure we don't overflow any of the buffers, P(ring.submit) to make sure
//     entries are available. The semaphore makes sure that there are no more operations in
//     progress than the number of entries in the buffer. This probably limits concurrency
//     more than necessary since submitted but not completed operations don't need any
//     entries in user space. However, I don't know what happens if we overflow the buffers
//     because too many requests completed at once. This is a safe approach in all cases.
//     Furthermore, with hundreds of entries, this may be okay.
//
// 2 - Allocate a queue entry. The ring already has memory for all entries but only the ones
//     listed in sq.array are visible by the kernel. For those not listed, the kernel does not
//     offer any assurance that an entry is not being filled by multiple threads. Therefore, we
//     need to write an allocator that allows allocating concurrently.
//
// 3 - Actually fill the submit entry, this is the only simple and straightforward step.
//
// 4 - Append the entry index to the array and adjust the tail accordingly. This operation
//     needs to reach two consensuses at the same time:
//     A - The order in which entries are listed in the array: no two threads must pick the
//         same index for their entries
//     B - When the tail can be updated for the kernel. EVERY entry in the array between
//         head and tail must be fully filled and shouldn't ever be touched again.
//
	769
// Allocate a free submission entry (step 2 of the submission protocol).
//
// ring : ring to allocate from
// data : value stored in the `user_data` of the entry to claim it, must not be 0
//        since 0 marks a free entry
//
// Returns [sqe, idx]: the claimed entry and its index in the sqes buffer.
// Never fails: the search restarts, after a yield, until an entry is free.
static inline [* struct io_uring_sqe, uint32_t] __submit_alloc( struct __io_data & ring, uint64_t data ) {
	verify( data != 0 );

	// Prepare the data we need
	// (len and block only feed the statistics)
	__attribute((unused)) int len   = 0;
	__attribute((unused)) int block = 0;
	uint32_t cnt = *ring.submit_q.num;
	uint32_t mask = *ring.submit_q.mask;

	// Random starting point so that concurrent allocations spread over the buffer
	uint32_t off = __tls_rand();

	// Loop around looking for an available spot
	LOOKING: for() {
		// Look through the list starting at some offset
		for(i; cnt) {
			uint64_t expected = 0;
			uint32_t idx = (i + off) & mask;
			struct io_uring_sqe * sqe = &ring.submit_q.sqes[idx];
			volatile uint64_t * udata = &sqe->user_data;

			// Plain read first, the compare-and-swap is only attempted on entries that look free
			if( *udata == expected &&
				__atomic_compare_exchange_n( udata, &expected, data, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED ) )
			{
				// update statistics
				#if !defined(__CFA_NO_STATISTICS__)
					__atomic_fetch_add( &ring.submit_q.stats.alloc_avg.val,   len,   __ATOMIC_RELAXED );
					__atomic_fetch_add( &ring.submit_q.stats.alloc_avg.block, block, __ATOMIC_RELAXED );
					__atomic_fetch_add( &ring.submit_q.stats.alloc_avg.cnt,   1,     __ATOMIC_RELAXED );
				#endif

				// Success return the data
				return [sqe, idx];
			}
			verify(expected != data);

			len ++;
		}

		// Went through every entry without finding a free one, let other threads run
		block++;
		yield();
	}
}
	811
// Hand a filled submission entry over to the kernel (step 4 of the submission protocol).
//
// ring : ring the entry belongs to
// idx  : index of the entry in the sqes buffer, as returned by __submit_alloc
//
// With CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS the index is published in the ready
// array and the slow poller is woken up to submit it. Otherwise the index is
// appended to the kernel array and submitted with io_uring_enter, all under
// the submit lock. A failure of io_uring_enter aborts the program.
static inline void __submit( struct __io_data & ring, uint32_t idx ) {
	// Get now the data we definitely need
	volatile uint32_t * const tail = ring.submit_q.tail;
	const uint32_t mask = *ring.submit_q.mask;

	// There are 2 submission schemes, check which one we are using
	if( ring.cltr_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
		// If the poller thread submits, then we just need to add this to the ready array

		/* paranoid */ verify( idx <= mask   );
		/* paranoid */ verify( idx != -1ul32 );

		// We need to find a spot in the ready array
		// (len and block only feed the statistics)
		__attribute((unused)) int len   = 0;
		__attribute((unused)) int block = 0;

		// ready_cnt is a power of 2, see __kernel_io_startup
		uint32_t ready_mask = ring.submit_q.ready_cnt - 1;
		uint32_t off = __tls_rand();
		LOOKING: for() {
			for(i; ring.submit_q.ready_cnt) {
				uint32_t ii = (i + off) & ready_mask;

				// Claim the slot if it holds the "empty" sentinel
				uint32_t expected = -1ul32;
				if( __atomic_compare_exchange_n( &ring.submit_q.ready[ii], &expected, idx, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED ) ) {
					break LOOKING;
				}
				verify(expected != idx);

				len ++;
			}

			// No free slot, let other threads run before searching again
			block++;
			yield();
		}

		__wake_poller( ring );

		// update statistics
		#if !defined(__CFA_NO_STATISTICS__)
			__atomic_fetch_add( &ring.submit_q.stats.look_avg.val,   len,   __ATOMIC_RELAXED );
			__atomic_fetch_add( &ring.submit_q.stats.look_avg.block, block, __ATOMIC_RELAXED );
			__atomic_fetch_add( &ring.submit_q.stats.look_avg.cnt,   1,     __ATOMIC_RELAXED );
		#endif

		__cfadbg_print_safe( io, "Kernel I/O : Added %u to ready for %p\n", idx, active_thread() );
	}
	else {
		// get mutual exclusion
		lock(ring.submit_q.lock __cfaabi_dbg_ctx2);

		// Append to the list of ready entries

		/* paranoid */ verify( idx <= mask );

		ring.submit_q.array[ (*tail) & mask ] = idx & mask;
		__atomic_fetch_add(tail, 1ul32, __ATOMIC_SEQ_CST);

		// Submit the entry that was just appended
		int ret = syscall( __NR_io_uring_enter, ring.fd, 1, 0, 0, 0p, 0);
		if( ret < 0 ) {
			abort( "KERNEL ERROR: IO_URING SUBMIT - %s\n", strerror(errno) );
		}

		// update statistics
		#if !defined(__CFA_NO_STATISTICS__)
			ring.submit_q.stats.submit_avg.csm += 1;
			ring.submit_q.stats.submit_avg.cnt += 1;
		#endif

		unlock(ring.submit_q.lock);

		__cfadbg_print_safe( io, "Kernel I/O : Performed io_submit for %p, returned %d\n", active_thread(), ret );
	}
}
	887
	// Constructor : reset a submission-queue entry for operation `opcode` on descriptor `fd`.
	// Every other field written here is cleared; `user_data` is deliberately left untouched.
	// When the kernel headers provide IOSQE_ASYNC, the entry is flagged with it.
	static inline void ?{}(struct io_uring_sqe & this, uint8_t opcode, int fd) {
		this.opcode = opcode;
		#if !defined(IOSQE_ASYNC)
			this.flags = 0;
		#else
			this.flags = IOSQE_ASYNC;
		#endif
		this.ioprio = 0;
		this.fd = fd;
		this.off = 0;
		this.addr = 0;
		this.len = 0;
		this.rw_flags = 0;
		this.__pad2[0] = this.__pad2[1] = this.__pad2[2] = 0;
	}
	903
// Constructor : same as the (opcode, fd) constructor, then fill in the buffer description.
// addr : user buffer (stored as a 64-bit integer), len : its length, off : file offset
static inline void ?{}(struct io_uring_sqe & this, uint8_t opcode, int fd, void * addr, uint32_t len, uint64_t off ) {
	(this){ opcode, fd };
	this.off = off;
	this.addr = (uint64_t)addr;
	this.len = len;
}
[f6660520]	910
[92976d9]	911
	912	//=============================================================================================
	913	// I/O Interface
	914	//=============================================================================================
[f6660520]	915
// First half of every asynchronous wrapper :
// declares the per-request user data (result slot + current thread), looks up the ring of
// the thread's cluster and allocates a submission entry tagged with the address of `data`.
// Introduces the locals `data`, `ring`, `sqe` and `idx` into the enclosing function.
#define __submit_prelude \
	io_user_data data = { 0, active_thread() }; \
	struct __io_data & ring = *data.thrd->curr_cluster->io; \
	struct io_uring_sqe * sqe; \
	uint32_t idx; \
	[sqe, idx] = __submit_alloc( ring, (uint64_t)&data );

// Second half of every asynchronous wrapper :
// checks the entry is still tagged with `data`, submits it, parks the calling thread and,
// once resumed, returns `data.result` from the enclosing function.
// NOTE(review): presumably the poller stores the result and unparks the thread, confirm.
#define __submit_wait \
	/*__cfaabi_bits_print_safe( STDERR_FILENO, "Preparing user data %p for %p\n", &data, data.thrd );*/ \
	verify( sqe->user_data == (uint64_t)&data ); \
	__submit( ring, idx ); \
	park( __cfaabi_dbg_ctx ); \
	return data.result;
[ecf6b46]	929	#endif
[2d8f7b0]	930
// Some forward declarations
// Prototypes of the blocking libc calls the cfa_* wrappers fall back to.
extern "C" {
	#include <unistd.h>
	#include <sys/types.h>
	#include <sys/socket.h>
	#include <sys/syscall.h>

#if defined(HAVE_PREADV2)
	struct iovec;
	extern ssize_t preadv2 (int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
#endif
#if defined(HAVE_PWRITEV2)
	struct iovec;
	extern ssize_t pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
#endif

	extern int fsync(int fd);
	extern int sync_file_range(int fd, int64_t offset, int64_t nbytes, unsigned int flags);

	struct msghdr;
	struct sockaddr;
	extern ssize_t sendmsg(int sockfd, const struct msghdr *msg, int flags);
	extern ssize_t recvmsg(int sockfd, struct msghdr *msg, int flags);
	extern ssize_t send(int sockfd, const void *buf, size_t len, int flags);
	extern ssize_t recv(int sockfd, void *buf, size_t len, int flags);
	extern int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
	extern int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen);

	extern int fallocate(int fd, int mode, uint64_t offset, uint64_t len);
	extern int posix_fadvise(int fd, uint64_t offset, uint64_t len, int advice);
	extern int madvise(void *addr, size_t length, int advice);

	extern int openat(int dirfd, const char *pathname, int flags, mode_t mode);
	extern int close(int fd);

	extern ssize_t read (int fd, void *buf, size_t count);
	extern ssize_t write(int fd, const void *buf, size_t count);
}
	968
[2d8f7b0]	969	//-----------------------------------------------------------------------------
	970	// Asynchronous operations
#if defined(HAVE_PREADV2)
	// Vectored read at `offset`, same contract as preadv2(2).
	// Without io_uring support for IORING_OP_READV this is a plain blocking preadv2 call,
	// otherwise the request is queued on the cluster's ring and the calling thread is parked
	// until its result is available.
	ssize_t cfa_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags) {
		#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_READV)
			return preadv2(fd, iov, iovcnt, offset, flags);
		#else
			__submit_prelude

			(*sqe){ IORING_OP_READV, fd, iov, iovcnt, offset };
			// Forward the per-call RWF_* flags, the constructor resets them to 0
			sqe->rw_flags = flags;

			__submit_wait
		#endif
	}
#endif
[ecf6b46]	984
#if defined(HAVE_PWRITEV2)
	// Vectored write at `offset`, same contract as pwritev2(2).
	// Without io_uring support for IORING_OP_WRITEV this is a plain blocking pwritev2 call,
	// otherwise the request is queued on the cluster's ring and the calling thread is parked
	// until its result is available.
	ssize_t cfa_pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags) {
		#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_WRITEV)
			return pwritev2(fd, iov, iovcnt, offset, flags);
		#else
			__submit_prelude

			(*sqe){ IORING_OP_WRITEV, fd, iov, iovcnt, offset };
			// Forward the per-call RWF_* flags, the constructor resets them to 0
			sqe->rw_flags = flags;

			__submit_wait
		#endif
	}
#endif
[ecf6b46]	998
	// Flush `fd` to stable storage, same contract as fsync(2).
	// Falls back to the blocking call when IORING_OP_FSYNC is not available.
	int cfa_fsync(int fd) {
		#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_FSYNC)
			return fsync(fd);
		#else
			__submit_prelude

			(*sqe){ IORING_OP_FSYNC, fd };

			__submit_wait
		#endif
	}
	1010
	// Sync a byte range of `fd`, same contract as sync_file_range(2).
	// Falls back to the blocking call when IORING_OP_SYNC_FILE_RANGE is not available.
	int cfa_sync_file_range(int fd, int64_t offset, int64_t nbytes, unsigned int flags) {
		#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_SYNC_FILE_RANGE)
			return sync_file_range(fd, offset, nbytes, flags);
		#else
			__submit_prelude

			(*sqe){ IORING_OP_SYNC_FILE_RANGE, fd };
			sqe->off = offset;
			// NOTE(review): sqe->len is the entry's length field (uint32_t in the constructor above),
			// so a 64-bit nbytes is narrowed here
			sqe->len = nbytes;
			sqe->sync_range_flags = flags;

			__submit_wait
		#endif
	}
	1025
	1026
	// Send a message on a socket, same contract as sendmsg(2).
	// Falls back to the blocking call when IORING_OP_SENDMSG is not available.
	ssize_t cfa_sendmsg(int sockfd, const struct msghdr *msg, int flags) {
		#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_SENDMSG)
			return sendmsg(sockfd, msg, flags);
		#else
			__submit_prelude

			// addr points to the single msghdr, hence a length of 1
			(*sqe){ IORING_OP_SENDMSG, sockfd, msg, 1, 0 };
			sqe->msg_flags = flags;

			__submit_wait
		#endif
	}
	1039
	// Receive a message from a socket, same contract as recvmsg(2).
	// Falls back to the blocking call when IORING_OP_RECVMSG is not available.
	ssize_t cfa_recvmsg(int sockfd, struct msghdr *msg, int flags) {
		#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_RECVMSG)
			return recvmsg(sockfd, msg, flags);
		#else
			__submit_prelude

			// addr points to the single msghdr, hence a length of 1
			(*sqe){ IORING_OP_RECVMSG, sockfd, msg, 1, 0 };
			sqe->msg_flags = flags;

			__submit_wait
		#endif
	}
	1052
	// Send `len` bytes from `buf` on a socket, same contract as send(2).
	// Falls back to the blocking call when IORING_OP_SEND is not available.
	ssize_t cfa_send(int sockfd, const void *buf, size_t len, int flags) {
		#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_SEND)
			return send( sockfd, buf, len, flags );
		#else
			__submit_prelude

			(*sqe){ IORING_OP_SEND, sockfd };
			sqe->addr = (uint64_t)buf;
			sqe->len = len;
			sqe->msg_flags = flags;

			__submit_wait
		#endif
	}
	1067
	// Receive up to `len` bytes into `buf` from a socket, same contract as recv(2).
	// Falls back to the blocking call when IORING_OP_RECV is not available.
	ssize_t cfa_recv(int sockfd, void *buf, size_t len, int flags) {
		#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_RECV)
			return recv( sockfd, buf, len, flags );
		#else
			__submit_prelude

			(*sqe){ IORING_OP_RECV, sockfd };
			sqe->addr = (uint64_t)buf;
			sqe->len = len;
			sqe->msg_flags = flags;

			__submit_wait
		#endif
	}
	1082
	// Accept a connection on a listening socket, same contract as accept4(2).
	// addr / addrlen : optional out-parameters receiving the peer address and its length
	// Falls back to the blocking call when IORING_OP_ACCEPT is not available.
	int cfa_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags) {
		#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_ACCEPT)
			return accept4( sockfd, addr, addrlen, flags );
		#else
			__submit_prelude

			(*sqe){ IORING_OP_ACCEPT, sockfd };
			// Both pointers travel as 64-bit integers in the entry
			sqe->addr = (uint64_t)addr;
			sqe->addr2 = (uint64_t)addrlen;
			sqe->accept_flags = flags;

			__submit_wait
		#endif
	}
	1097
	// Connect a socket to `addr`, same contract as connect(2).
	// Falls back to the blocking call when IORING_OP_CONNECT is not available.
	int cfa_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen) {
		#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_CONNECT)
			return connect( sockfd, addr, addrlen );
		#else
			__submit_prelude

			(*sqe){ IORING_OP_CONNECT, sockfd };
			sqe->addr = (uint64_t)addr;
			// For this operation the address length is carried in the offset field
			sqe->off = addrlen;

			__submit_wait
		#endif
	}
	1111
	// Manipulate the space allocated to `fd`, same contract as fallocate(2).
	// Falls back to the blocking call when IORING_OP_FALLOCATE is not available.
	int cfa_fallocate(int fd, int mode, uint64_t offset, uint64_t len) {
		#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_FALLOCATE)
			return fallocate( fd, mode, offset, len );
		#else
			__submit_prelude

			(*sqe){ IORING_OP_FALLOCATE, fd };
			// Field layout expected by the kernel for this operation :
			// the 64-bit byte count goes in `addr`, the mode in the 32-bit `len` field
			sqe->off = offset;
			sqe->len = mode;
			sqe->addr = len;

			__submit_wait
		#endif
	}
	1126
	// Declare an access pattern for a range of `fd`, same contract as posix_fadvise(2).
	// Falls back to the blocking call when IORING_OP_FADVISE is not available.
	int cfa_fadvise(int fd, uint64_t offset, uint64_t len, int advice) {
		#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_FADVISE)
			return posix_fadvise( fd, offset, len, advice );
		#else
			__submit_prelude

			(*sqe){ IORING_OP_FADVISE, fd };
			sqe->off = (uint64_t)offset;
			// NOTE(review): sqe->len is the entry's length field (uint32_t in the constructor above),
			// so a 64-bit len is narrowed here
			sqe->len = len;
			sqe->fadvise_advice = advice;

			__submit_wait
		#endif
	}
	1141
	// Give advice about the use of a memory range, same contract as madvise(2).
	// Falls back to the blocking call when IORING_OP_MADVISE is not available.
	int cfa_madvise(void *addr, size_t length, int advice) {
		#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_MADVISE)
			return madvise( addr, length, advice );
		#else
			__submit_prelude

			// No file is involved, the descriptor field is simply set to 0
			(*sqe){ IORING_OP_MADVISE, 0 };
			sqe->addr = (uint64_t)addr;
			sqe->len = length;
			// The advice travels in the same field as for IORING_OP_FADVISE
			sqe->fadvise_advice = advice;

			__submit_wait
		#endif
	}
	1156
	// Open `pathname` relative to `dirfd`, same contract as openat(2).
	// Falls back to the blocking call when IORING_OP_OPENAT is not available.
	int cfa_openat(int dirfd, const char *pathname, int flags, mode_t mode) {
		#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_OPENAT)
			return openat( dirfd, pathname, flags, mode );
		#else
			__submit_prelude

			(*sqe){ IORING_OP_OPENAT, dirfd };
			sqe->addr = (uint64_t)pathname;
			sqe->open_flags = flags;
			// For this operation the creation mode is carried in the `len` field
			sqe->len = mode;

			__submit_wait
		#endif
	}
	1171
	// Close a file descriptor, same contract as close(2).
	// Falls back to the blocking call when IORING_OP_CLOSE is not available.
	int cfa_close(int fd) {
		#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_CLOSE)
			return close( fd );
		#else
			__submit_prelude

			(*sqe){ IORING_OP_CLOSE, fd };

			__submit_wait
		#endif
	}
	1183
	1184
	// Read up to `count` bytes from `fd` into `buf`, same contract as read(2) on the fallback path.
	// Falls back to the blocking call when IORING_OP_READ is not available.
	ssize_t cfa_read(int fd, void *buf, size_t count) {
		#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_READ)
			return read( fd, buf, count );
		#else
			__submit_prelude

			// NOTE(review): the request is queued with an explicit offset of 0,
			// confirm this is the intended behaviour for seekable files
			(*sqe){ IORING_OP_READ, fd, buf, count, 0 };

			__submit_wait
		#endif
	}
	1196
	// Write up to `count` bytes from `buf` to `fd`, same contract as write(2) on the fallback path.
	// Falls back to the blocking call when IORING_OP_WRITE is not available.
	ssize_t cfa_write(int fd, void *buf, size_t count) {
		#if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_WRITE)
			// The fallback must write, it used to call read() by mistake
			return write( fd, buf, count );
		#else
			__submit_prelude

			// NOTE(review): the request is queued with an explicit offset of 0,
			// confirm this is the intended behaviour for seekable files
			(*sqe){ IORING_OP_WRITE, fd, buf, count, 0 };

			__submit_wait
		#endif
	}
[2d8f7b0]	1208
	1209	//-----------------------------------------------------------------------------
	1210	// Check if a function is asynchronous
	1211
	1212	// Macro magic to reduce the size of the following switch case
// IS_DEFINED(macro) expands to `true` when `macro` is defined as a macro, `false` otherwise.
// It relies on a helper `_CFA_IO_FEATURE_<macro>` being defined as a lone comma beforehand :
//  - `macro` undefined : the pasted helper name expands to `,` which shifts `false` into
//    second position of the argument list
//  - `macro` defined   : it is expanded before pasting, the pasted name is not a helper,
//    no comma is inserted and `true` stays in second position
// The trailing 0 only guarantees the variadic part of IS_DEFINED_SECOND is never empty.
#define IS_DEFINED_APPLY(f, ...) f(__VA_ARGS__)
#define IS_DEFINED_SECOND(first, second, ...) second
#define IS_DEFINED_TEST(expansion) _CFA_IO_FEATURE_##expansion
#define IS_DEFINED(macro) IS_DEFINED_APPLY( IS_DEFINED_SECOND,IS_DEFINED_TEST(macro) false, true, 0)
[2d8f7b0]	1217
// Tell whether calling `func` blocks only the calling user-level thread.
// `func` is compared against each cfa_* wrapper; on a match the answer is whether the
// matching IORING_OP_* macro is defined, as reported by IS_DEFINED.
// The comparisons against the plain libc functions are intentionally left commented out.
// Returns false for any other function and whenever io_uring support is compiled out.
bool has_user_level_blocking( fptr_t func ) {
	#if defined(HAVE_LINUX_IO_URING_H)
		#if defined(HAVE_PREADV2)
			if( /*func == (fptr_t)preadv2 || */
				func == (fptr_t)cfa_preadv2 )
				#define _CFA_IO_FEATURE_IORING_OP_READV ,
				return IS_DEFINED(IORING_OP_READV);
		#endif

		#if defined(HAVE_PWRITEV2)
			if( /*func == (fptr_t)pwritev2 || */
				func == (fptr_t)cfa_pwritev2 )
				#define _CFA_IO_FEATURE_IORING_OP_WRITEV ,
				return IS_DEFINED(IORING_OP_WRITEV);
		#endif

		if( /*func == (fptr_t)fsync || */
			func == (fptr_t)cfa_fsync )
			#define _CFA_IO_FEATURE_IORING_OP_FSYNC ,
			return IS_DEFINED(IORING_OP_FSYNC);

		if( /*func == (fptr_t)sync_file_range || */
			func == (fptr_t)cfa_sync_file_range )
			#define _CFA_IO_FEATURE_IORING_OP_SYNC_FILE_RANGE ,
			return IS_DEFINED(IORING_OP_SYNC_FILE_RANGE);

		if( /*func == (fptr_t)sendmsg || */
			func == (fptr_t)cfa_sendmsg )
			#define _CFA_IO_FEATURE_IORING_OP_SENDMSG ,
			return IS_DEFINED(IORING_OP_SENDMSG);

		if( /*func == (fptr_t)recvmsg || */
			func == (fptr_t)cfa_recvmsg )
			#define _CFA_IO_FEATURE_IORING_OP_RECVMSG ,
			return IS_DEFINED(IORING_OP_RECVMSG);

		if( /*func == (fptr_t)send || */
			func == (fptr_t)cfa_send )
			#define _CFA_IO_FEATURE_IORING_OP_SEND ,
			return IS_DEFINED(IORING_OP_SEND);

		if( /*func == (fptr_t)recv || */
			func == (fptr_t)cfa_recv )
			#define _CFA_IO_FEATURE_IORING_OP_RECV ,
			return IS_DEFINED(IORING_OP_RECV);

		if( /*func == (fptr_t)accept4 || */
			func == (fptr_t)cfa_accept4 )
			#define _CFA_IO_FEATURE_IORING_OP_ACCEPT ,
			return IS_DEFINED(IORING_OP_ACCEPT);

		if( /*func == (fptr_t)connect || */
			func == (fptr_t)cfa_connect )
			#define _CFA_IO_FEATURE_IORING_OP_CONNECT ,
			return IS_DEFINED(IORING_OP_CONNECT);

		if( /*func == (fptr_t)fallocate || */
			func == (fptr_t)cfa_fallocate )
			#define _CFA_IO_FEATURE_IORING_OP_FALLOCATE ,
			return IS_DEFINED(IORING_OP_FALLOCATE);

		if( /*func == (fptr_t)posix_fadvise || */
			func == (fptr_t)cfa_fadvise )
			#define _CFA_IO_FEATURE_IORING_OP_FADVISE ,
			return IS_DEFINED(IORING_OP_FADVISE);

		if( /*func == (fptr_t)madvise || */
			func == (fptr_t)cfa_madvise )
			#define _CFA_IO_FEATURE_IORING_OP_MADVISE ,
			return IS_DEFINED(IORING_OP_MADVISE);

		if( /*func == (fptr_t)openat || */
			func == (fptr_t)cfa_openat )
			#define _CFA_IO_FEATURE_IORING_OP_OPENAT ,
			return IS_DEFINED(IORING_OP_OPENAT);

		if( /*func == (fptr_t)close || */
			func == (fptr_t)cfa_close )
			#define _CFA_IO_FEATURE_IORING_OP_CLOSE ,
			return IS_DEFINED(IORING_OP_CLOSE);

		if( /*func == (fptr_t)read || */
			func == (fptr_t)cfa_read )
			#define _CFA_IO_FEATURE_IORING_OP_READ ,
			return IS_DEFINED(IORING_OP_READ);

		if( /*func == (fptr_t)write || */
			func == (fptr_t)cfa_write )
			#define _CFA_IO_FEATURE_IORING_OP_WRITE ,
			return IS_DEFINED(IORING_OP_WRITE);
	#endif

	return false;
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format