| [ecf6b46] | 1 | //
 | 
|---|
 | 2 | // Cforall Version 1.0.0 Copyright (C) 2020 University of Waterloo
 | 
|---|
 | 3 | //
 | 
|---|
 | 4 | // The contents of this file are covered under the licence agreement in the
 | 
|---|
 | 5 | // file "LICENCE" distributed with Cforall.
 | 
|---|
 | 6 | //
 | 
|---|
 | 7 | // io.cfa --
 | 
|---|
 | 8 | //
 | 
|---|
 | 9 | // Author           : Thierry Delisle
 | 
|---|
 | 10 | // Created On       : Thu Apr 23 17:31:00 2020
 | 
|---|
 | 11 | // Last Modified By :
 | 
|---|
 | 12 | // Last Modified On :
 | 
|---|
 | 13 | // Update Count     :
 | 
|---|
 | 14 | //
 | 
|---|
 | 15 | 
 | 
|---|
| [4069faad] | 16 | // #define __CFA_DEBUG_PRINT_IO__
 | 
|---|
 | 17 | 
 | 
|---|
| [92976d9] | 18 | #include "kernel.hfa"
 | 
|---|
 | 19 | 
 | 
|---|
 | 20 | #if !defined(HAVE_LINUX_IO_URING_H)
 | 
|---|
| [f6660520] | 21 |         void __kernel_io_startup( cluster & ) {
 | 
|---|
| [92976d9] | 22 |                 // No-op stub compiled when HAVE_LINUX_IO_URING_H is undefined: nothing to do without io_uring. NOTE(review): the io_uring variant in this file takes ( cluster &, bool main_cluster ) - confirm this signature still matches the declaration used by callers.
 | 
|---|
 | 23 |         }
 | 
|---|
 | 24 | 
 | 
|---|
| [f6660520] | 25 |         void __kernel_io_start_thrd( cluster & ) {
 | 
|---|
 | 26 |                 // No-op stub compiled when HAVE_LINUX_IO_URING_H is undefined: nothing to do without io_uring. NOTE(review): the io_uring branch of this file defines __kernel_io_finish_start rather than this name - confirm which one callers expect.
 | 
|---|
 | 27 |         }
 | 
|---|
 | 28 | 
 | 
|---|
 | 29 |         void __kernel_io_stop_thrd ( cluster & ) {
 | 
|---|
 | 30 |                 // No-op stub compiled when HAVE_LINUX_IO_URING_H is undefined: nothing to do without io_uring. NOTE(review): the io_uring branch of this file defines __kernel_io_prepare_stop rather than this name - confirm which one callers expect.
 | 
|---|
 | 31 |         }
 | 
|---|
 | 32 | 
 | 
|---|
 | 33 |         void __kernel_io_shutdown( cluster & ) {
 | 
|---|
| [92976d9] | 34 |                 // No-op stub compiled when HAVE_LINUX_IO_URING_H is undefined: nothing to do without io_uring. NOTE(review): the io_uring variant in this file takes ( cluster &, bool main_cluster ) - confirm this signature still matches the declaration used by callers.
 | 
|---|
 | 35 |         }
 | 
|---|
 | 36 | 
 | 
|---|
 | 37 | #else
 | 
|---|
 | 38 |         extern "C" {
 | 
|---|
 | 39 |                 #define _GNU_SOURCE         /* See feature_test_macros(7) */
 | 
|---|
 | 40 |                 #include <errno.h>
 | 
|---|
 | 41 |                 #include <stdint.h>
 | 
|---|
 | 42 |                 #include <string.h>
 | 
|---|
 | 43 |                 #include <unistd.h>
 | 
|---|
 | 44 |                 #include <sys/mman.h>
 | 
|---|
 | 45 |                 #include <sys/syscall.h>
 | 
|---|
 | 46 | 
 | 
|---|
 | 47 |                 #include <linux/io_uring.h>
 | 
|---|
 | 48 |         }
 | 
|---|
 | 49 | 
 | 
|---|
 | 50 |         #include "bits/signal.hfa"
 | 
|---|
 | 51 |         #include "kernel_private.hfa"
 | 
|---|
 | 52 |         #include "thread.hfa"
 | 
|---|
 | 53 | 
 | 
|---|
 | 54 |         uint32_t entries_per_cluster() { // Number of ring entries requested from io_uring_setup for each cluster
 | 
|---|
 | 55 |                 return 256;
 | 
|---|
 | 56 |         }
 | 
|---|
 | 57 | 
 | 
|---|
| [f6660520] | 58 |         static void * __io_poller_slow( void * arg );
 | 
|---|
 | 59 | 
 | 
|---|
 | 60 |         // Weirdly, some systems that do support io_uring don't actually define these
 | 
|---|
 | 61 |         #ifdef __alpha__
 | 
|---|
 | 62 |                 /*
 | 
|---|
 | 63 |                 * alpha is the only exception, all other architectures
 | 
|---|
 | 64 |                 * have common numbers for new system calls.
 | 
|---|
 | 65 |                 */
 | 
|---|
 | 66 |                 #ifndef __NR_io_uring_setup
 | 
|---|
 | 67 |                         #define __NR_io_uring_setup           535
 | 
|---|
 | 68 |                 #endif
 | 
|---|
 | 69 |                 #ifndef __NR_io_uring_enter
 | 
|---|
 | 70 |                         #define __NR_io_uring_enter           536
 | 
|---|
 | 71 |                 #endif
 | 
|---|
 | 72 |                 #ifndef __NR_io_uring_register
 | 
|---|
 | 73 |                         #define __NR_io_uring_register        537
 | 
|---|
 | 74 |                 #endif
 | 
|---|
 | 75 |         #else /* !__alpha__ */
 | 
|---|
 | 76 |                 #ifndef __NR_io_uring_setup
 | 
|---|
 | 77 |                         #define __NR_io_uring_setup           425
 | 
|---|
 | 78 |                 #endif
 | 
|---|
 | 79 |                 #ifndef __NR_io_uring_enter
 | 
|---|
 | 80 |                         #define __NR_io_uring_enter           426
 | 
|---|
 | 81 |                 #endif
 | 
|---|
 | 82 |                 #ifndef __NR_io_uring_register
 | 
|---|
 | 83 |                         #define __NR_io_uring_register        427
 | 
|---|
 | 84 |                 #endif
 | 
|---|
 | 85 |         #endif
 | 
|---|
 | 86 | 
 | 
|---|
 | 87 |         #if defined(__CFA_IO_POLLING_USER__)
 | 
|---|
 | 88 |                 void ?{}( __io_poller_fast & this, struct cluster & cltr ) { // Constructor: bind the fast poller to the ring of cluster cltr
 | 
|---|
 | 89 |                         this.ring = &cltr.io; // the poller only keeps a pointer to the cluster's ring
 | 
|---|
| [93f7c001] | 90 |                         (this.thrd){ "Fast I/O Poller", cltr }; // construct the embedded thread descriptor with a name and the cluster
 | 
|---|
| [f6660520] | 91 |                 }
 | 
|---|
 | 92 |                 void ^?{}( __io_poller_fast & mutex this ); // forward declaration of the destructor defined below
 | 
|---|
 | 93 |         void main( __io_poller_fast & this ); // forward declaration of the thread body, defined later in this file
 | 
|---|
 | 94 |         static inline $thread * get_thread( __io_poller_fast & this ) { return &this.thrd; } // accessor for the embedded thread descriptor
 | 
|---|
 | 95 |                 void ^?{}( __io_poller_fast & mutex this ) {} // empty destructor body; NOTE(review): the mutex parameter presumably makes destruction wait for the thread - confirm
 | 
|---|
 | 96 |         #endif
 | 
|---|
| [185efe6] | 97 | 
 | 
|---|
| [92976d9] | 98 | //=============================================================================================
 | 
|---|
 | 99 | // I/O Startup / Shutdown logic
 | 
|---|
 | 100 | //=============================================================================================
 | 
|---|
| [f6660520] | 101 |         void __kernel_io_startup( cluster & this, bool main_cluster ) { // Create this cluster's io_uring instance, map its rings and fill this.io; the pollers are started here only when main_cluster is false
 | 
|---|
| [92976d9] | 102 |                 // Step 1 : call io_uring_setup with zeroed parameters; the kernel fills params with the ring layout
 | 
|---|
 | 103 |                 struct io_uring_params params;
 | 
|---|
 | 104 |                 memset(&params, 0, sizeof(params));
 | 
|---|
 | 105 | 
 | 
|---|
| [2d8f7b0] | 106 |                 uint32_t nentries = entries_per_cluster();
 | 
|---|
 | 107 | 
 | 
|---|
 | 108 |                 int fd = syscall(__NR_io_uring_setup, nentries, &params );
 | 
|---|
| [92976d9] | 109 |                 if(fd < 0) {
 | 
|---|
 | 110 |                         abort("KERNEL ERROR: IO_URING SETUP - %s\n", strerror(errno));
 | 
|---|
 | 111 |                 }
 | 
|---|
 | 112 | 
 | 
|---|
 | 113 |                 // Step 2 : mmap result
 | 
|---|
 | 114 |                 memset(&this.io, 0, sizeof(struct io_ring));
 | 
|---|
| [2d8f7b0] | 115 |                 struct io_uring_sq & sq = this.io.submit_q;
 | 
|---|
 | 116 |                 struct io_uring_cq & cq = this.io.completion_q;
 | 
|---|
| [92976d9] | 117 | 
 | 
|---|
 | 118 |                 // calculate the right ring size: offset of the trailing array plus the size of that array
 | 
|---|
| [2d8f7b0] | 119 |                 sq.ring_sz = params.sq_off.array + (params.sq_entries * sizeof(unsigned)           );
 | 
|---|
 | 120 |                 cq.ring_sz = params.cq_off.cqes  + (params.cq_entries * sizeof(struct io_uring_cqe));
 | 
|---|
| [92976d9] | 121 | 
 | 
|---|
 | 122 |                 // Requires features
 | 
|---|
| [d384787] | 123 |                 #if defined(IORING_FEAT_SINGLE_MMAP)
 | 
|---|
 | 124 |                         // with a single mapping both rings share one region, so both sizes become the larger of the two
 | 
|---|
 | 125 |                         if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
 | 
|---|
 | 126 |                                 cq.ring_sz = sq.ring_sz = max(cq.ring_sz, sq.ring_sz);
 | 
|---|
 | 127 |                         }
 | 
|---|
 | 128 |                 #endif
 | 
|---|
| [92976d9] | 129 | 
 | 
|---|
 | 130 |                 // mmap the Submit Queue into existence
 | 
|---|
| [2d8f7b0] | 131 |                 sq.ring_ptr = mmap(0, sq.ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
 | 
|---|
 | 132 |                 if (sq.ring_ptr == (void*)MAP_FAILED) {
 | 
|---|
| [92976d9] | 133 |                         abort("KERNEL ERROR: IO_URING MMAP1 - %s\n", strerror(errno));
 | 
|---|
 | 134 |                 }
 | 
|---|
 | 135 | 
 | 
|---|
 | 136 |                 // Requires features
 | 
|---|
| [d384787] | 137 |                 #if defined(IORING_FEAT_SINGLE_MMAP)
 | 
|---|
 | 138 |                         // with a single mapping the Completion Queue reuses the Submit Queue mapping, no second mmap is needed
 | 
|---|
 | 139 |                         if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
 | 
|---|
 | 140 |                                 cq.ring_ptr = sq.ring_ptr;
 | 
|---|
 | 141 |                         }
 | 
|---|
 | 142 |                         else
 | 
|---|
 | 143 |                 #endif
 | 
|---|
 | 144 |                 {
 | 
|---|
| [92976d9] | 145 |                         // We need multiple calls to mmap: map the Completion Queue separately
 | 
|---|
| [2d8f7b0] | 146 |                         cq.ring_ptr = mmap(0, cq.ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
 | 
|---|
 | 147 |                         if (cq.ring_ptr == (void*)MAP_FAILED) {
 | 
|---|
 | 148 |                                 munmap(sq.ring_ptr, sq.ring_sz);
 | 
|---|
| [92976d9] | 149 |                                 abort("KERNEL ERROR: IO_URING MMAP2 - %s\n", strerror(errno));
 | 
|---|
 | 150 |                         }
 | 
|---|
| [d384787] | 151 |                 }
 | 
|---|
| [92976d9] | 152 | 
 | 
|---|
 | 153 |                 // mmap the submit queue entries
 | 
|---|
 | 154 |                 size_t size = params.sq_entries * sizeof(struct io_uring_sqe);
 | 
|---|
| [2d8f7b0] | 155 |                 sq.sqes = (struct io_uring_sqe *)mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
 | 
|---|
 | 156 |                 if (sq.sqes == (struct io_uring_sqe *)MAP_FAILED) {
 | 
|---|
 | 157 |                         munmap(sq.ring_ptr, sq.ring_sz);
 | 
|---|
 | 158 |                         if (cq.ring_ptr != sq.ring_ptr) munmap(cq.ring_ptr, cq.ring_sz);
 | 
|---|
| [92976d9] | 159 |                         abort("KERNEL ERROR: IO_URING MMAP3 - %s\n", strerror(errno));
 | 
|---|
 | 160 |                 }
 | 
|---|
 | 161 | 
 | 
|---|
 | 162 |                 // Get the pointers from the kernel to fill the structure: each field lives at ring_ptr plus the offset reported in params
 | 
|---|
 | 163 |                 // submit queue
 | 
|---|
| [2d8f7b0] | 164 |                 sq.head    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.head);
 | 
|---|
 | 165 |                 sq.tail    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.tail);
 | 
|---|
 | 166 |                 sq.mask    = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_mask);
 | 
|---|
 | 167 |                 sq.num     = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_entries);
 | 
|---|
 | 168 |                 sq.flags   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.flags);
 | 
|---|
 | 169 |                 sq.dropped = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.dropped);
 | 
|---|
 | 170 |                 sq.array   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);
 | 
|---|
 | 171 |                 sq.alloc = *sq.tail;
 | 
|---|
| [c59a346] | 172 |                 sq.ready = *sq.tail;
 | 
|---|
| [92976d9] | 173 | 
 | 
|---|
 | 174 |                 // completion queue
 | 
|---|
| [2d8f7b0] | 175 |                 cq.head     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.head);
 | 
|---|
 | 176 |                 cq.tail     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.tail);
 | 
|---|
 | 177 |                 cq.mask     = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_mask);
 | 
|---|
 | 178 |                 cq.num      = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_entries);
 | 
|---|
 | 179 |                 cq.overflow = (         uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.overflow);
 | 
|---|
 | 180 |                 cq.cqes   = (struct io_uring_cqe *)(((intptr_t)cq.ring_ptr) + params.cq_off.cqes);
 | 
|---|
 | 181 | 
 | 
|---|
 | 182 |                 // some paranoid checks: mask is entries - 1, at least nentries entries, and both rings start empty
 | 
|---|
 | 183 |                 /* paranoid */ verifyf( (*cq.mask) == ((*cq.num) - 1ul32), "IO_URING Expected mask to be %u (%u entries), was %u", (*cq.num) - 1ul32, *cq.num, *cq.mask  );
 | 
|---|
 | 184 |                 /* paranoid */ verifyf( (*cq.num)  >= nentries, "IO_URING Expected %u entries, got %u", nentries, *cq.num );
 | 
|---|
 | 185 |                 /* paranoid */ verifyf( (*cq.head) == 0, "IO_URING Expected head to be 0, got %u", *cq.head );
 | 
|---|
 | 186 |                 /* paranoid */ verifyf( (*cq.tail) == 0, "IO_URING Expected tail to be 0, got %u", *cq.tail );
 | 
|---|
 | 187 | 
 | 
|---|
 | 188 |                 /* paranoid */ verifyf( (*sq.mask) == ((*sq.num) - 1ul32), "IO_URING Expected mask to be %u (%u entries), was %u", (*sq.num) - 1ul32, *sq.num, *sq.mask );
 | 
|---|
 | 189 |                 /* paranoid */ verifyf( (*sq.num) >= nentries, "IO_URING Expected %u entries, got %u", nentries, *sq.num );
 | 
|---|
 | 190 |                 /* paranoid */ verifyf( (*sq.head) == 0, "IO_URING Expected head to be 0, got %u", *sq.head );
 | 
|---|
 | 191 |                 /* paranoid */ verifyf( (*sq.tail) == 0, "IO_URING Expected tail to be 0, got %u", *sq.tail );
 | 
|---|
| [92976d9] | 192 | 
 | 
|---|
 | 193 |                 // Update the global ring info; the submit semaphore starts at the smaller of the two ring sizes
 | 
|---|
 | 194 |                 this.io.flags = params.flags;
 | 
|---|
 | 195 |                 this.io.fd    = fd;
 | 
|---|
 | 196 |                 this.io.done  = false;
 | 
|---|
| [2d8f7b0] | 197 |                 (this.io.submit){ min(*sq.num, *cq.num) };
 | 
|---|
| [92976d9] | 198 | 
 | 
|---|
| [d384787] | 199 |                 // Initialize statistics
 | 
|---|
| [038be32] | 200 |                 #if !defined(__CFA_NO_STATISTICS__)
 | 
|---|
 | 201 |                         this.io.submit_q.stats.submit_avg.val = 0;
 | 
|---|
 | 202 |                         this.io.submit_q.stats.submit_avg.cnt = 0;
 | 
|---|
 | 203 |                         this.io.completion_q.stats.completed_avg.val = 0;
 | 
|---|
 | 204 |                         this.io.completion_q.stats.completed_avg.cnt = 0;
 | 
|---|
 | 205 |                 #endif
 | 
|---|
| [d384787] | 206 | 
 | 
|---|
| [f6660520] | 207 |                 if(!main_cluster) {
 | 
|---|
 | 208 |                         __kernel_io_finish_start( this );
 | 
|---|
 | 209 |                 }
 | 
|---|
 | 210 |         }
 | 
|---|
 | 211 | 
 | 
|---|
 | 212 |         void __kernel_io_finish_start( cluster & this ) { // Start the pollers for this cluster's ring: the optional user-level fast poller first, then the slow poller pthread
 | 
|---|
 | 213 |                 #if defined(__CFA_IO_POLLING_USER__)
 | 
|---|
| [4069faad] | 214 |                         __cfadbg_print_safe(io, "Kernel I/O : Creating fast poller for cluter %p\n", &this);
 | 
|---|
| [93f7c001] | 215 |                         (this.io.poller.fast){ this }; // construct the fast poller in place, bound to this cluster
 | 
|---|
| [f6660520] | 216 |                         __thrd_start( this.io.poller.fast, main ); // its main() parks immediately until the slow poller unparks it
 | 
|---|
 | 217 |                 #endif
 | 
|---|
 | 218 | 
 | 
|---|
| [92976d9] | 219 |                 // Create the slow poller as a pthread running __io_poller_slow with this cluster as argument; the returned stack is kept so it can be freed when the poller is stopped
 | 
|---|
| [4069faad] | 220 |                 __cfadbg_print_safe(io, "Kernel I/O : Creating slow poller for cluter %p\n", &this);
 | 
|---|
| [f6660520] | 221 |                 this.io.poller.slow.stack = __create_pthread( &this.io.poller.slow.kthrd, __io_poller_slow, &this );
 | 
|---|
| [92976d9] | 222 |         }
 | 
|---|
 | 223 | 
 | 
|---|
| [f6660520] | 224 |         void __kernel_io_prepare_stop( cluster & this ) { // Stop this cluster's pollers: flag the ring as done, wake and join the slow poller, then tear down the fast poller if enabled
 | 
|---|
| [4069faad] | 225 |                 __cfadbg_print_safe(io, "Kernel I/O : Stopping pollers for cluster\n", &this);
 | 
|---|
| [92976d9] | 226 |                 // Notify the pollers of the shutdown; both poller loops test this flag
 | 
|---|
 | 227 |                 __atomic_store_n(&this.io.done, true, __ATOMIC_SEQ_CST);
 | 
|---|
| [f6660520] | 228 | 
 | 
|---|
 | 229 |                 // Stop the slow IO poller: signal it with SIGUSR1 (the only signal left out of its wait mask) and, with user-level polling, post the semaphore it may be waiting on
 | 
|---|
| [92976d9] | 230 |                 sigval val = { 1 };
 | 
|---|
| [f6660520] | 231 |                 pthread_sigqueue( this.io.poller.slow.kthrd, SIGUSR1, val );
 | 
|---|
 | 232 |                 #if defined(__CFA_IO_POLLING_USER__)
 | 
|---|
 | 233 |                         post( this.io.poller.sem );
 | 
|---|
 | 234 |                 #endif
 | 
|---|
| [92976d9] | 235 | 
 | 
|---|
 | 236 |                 // Wait for the slow poller thread to finish, then release the stack obtained when it was created
 | 
|---|
| [f6660520] | 237 |                 pthread_join( this.io.poller.slow.kthrd, 0p );
 | 
|---|
 | 238 |                 free( this.io.poller.slow.stack );
 | 
|---|
 | 239 | 
 | 
|---|
| [4069faad] | 240 |                 __cfadbg_print_safe(io, "Kernel I/O : Slow poller stopped for cluster\n", &this);
 | 
|---|
 | 241 | 
 | 
|---|
| [f6660520] | 242 |                 #if defined(__CFA_IO_POLLING_USER__)
 | 
|---|
 | 243 |                         // unpark the fast io_poller so it can observe the done flag and leave its loop
 | 
|---|
 | 244 |                         unpark( &this.io.poller.fast.thrd __cfaabi_dbg_ctx2 );
 | 
|---|
 | 245 | 
 | 
|---|
 | 246 |                         ^(this.io.poller.fast){}; // explicit destructor call on the fast poller
 | 
|---|
| [4069faad] | 247 | 
 | 
|---|
 | 248 |                         __cfadbg_print_safe(io, "Kernel I/O : Fast poller stopped for cluster\n", &this);
 | 
|---|
| [f6660520] | 249 |                 #endif
 | 
|---|
 | 250 |         }
 | 
|---|
 | 251 | 
 | 
|---|
 | 252 |         void __kernel_io_shutdown( cluster & this, bool main_cluster ) { // Tear down this cluster's ring: stop the pollers (only when main_cluster is false), optionally print statistics, unmap the rings and close the fd
 | 
|---|
 | 253 |                 if(!main_cluster) {
 | 
|---|
 | 254 |                         __kernel_io_prepare_stop( this );
 | 
|---|
 | 255 |                 }
 | 
|---|
| [92976d9] | 256 | 
 | 
|---|
| [d384787] | 257 |                 // print statistics; NOTE(review): the averages divide by the call counters, which are zero if the ring was never used - confirm the resulting output is acceptable
 | 
|---|
| [038be32] | 258 |                 #if !defined(__CFA_NO_STATISTICS__)
 | 
|---|
 | 259 |                         if(this.print_stats) {
 | 
|---|
 | 260 |                                 __cfaabi_bits_print_safe( STDERR_FILENO,
 | 
|---|
 | 261 |                                         "----- I/O uRing Stats -----\n"
 | 
|---|
 | 262 |                                         "- total submit calls  : %llu\n"
 | 
|---|
 | 263 |                                         "- avg submit          : %lf\n"
 | 
|---|
 | 264 |                                         "- total wait calls    : %llu\n"
 | 
|---|
 | 265 |                                         "- avg completion/wait : %lf\n",
 | 
|---|
 | 266 |                                         this.io.submit_q.stats.submit_avg.cnt,
 | 
|---|
 | 267 |                                         ((double)this.io.submit_q.stats.submit_avg.val) / this.io.submit_q.stats.submit_avg.cnt,
 | 
|---|
 | 268 |                                         this.io.completion_q.stats.completed_avg.cnt,
 | 
|---|
 | 269 |                                         ((double)this.io.completion_q.stats.completed_avg.val) / this.io.completion_q.stats.completed_avg.cnt
 | 
|---|
 | 270 |                                 );
 | 
|---|
 | 271 |                         }
 | 
|---|
 | 272 |                 #endif
 | 
|---|
| [d384787] | 273 | 
 | 
|---|
| [92976d9] | 274 |                 // Shutdown the io rings
 | 
|---|
 | 275 |                 struct io_uring_sq & sq = this.io.submit_q;
 | 
|---|
 | 276 |                 struct io_uring_cq & cq = this.io.completion_q;
 | 
|---|
 | 277 | 
 | 
|---|
 | 278 |                 // unmap the submit queue entries; this must happen before the ring is unmapped because sq.num points into the ring mapping
 | 
|---|
| [2d8f7b0] | 279 |                 munmap(sq.sqes, (*sq.num) * sizeof(struct io_uring_sqe));
 | 
|---|
| [92976d9] | 280 | 
 | 
|---|
 | 281 |                 // unmap the Submit Queue ring
 | 
|---|
 | 282 |                 munmap(sq.ring_ptr, sq.ring_sz);
 | 
|---|
 | 283 | 
 | 
|---|
 | 284 |                 // unmap the Completion Queue ring, if it is different (the two pointers are equal when a single mapping was shared)
 | 
|---|
 | 285 |                 if (cq.ring_ptr != sq.ring_ptr) {
 | 
|---|
 | 286 |                         munmap(cq.ring_ptr, cq.ring_sz);
 | 
|---|
 | 287 |                 }
 | 
|---|
 | 288 | 
 | 
|---|
 | 289 |                 // close the io_uring file descriptor
 | 
|---|
 | 290 |                 close(this.io.fd);
 | 
|---|
 | 291 |         }
 | 
|---|
 | 292 | 
 | 
|---|
 | 293 | //=============================================================================================
 | 
|---|
 | 294 | // I/O Polling
 | 
|---|
 | 295 | //=============================================================================================
 | 
|---|
 | 296 |         struct io_user_data { // Per-operation record; __drain_io recovers its address from cqe.user_data when the operation completes
 | 
|---|
 | 297 |                 int32_t result; // set from cqe.res by __drain_io
 | 
|---|
 | 298 |                 $thread * thrd; // thread unparked by __drain_io after result is stored
 | 
|---|
 | 299 |         };
 | 
|---|
 | 300 | 
 | 
|---|
 | 301 |         // Enter the kernel to collect events (waiting for at least waitcnt of them), then process every completion currently in the completion queue: store each result and unpark the waiting thread.
 | 
|---|
 | 302 |         // Returns the number of completions processed, 0 if there were none, or -EAGAIN if the wait was interrupted. This is NOT thread-safe
 | 
|---|
| [f6660520] | 303 |         static int __drain_io( struct io_ring & ring, sigset_t * mask, int waitcnt, bool in_kernel ) {
 | 
|---|
 | 304 |                 int ret = syscall( __NR_io_uring_enter, ring.fd, 0, waitcnt, IORING_ENTER_GETEVENTS, mask, _NSIG / 8);
 | 
|---|
| [d384787] | 305 |                 if( ret < 0 ) {
 | 
|---|
 | 306 |                         switch((int)errno) {
 | 
|---|
 | 307 |                         case EAGAIN:
 | 
|---|
 | 308 |                         case EINTR:
 | 
|---|
 | 309 |                                 return -EAGAIN;
 | 
|---|
 | 310 |                         default:
 | 
|---|
 | 311 |                                 abort( "KERNEL ERROR: IO_URING WAIT - %s\n", strerror(errno) );
 | 
|---|
 | 312 |                         }
 | 
|---|
 | 313 |                 }
 | 
|---|
 | 314 | 
 | 
|---|
 | 315 |                 // Drain the queue: the acquire load of the tail pairs with the kernel's publication of new CQEs
 | 
|---|
| [92976d9] | 316 |                 unsigned head = *ring.completion_q.head;
 | 
|---|
 | 317 |                 unsigned tail = __atomic_load_n(ring.completion_q.tail, __ATOMIC_ACQUIRE);
 | 
|---|
 | 318 | 
 | 
|---|
| [d384787] | 319 |                 // Nothing was new return 0 (the call is still counted in the statistics)
 | 
|---|
 | 320 |                 if (head == tail) {
 | 
|---|
| [038be32] | 321 |                         #if !defined(__CFA_NO_STATISTICS__)
 | 
|---|
 | 322 |                                 ring.completion_q.stats.completed_avg.cnt += 1;
 | 
|---|
 | 323 |                         #endif
 | 
|---|
| [d384787] | 324 |                         return 0;
 | 
|---|
 | 325 |                 }
 | 
|---|
| [92976d9] | 326 | 
 | 
|---|
| [d384787] | 327 |                 uint32_t count = tail - head;
 | 
|---|
 | 328 |                 for(i; count) {
 | 
|---|
 | 329 |                         unsigned idx = (head + i) & (*ring.completion_q.mask);
 | 
|---|
 | 330 |                         struct io_uring_cqe & cqe = ring.completion_q.cqes[idx];
 | 
|---|
| [92976d9] | 331 | 
 | 
|---|
| [d384787] | 332 |                         /* paranoid */ verify(&cqe);
 | 
|---|
| [92976d9] | 333 | 
 | 
|---|
| [d384787] | 334 |                         struct io_user_data * data = (struct io_user_data *)cqe.user_data;
 | 
|---|
| [4069faad] | 335 |                         __cfadbg_print_safe( io, "Kernel I/O : Performed reading io cqe %p, result %d for %p\n", data, cqe.res, data->thrd );
 | 
|---|
| [2d8f7b0] | 336 | 
 | 
|---|
| [d384787] | 337 |                         data->result = cqe.res;
 | 
|---|
| [f6660520] | 338 |                         if(!in_kernel) { unpark( data->thrd __cfaabi_dbg_ctx2 ); }
 | 
|---|
 | 339 |                         else         { __unpark( data->thrd __cfaabi_dbg_ctx2 ); }
 | 
|---|
| [d384787] | 340 |                 }
 | 
|---|
| [2d8f7b0] | 341 | 
 | 
|---|
 | 342 |                 // Allow new submissions to happen: one semaphore unit per completion consumed
 | 
|---|
| [d384787] | 343 |                 V(ring.submit, count);
 | 
|---|
| [92976d9] | 344 | 
 | 
|---|
 | 345 |                 // Mark to the kernel that the cqe has been seen
 | 
|---|
 | 346 |                 // Ensure that the kernel only sees the new value of the head index after the CQEs have been read: this requires release ordering on the head update.
 | 
|---|
| [d384787] | 347 |                 __atomic_fetch_add( ring.completion_q.head, count, __ATOMIC_RELEASE );
 | 
|---|
| [92976d9] | 348 | 
 | 
|---|
| [038be32] | 349 |                 // Update statistics
 | 
|---|
 | 350 |                 #if !defined(__CFA_NO_STATISTICS__)
 | 
|---|
 | 351 |                         ring.completion_q.stats.completed_avg.val += count;
 | 
|---|
 | 352 |                         ring.completion_q.stats.completed_avg.cnt += 1;
 | 
|---|
 | 353 |                 #endif
 | 
|---|
| [d384787] | 354 | 
 | 
|---|
 | 355 |                 return count;
 | 
|---|
| [92976d9] | 356 |         }
 | 
|---|
 | 357 | 
 | 
|---|
| [f6660520] | 358 |         static void * __io_poller_slow( void * arg ) { // pthread entry point of the slow poller: arg is the cluster; blocks in the kernel waiting for completions until the ring is marked done
 | 
|---|
| [92976d9] | 359 |                 cluster * cltr = (cluster *)arg;
 | 
|---|
 | 360 |                 struct io_ring & ring = cltr->io;
 | 
|---|
 | 361 | 
 | 
|---|
 | 362 |                 sigset_t mask;
 | 
|---|
 | 363 |                 sigfillset(&mask);
 | 
|---|
 | 364 |                 if ( pthread_sigmask( SIG_BLOCK, &mask, 0p ) != 0 ) { // pthread_sigmask returns 0 on success and an error number on failure, never -1
 | 
|---|
 | 365 |                         abort( "KERNEL ERROR: IO_URING - pthread_sigmask" );
 | 
|---|
 | 366 |                 }
 | 
|---|
 | 367 | 
 | 
|---|
 | 368 |                 sigdelset( &mask, SIGUSR1 ); // the mask handed to __drain_io leaves only SIGUSR1 deliverable, which is the signal used to stop this poller
 | 
|---|
 | 369 | 
 | 
|---|
 | 370 |                 verify( (*ring.submit_q.head) == (*ring.submit_q.tail) );
 | 
|---|
 | 371 |                 verify( (*ring.completion_q.head) == (*ring.completion_q.tail) );
 | 
|---|
 | 372 | 
 | 
|---|
| [d384787] | 373 |                 while(!__atomic_load_n(&ring.done, __ATOMIC_SEQ_CST)) {
 | 
|---|
| [f6660520] | 374 |                         #if defined(__CFA_IO_POLLING_USER__)
 | 
|---|
 | 375 | 
 | 
|---|
 | 376 |                                 // In the user-thread approach drain and if anything was drained,
 | 
|---|
 | 377 |                                 // baton pass to the user-thread, then wait on the semaphore until it hands control back
 | 
|---|
 | 378 |                                 int count = __drain_io( ring, &mask, 1, true );
 | 
|---|
 | 379 |                                 if(count > 0) {
 | 
|---|
| [4069faad] | 380 |                                         __cfadbg_print_safe(io, "Kernel I/O : Moving to ring %p to fast poller\n", &ring);
 | 
|---|
| [f6660520] | 381 |                                         __unpark( &ring.poller.fast.thrd __cfaabi_dbg_ctx2 );
 | 
|---|
 | 382 |                                         wait( ring.poller.sem );
 | 
|---|
 | 383 |                                 }
 | 
|---|
 | 384 | 
 | 
|---|
 | 385 |                         #else
 | 
|---|
 | 386 | 
 | 
|---|
 | 387 |                                 //In the naive approach, just poll the io completion queue directly
 | 
|---|
 | 388 |                                 __drain_io( ring, &mask, 1, true );
 | 
|---|
 | 389 | 
 | 
|---|
 | 390 |                         #endif
 | 
|---|
| [92976d9] | 391 |                 }
 | 
|---|
 | 392 | 
 | 
|---|
 | 393 |                 return 0p;
 | 
|---|
 | 394 |         }
 | 
|---|
 | 395 | 
 | 
|---|
| [f6660520] | 396 |         #if defined(__CFA_IO_POLLING_USER__)
 | 
|---|
 | 397 |                 void main( __io_poller_fast & this ) { // Body of the user-level fast poller: polls the ring without blocking while completions keep arriving, otherwise hands control back to the slow poller
 | 
|---|
 | 398 |                         // Start parked; the slow poller unparks this thread after it has drained something
 | 
|---|
 | 399 |                         park( __cfaabi_dbg_ctx );
 | 
|---|
 | 400 | 
 | 
|---|
 | 401 |                         // Then loop until the ring is marked done
 | 
|---|
 | 402 |                         while(!__atomic_load_n(&this.ring->done, __ATOMIC_SEQ_CST)) {
 | 
|---|
 | 403 |                                 // Drain the io without waiting (waitcnt is 0); the result is positive only when completions were processed
 | 
|---|
 | 404 |                                 if(0 < __drain_io( *this.ring, 0p, 0, false )) {
 | 
|---|
 | 405 |                                         // If we got something, just yield and check again
 | 
|---|
 | 406 |                                         yield();
 | 
|---|
 | 407 |                                 }
 | 
|---|
 | 408 |                                 else {
 | 
|---|
 | 409 |                                         // We didn't get anything (or the call was interrupted): baton pass to the slow poller and park until it wakes us again
 | 
|---|
| [4069faad] | 410 |                                         __cfadbg_print_safe(io, "Kernel I/O : Moving to ring %p to slow poller\n", this.ring);
 | 
|---|
| [f6660520] | 411 |                                         post( this.ring->poller.sem );
 | 
|---|
 | 412 |                                         park( __cfaabi_dbg_ctx );
 | 
|---|
 | 413 |                                 }
 | 
|---|
 | 414 |                         }
 | 
|---|
 | 415 |                 }
 | 
|---|
 | 416 |         #endif
 | 
|---|
 | 417 | 
 | 
|---|
| [92976d9] | 418 | //=============================================================================================
 | 
|---|
 | 419 | // I/O Submissions
 | 
|---|
 | 420 | //=============================================================================================
 | 
|---|
 | 421 | 
 | 
|---|
| [2d8f7b0] | 422 | // Submission steps :
 | 
|---|
 | 423 | // 1 - We need to make sure we don't overflow any of the buffer, P(ring.submit) to make sure
 | 
|---|
 | 424 | //     entries are available. The semaphore makes sure that there are no more operations in
 | 
|---|
 | 425 | //     progress than the number of entries in the buffer. This probably limits concurrency
 | 
|---|
 | 426 | //     more than necessary since submitted but not completed operations don't need any
 | 
|---|
 | 427 | //     entries in user space. However, I don't know what happens if we overflow the buffers
 | 
|---|
 | 428 | //     because too many requests completed at once. This is a safe approach in all cases.
 | 
|---|
 | 429 | //     Furthermore, with hundreds of entries, this may be okay.
 | 
|---|
 | 430 | //
 | 
|---|
 | 431 | // 2 - Allocate a queue entry. The ring already has memory for all entries but only the ones
 | 
|---|
 | 432 | //     listed in sq.array are visible by the kernel. For those not listed, the kernel does not
 | 
|---|
 | 433 | //     offer any assurance that an entry is not being filled by multiple flags. Therefore, we
 | 
|---|
 | 434 | //     need to write an allocator that allows allocating concurrently.
 | 
|---|
 | 435 | //
 | 
|---|
 | 436 | // 3 - Actually fill the submit entry, this is the only simple and straightforward step.
 | 
|---|
 | 437 | //
 | 
|---|
 | 438 | // 4 - Append the entry index to the array and adjust the tail accordingly. This operation
 | 
|---|
 | 439 | //     needs to reach consensus on two points at the same time:
 | 
|---|
 | 440 | //     A - The order in which entries are listed in the array: no two threads must pick the
 | 
|---|
 | 441 | //         same index for their entries
 | 
|---|
 | 442 | //     B - When the tail can be updated for the kernel. EVERY entry in the array between
 | 
|---|
 | 443 | //         head and tail must be fully filled and shouldn't ever be touched again.
 | 
|---|
 | 444 | //
 | 
|---|
 | 445 | 
 | 
|---|
| [2489d31] | 446 |         static inline [* struct io_uring_sqe, uint32_t] __submit_alloc( struct io_ring & ring ) { // Reserve one submission entry; returns a pointer to the sqe slot together with the raw (unmasked) allocation index
 | 
|---|
 | 447 |                 // Wait for a spot to be available (the semaphore is released by __drain_io as completions are consumed)
 | 
|---|
 | 448 |                 P(ring.submit);
 | 
|---|
 | 449 | 
 | 
|---|
 | 450 |                 // Allocate the sqe: an atomic counter hands a distinct index to every caller
 | 
|---|
 | 451 |                 uint32_t idx = __atomic_fetch_add(&ring.submit_q.alloc, 1ul32, __ATOMIC_SEQ_CST);
 | 
|---|
 | 452 | 
 | 
|---|
 | 453 |                 // Validate that we didn't overflow anything
 | 
|---|
 | 454 |                 // Check that nothing overflowed (TODO: placeholder, the verify below currently checks nothing)
 | 
|---|
 | 455 |                 /* paranoid */ verify( true );
 | 
|---|
 | 456 | 
 | 
|---|
 | 457 |                 // Check that it goes head -> tail -> alloc and never head -> alloc -> tail (TODO: placeholder, the verify below currently checks nothing)
 | 
|---|
 | 458 |                 /* paranoid */ verify( true );
 | 
|---|
 | 459 | 
 | 
|---|
 | 460 |                 // Return the sqe: the index is masked to select the slot, the unmasked index is returned alongside it
 | 
|---|
 | 461 |                 return [&ring.submit_q.sqes[ idx & (*ring.submit_q.mask)], idx];
 | 
|---|
 | 462 |         }
 | 
|---|
 | 463 | 
 | 
|---|
 | 464 |         static inline void __submit( struct io_ring & ring, uint32_t idx ) { // Publish entry idx in the submission array, advance the tail and ask the kernel to consume one entry, all while holding submit_q.lock
 | 
|---|
 | 465 |                 // get mutual exclusion
 | 
|---|
 | 466 |                 lock(ring.submit_q.lock __cfaabi_dbg_ctx2);
 | 
|---|
 | 467 | 
 | 
|---|
 | 468 |                 // Append to the list of ready entries
 | 
|---|
 | 469 |                 uint32_t * tail = ring.submit_q.tail;
 | 
|---|
 | 470 |                 const uint32_t mask = *ring.submit_q.mask;
 | 
|---|
 | 471 | 
 | 
|---|
 | 472 |                 ring.submit_q.array[ (*tail) & mask ] = idx & mask; // the array slot at the current tail receives the masked sqe index
 | 
|---|
 | 473 |                 __atomic_fetch_add(tail, 1ul32, __ATOMIC_SEQ_CST); // advance the tail only after the array slot has been written
 | 
|---|
 | 474 | 
 | 
|---|
 | 475 |                 // Ask the kernel to consume one submission (to_submit is 1) without waiting for any completion
 | 
|---|
 | 476 |                 int ret = syscall( __NR_io_uring_enter, ring.fd, 1, 0, 0, 0p, 0);
 | 
|---|
 | 477 |                 if( ret < 0 ) {
 | 
|---|
 | 478 |                         switch((int)errno) {
 | 
|---|
 | 479 |                         default:
 | 
|---|
 | 480 |                                 abort( "KERNEL ERROR: IO_URING SUBMIT - %s\n", strerror(errno) );
 | 
|---|
 | 481 |                         }
 | 
|---|
| [2d8f7b0] | 482 |                 }
 | 
|---|
| [2489d31] | 483 | 
 | 
|---|
| [038be32] | 484 |                 // update statistics (done while the lock is still held)
 | 
|---|
 | 485 |                 #if !defined(__CFA_NO_STATISTICS__)
 | 
|---|
 | 486 |                         ring.submit_q.stats.submit_avg.val += 1;
 | 
|---|
 | 487 |                         ring.submit_q.stats.submit_avg.cnt += 1;
 | 
|---|
 | 488 |                 #endif
 | 
|---|
| [d384787] | 489 | 
 | 
|---|
| [2489d31] | 490 |                 unlock(ring.submit_q.lock);
 | 
|---|
 | 491 |                 // Make sure that idx was submitted (NOTE(review): no such check is implemented here, only the debug print below)
 | 
|---|
 | 492 |                 // Be careful to not get false positive if we cycled the entire list or that someone else submitted for us
 | 
|---|
| [4069faad] | 493 |                 __cfadbg_print_safe( io, "Kernel I/O : Performed io_submit for %p, returned %d\n", active_thread(), ret );
 | 
|---|
| [2489d31] | 494 |         }
 | 
|---|
 | 495 | 
 | 
|---|
 | 496 |         static inline void ?{}(struct io_uring_sqe & this, uint8_t opcode, int fd) {
 | 
|---|
 | 497 |                 this.opcode = opcode;
 | 
|---|
 | 498 |                 #if !defined(IOSQE_ASYNC)
 | 
|---|
 | 499 |                         this.flags = 0;
 | 
|---|
 | 500 |                 #else
 | 
|---|
 | 501 |                         this.flags = IOSQE_ASYNC;
 | 
|---|
 | 502 |                 #endif
 | 
|---|
 | 503 |                 this.ioprio = 0;
 | 
|---|
 | 504 |                 this.fd = fd;
 | 
|---|
 | 505 |                 this.off = 0;
 | 
|---|
 | 506 |                 this.addr = 0;
 | 
|---|
 | 507 |                 this.len = 0;
 | 
|---|
 | 508 |                 this.rw_flags = 0;
 | 
|---|
 | 509 |                 this.__pad2[0] = this.__pad2[1] = this.__pad2[2] = 0;
 | 
|---|
| [2d8f7b0] | 510 |         }
 | 
|---|
 | 511 | 
 | 
|---|
| [2489d31] | 512 |         static inline void ?{}(struct io_uring_sqe & this, uint8_t opcode, int fd, void * addr, uint32_t len, uint64_t off ) {
 | 
|---|
 | 513 |                 (this){ opcode, fd };
 | 
|---|
 | 514 |                 this.off = off;
 | 
|---|
 | 515 |                 this.addr = (uint64_t)addr;
 | 
|---|
 | 516 |                 this.len = len;
 | 
|---|
 | 517 |         }
 | 
|---|
| [f6660520] | 518 | 
 | 
|---|
| [92976d9] | 519 | 
 | 
|---|
 | 520 | //=============================================================================================
 | 
|---|
 | 521 | // I/O Interface
 | 
|---|
 | 522 | //=============================================================================================
 | 
|---|
| [f6660520] | 523 | 
 | 
|---|
| [2d8f7b0] | 524 |         #define __submit_prelude \
 | 
|---|
 | 525 |                 struct io_ring & ring = active_cluster()->io; \
 | 
|---|
 | 526 |                 struct io_uring_sqe * sqe; \
 | 
|---|
 | 527 |                 uint32_t idx; \
 | 
|---|
 | 528 |                 [sqe, idx] = __submit_alloc( ring );
 | 
|---|
 | 529 | 
 | 
|---|
 | 530 |         #define __submit_wait \
 | 
|---|
 | 531 |                 io_user_data data = { 0, active_thread() }; \
 | 
|---|
| [185efe6] | 532 |                 /*__cfaabi_bits_print_safe( STDERR_FILENO, "Preparing user data %p for %p\n", &data, data.thrd );*/ \
 | 
|---|
| [2d8f7b0] | 533 |                 sqe->user_data = (uint64_t)&data; \
 | 
|---|
 | 534 |                 __submit( ring, idx ); \
 | 
|---|
 | 535 |                 park( __cfaabi_dbg_ctx ); \
 | 
|---|
 | 536 |                 return data.result;
 | 
|---|
| [ecf6b46] | 537 | #endif
 | 
|---|
| [2d8f7b0] | 538 | 
 | 
|---|
| [0ea6c5a] | 539 | // Some forward declarations
 | 
|---|
 | 540 | extern "C" {
 | 
|---|
 | 541 |         #include <sys/types.h>
 | 
|---|
| [93f7c001] | 542 |         #include <sys/socket.h>
 | 
|---|
| [0ea6c5a] | 543 |         struct iovec;
 | 
|---|
 | 544 |         extern ssize_t preadv2 (int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
 | 
|---|
 | 545 |         extern ssize_t pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
 | 
|---|
 | 546 | 
 | 
|---|
 | 547 |         extern int fsync(int fd);
 | 
|---|
 | 548 |         extern int sync_file_range(int fd, int64_t offset, int64_t nbytes, unsigned int flags);
 | 
|---|
 | 549 | 
 | 
|---|
 | 550 |         struct msghdr;
 | 
|---|
 | 551 |         struct sockaddr;
 | 
|---|
 | 552 |         extern ssize_t sendmsg(int sockfd, const struct msghdr *msg, int flags);
 | 
|---|
 | 553 |         extern ssize_t recvmsg(int sockfd, struct msghdr *msg, int flags);
 | 
|---|
 | 554 |         extern ssize_t send(int sockfd, const void *buf, size_t len, int flags);
 | 
|---|
 | 555 |         extern ssize_t recv(int sockfd, void *buf, size_t len, int flags);
 | 
|---|
 | 556 |         extern int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
 | 
|---|
 | 557 |         extern int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
 | 
|---|
 | 558 | 
 | 
|---|
 | 559 |         extern int fallocate(int fd, int mode, uint64_t offset, uint64_t len);
 | 
|---|
 | 560 |         extern int posix_fadvise(int fd, uint64_t offset, uint64_t len, int advice);
 | 
|---|
 | 561 |         extern int madvise(void *addr, size_t length, int advice);
 | 
|---|
 | 562 | 
 | 
|---|
 | 563 |         extern int openat(int dirfd, const char *pathname, int flags, mode_t mode);
 | 
|---|
 | 564 |         extern int close(int fd);
 | 
|---|
 | 565 | 
 | 
|---|
 | 566 |         struct statx;
 | 
|---|
 | 567 |         extern int statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf);
 | 
|---|
 | 568 | 
 | 
|---|
 | 569 |         extern ssize_t read (int fd, void *buf, size_t count);
 | 
|---|
 | 570 | }
 | 
|---|
 | 571 | 
 | 
|---|
| [2d8f7b0] | 572 | //-----------------------------------------------------------------------------
 | 
|---|
 | 573 | // Asynchronous operations
 | 
|---|
// Vectored positional read.
// With io_uring and IORING_OP_READV available, submits a READV entry and parks the calling
// thread until the result is available; otherwise forwards to preadv2.
// NOTE(review): `flags` is not forwarded on the io_uring path — confirm this is intended.
ssize_t cfa_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags) {
        #if defined(HAVE_LINUX_IO_URING_H) && defined(IORING_OP_READV)
                __submit_prelude

                (*sqe){ IORING_OP_READV, fd, iov, iovcnt, offset };

                __submit_wait
        #else
                return preadv2(fd, iov, iovcnt, offset, flags);
        #endif
}
 | 
|---|
 | 585 | 
 | 
|---|
 | 586 | ssize_t cfa_pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags) {
 | 
|---|
 | 587 |         #if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_WRITEV)
 | 
|---|
 | 588 |                 return pwritev2(fd, iov, iovcnt, offset, flags);
 | 
|---|
 | 589 |         #else
 | 
|---|
 | 590 |                 __submit_prelude
 | 
|---|
 | 591 | 
 | 
|---|
 | 592 |                 (*sqe){ IORING_OP_WRITEV, fd, iov, iovcnt, offset };
 | 
|---|
 | 593 | 
 | 
|---|
 | 594 |                 __submit_wait
 | 
|---|
 | 595 |         #endif
 | 
|---|
 | 596 | }
 | 
|---|
 | 597 | 
 | 
|---|
 | 598 | int cfa_fsync(int fd) {
 | 
|---|
 | 599 |         #if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_FSYNC)
 | 
|---|
 | 600 |                 return fsync(fd);
 | 
|---|
 | 601 |         #else
 | 
|---|
 | 602 |                 __submit_prelude
 | 
|---|
 | 603 | 
 | 
|---|
 | 604 |                 (*sqe){ IORING_OP_FSYNC, fd };
 | 
|---|
 | 605 | 
 | 
|---|
 | 606 |                 __submit_wait
 | 
|---|
 | 607 |         #endif
 | 
|---|
 | 608 | }
 | 
|---|
 | 609 | 
 | 
|---|
 | 610 | int cfa_sync_file_range(int fd, int64_t offset, int64_t nbytes, unsigned int flags) {
 | 
|---|
 | 611 |         #if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_SYNC_FILE_RANGE)
 | 
|---|
 | 612 |                 return sync_file_range(fd, offset, nbytes, flags);
 | 
|---|
 | 613 |         #else
 | 
|---|
 | 614 |                 __submit_prelude
 | 
|---|
 | 615 | 
 | 
|---|
 | 616 |                 (*sqe){ IORING_OP_SYNC_FILE_RANGE, fd };
 | 
|---|
 | 617 |                 sqe->off = offset;
 | 
|---|
 | 618 |                 sqe->len = nbytes;
 | 
|---|
 | 619 |                 sqe->sync_range_flags = flags;
 | 
|---|
 | 620 | 
 | 
|---|
 | 621 |                 __submit_wait
 | 
|---|
 | 622 |         #endif
 | 
|---|
 | 623 | }
 | 
|---|
 | 624 | 
 | 
|---|
 | 625 | 
 | 
|---|
 | 626 | ssize_t cfa_sendmsg(int sockfd, const struct msghdr *msg, int flags) {
 | 
|---|
 | 627 |         #if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_SENDMSG)
 | 
|---|
 | 628 |                 return recv(sockfd, msg, flags);
 | 
|---|
 | 629 |         #else
 | 
|---|
 | 630 |                 __submit_prelude
 | 
|---|
 | 631 | 
 | 
|---|
 | 632 |                 (*sqe){ IORING_OP_SENDMSG, sockfd, msg, 1, 0 };
 | 
|---|
 | 633 |                 sqe->msg_flags = flags;
 | 
|---|
 | 634 | 
 | 
|---|
 | 635 |                 __submit_wait
 | 
|---|
 | 636 |         #endif
 | 
|---|
 | 637 | }
 | 
|---|
 | 638 | 
 | 
|---|
 | 639 | ssize_t cfa_recvmsg(int sockfd, struct msghdr *msg, int flags) {
 | 
|---|
 | 640 |         #if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_RECVMSG)
 | 
|---|
 | 641 |                 return recv(sockfd, msg, flags);
 | 
|---|
 | 642 |         #else
 | 
|---|
 | 643 |                 __submit_prelude
 | 
|---|
 | 644 | 
 | 
|---|
 | 645 |                 (*sqe){ IORING_OP_RECVMSG, sockfd, msg, 1, 0 };
 | 
|---|
 | 646 |                 sqe->msg_flags = flags;
 | 
|---|
 | 647 | 
 | 
|---|
 | 648 |                 __submit_wait
 | 
|---|
 | 649 |         #endif
 | 
|---|
 | 650 | }
 | 
|---|
 | 651 | 
 | 
|---|
 | 652 | ssize_t cfa_send(int sockfd, const void *buf, size_t len, int flags) {
 | 
|---|
 | 653 |         #if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_SEND)
 | 
|---|
 | 654 |                 return send( sockfd, buf, len, flags );
 | 
|---|
 | 655 |         #else
 | 
|---|
 | 656 |                 __submit_prelude
 | 
|---|
 | 657 | 
 | 
|---|
 | 658 |                 (*sqe){ IORING_OP_SEND, sockfd };
 | 
|---|
 | 659 |                 sqe->addr = (uint64_t)buf;
 | 
|---|
 | 660 |                 sqe->len = len;
 | 
|---|
 | 661 |                 sqe->msg_flags = flags;
 | 
|---|
 | 662 | 
 | 
|---|
 | 663 |                 __submit_wait
 | 
|---|
 | 664 |         #endif
 | 
|---|
 | 665 | }
 | 
|---|
 | 666 | 
 | 
|---|
 | 667 | ssize_t cfa_recv(int sockfd, void *buf, size_t len, int flags) {
 | 
|---|
 | 668 |         #if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_RECV)
 | 
|---|
 | 669 |                 return recv( sockfd, buf, len, flags );
 | 
|---|
 | 670 |         #else
 | 
|---|
 | 671 |                 __submit_prelude
 | 
|---|
 | 672 | 
 | 
|---|
 | 673 |                 (*sqe){ IORING_OP_RECV, sockfd };
 | 
|---|
 | 674 |                 sqe->addr = (uint64_t)buf;
 | 
|---|
 | 675 |                 sqe->len = len;
 | 
|---|
 | 676 |                 sqe->msg_flags = flags;
 | 
|---|
 | 677 | 
 | 
|---|
 | 678 |                 __submit_wait
 | 
|---|
 | 679 |         #endif
 | 
|---|
 | 680 | }
 | 
|---|
 | 681 | 
 | 
|---|
 | 682 | int cfa_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags) {
 | 
|---|
 | 683 |         #if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_ACCEPT)
 | 
|---|
| [0ea6c5a] | 684 |                 return accept4( sockfd, addr, addrlen, flags );
 | 
|---|
| [ecf6b46] | 685 |         #else
 | 
|---|
 | 686 |                 __submit_prelude
 | 
|---|
 | 687 | 
 | 
|---|
 | 688 |                 (*sqe){ IORING_OP_ACCEPT, sockfd };
 | 
|---|
 | 689 |                 sqe->addr = addr;
 | 
|---|
 | 690 |                 sqe->addr2 = addrlen;
 | 
|---|
 | 691 |                 sqe->accept_flags = flags;
 | 
|---|
 | 692 | 
 | 
|---|
 | 693 |                 __submit_wait
 | 
|---|
 | 694 |         #endif
 | 
|---|
 | 695 | }
 | 
|---|
 | 696 | 
 | 
|---|
 | 697 | int cfa_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen) {
 | 
|---|
 | 698 |         #if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_CONNECT)
 | 
|---|
| [0ea6c5a] | 699 |                 return connect( sockfd, addr, addrlen );
 | 
|---|
| [ecf6b46] | 700 |         #else
 | 
|---|
 | 701 |                 __submit_prelude
 | 
|---|
 | 702 | 
 | 
|---|
 | 703 |                 (*sqe){ IORING_OP_CONNECT, sockfd };
 | 
|---|
 | 704 |                 sqe->addr = (uint64_t)addr;
 | 
|---|
 | 705 |                 sqe->off = addrlen;
 | 
|---|
 | 706 | 
 | 
|---|
 | 707 |                 __submit_wait
 | 
|---|
 | 708 |         #endif
 | 
|---|
 | 709 | }
 | 
|---|
 | 710 | 
 | 
|---|
 | 711 | int cfa_fallocate(int fd, int mode, uint64_t offset, uint64_t len) {
 | 
|---|
 | 712 |         #if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_FALLOCATE)
 | 
|---|
 | 713 |                 return fallocate( fd, mode, offset, len );
 | 
|---|
 | 714 |         #else
 | 
|---|
 | 715 |                 __submit_prelude
 | 
|---|
 | 716 | 
 | 
|---|
 | 717 |                 (*sqe){ IORING_OP_FALLOCATE, fd };
 | 
|---|
 | 718 |                 sqe->off = offset;
 | 
|---|
 | 719 |                 sqe->len = length;
 | 
|---|
 | 720 |                 sqe->mode = mode;
 | 
|---|
 | 721 | 
 | 
|---|
 | 722 |                 __submit_wait
 | 
|---|
 | 723 |         #endif
 | 
|---|
 | 724 | }
 | 
|---|
 | 725 | 
 | 
|---|
 | 726 | int cfa_fadvise(int fd, uint64_t offset, uint64_t len, int advice) {
 | 
|---|
 | 727 |         #if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_FADVISE)
 | 
|---|
 | 728 |                 return posix_fadvise( fd, offset, len, advice );
 | 
|---|
 | 729 |         #else
 | 
|---|
 | 730 |                 __submit_prelude
 | 
|---|
 | 731 | 
 | 
|---|
 | 732 |                 (*sqe){ IORING_OP_FADVISE, fd };
 | 
|---|
 | 733 |                 sqe->off = (uint64_t)offset;
 | 
|---|
 | 734 |                 sqe->len = length;
 | 
|---|
 | 735 |                 sqe->fadvise_advice = advice;
 | 
|---|
 | 736 | 
 | 
|---|
 | 737 |                 __submit_wait
 | 
|---|
 | 738 |         #endif
 | 
|---|
 | 739 | }
 | 
|---|
 | 740 | 
 | 
|---|
 | 741 | int cfa_madvise(void *addr, size_t length, int advice) {
 | 
|---|
 | 742 |         #if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_MADVISE)
 | 
|---|
 | 743 |                 return madvise( addr, length, advice );
 | 
|---|
 | 744 |         #else
 | 
|---|
 | 745 |                 __submit_prelude
 | 
|---|
 | 746 | 
 | 
|---|
 | 747 |                 (*sqe){ IORING_OP_MADVISE, 0 };
 | 
|---|
 | 748 |                 sqe->addr = (uint64_t)addr;
 | 
|---|
 | 749 |                 sqe->len = length;
 | 
|---|
 | 750 |                 sqe->fadvise_advice = advice;
 | 
|---|
 | 751 | 
 | 
|---|
 | 752 |                 __submit_wait
 | 
|---|
 | 753 |         #endif
 | 
|---|
 | 754 | }
 | 
|---|
 | 755 | 
 | 
|---|
 | 756 | int cfa_openat(int dirfd, const char *pathname, int flags, mode_t mode) {
 | 
|---|
 | 757 |         #if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_OPENAT)
 | 
|---|
 | 758 |                 return openat( dirfd, pathname, flags, mode );
 | 
|---|
 | 759 |         #else
 | 
|---|
 | 760 |                 __submit_prelude
 | 
|---|
 | 761 | 
 | 
|---|
 | 762 |                 (*sqe){ IORING_OP_OPENAT, dirfd };
 | 
|---|
 | 763 |                 sqe->addr = (uint64_t)pathname;
 | 
|---|
 | 764 |                 sqe->open_flags = flags;
 | 
|---|
 | 765 |                 sqe->mode = mode;
 | 
|---|
 | 766 | 
 | 
|---|
 | 767 |                 __submit_wait
 | 
|---|
 | 768 |         #endif
 | 
|---|
 | 769 | }
 | 
|---|
 | 770 | 
 | 
|---|
 | 771 | int cfa_close(int fd) {
 | 
|---|
 | 772 |         #if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_CLOSE)
 | 
|---|
 | 773 |                 return close( fd );
 | 
|---|
 | 774 |         #else
 | 
|---|
 | 775 |                 __submit_prelude
 | 
|---|
 | 776 | 
 | 
|---|
 | 777 |                 (*sqe){ IORING_OP_CLOSE, fd };
 | 
|---|
 | 778 | 
 | 
|---|
 | 779 |                 __submit_wait
 | 
|---|
 | 780 |         #endif
 | 
|---|
 | 781 | }
 | 
|---|
 | 782 | 
 | 
|---|
 | 783 | int cfa_statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf) {
 | 
|---|
 | 784 |         #if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_STATX)
 | 
|---|
 | 785 |                 //return statx( dirfd, pathname, flags, mask, statxbuf );
 | 
|---|
 | 786 |                 return syscall( __NR_io_uring_setup, dirfd, pathname, flags, mask, statxbuf );
 | 
|---|
 | 787 |         #else
 | 
|---|
 | 788 |                 __submit_prelude
 | 
|---|
 | 789 | 
 | 
|---|
 | 790 |                 (*sqe){ IORING_OP_STATX, dirfd };
 | 
|---|
 | 791 |                 sqe->addr = (uint64_t)pathname;
 | 
|---|
 | 792 |                 sqe->statx_flags = flags;
 | 
|---|
 | 793 |                 sqe->len = mask;
 | 
|---|
 | 794 |                 sqe->off = (uint64_t)statxbuf;
 | 
|---|
 | 795 | 
 | 
|---|
 | 796 |                 __submit_wait
 | 
|---|
 | 797 |         #endif
 | 
|---|
 | 798 | }
 | 
|---|
 | 799 | 
 | 
|---|
 | 800 | 
 | 
|---|
 | 801 | ssize_t cfa_read(int fd, void *buf, size_t count) {
 | 
|---|
 | 802 |         #if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_READ)
 | 
|---|
 | 803 |                 return read( fd, buf, count );
 | 
|---|
 | 804 |         #else
 | 
|---|
 | 805 |                 __submit_prelude
 | 
|---|
 | 806 | 
 | 
|---|
 | 807 |                 (*sqe){ IORING_OP_READ, fd, buf, count, 0 };
 | 
|---|
 | 808 | 
 | 
|---|
 | 809 |                 __submit_wait
 | 
|---|
 | 810 |         #endif
 | 
|---|
 | 811 | }
 | 
|---|
 | 812 | 
 | 
|---|
 | 813 | ssize_t cfa_write(int fd, void *buf, size_t count) {
 | 
|---|
 | 814 |         #if !defined(HAVE_LINUX_IO_URING_H) || !defined(IORING_OP_WRITE)
 | 
|---|
 | 815 |                 return read( fd, buf, count );
 | 
|---|
 | 816 |         #else
 | 
|---|
 | 817 |                 __submit_prelude
 | 
|---|
 | 818 | 
 | 
|---|
 | 819 |                 (*sqe){ IORING_OP_WRITE, fd, buf, count, 0 };
 | 
|---|
 | 820 | 
 | 
|---|
 | 821 |                 __submit_wait
 | 
|---|
 | 822 |         #endif
 | 
|---|
 | 823 | }
 | 
|---|
| [2d8f7b0] | 824 | 
 | 
|---|
 | 825 | //-----------------------------------------------------------------------------
 | 
|---|
 | 826 | // Check if a function is asynchronous
 | 
|---|
 | 827 | 
 | 
|---|
 | 828 | // Macro magic to reduce the size of the following switch case
 | 
|---|
// IS_DEFINED(X) yields `true` when X is a defined preprocessor macro and `false` otherwise,
// as an expression rather than an #if. It requires a marker macro _CFA_IO_FEATURE_X that is
// defined as a single comma:
//  - X undefined: the pasted name is the marker, which expands to a comma, so the selector
//    sees ( , false, true ) and picks its second argument, `false`.
//  - X defined: X is expanded before pasting, the pasted name is not a marker, so the
//    selector sees ( <junk> false, true ) and picks `true`.
// X's expansion must be something that can be token-pasted (an identifier or a number).
#define IS_DEFINED_APPLY(fn, ...) fn(__VA_ARGS__)
#define IS_DEFINED_SECOND(ignored, chosen, ...) chosen
#define IS_DEFINED_TEST(symbol) _CFA_IO_FEATURE_##symbol
#define IS_DEFINED(symbol) IS_DEFINED_APPLY( IS_DEFINED_SECOND, IS_DEFINED_TEST(symbol) false, true )
 | 
|---|
| [2d8f7b0] | 833 | 
 | 
|---|
// Report whether `func` blocks at user level rather than in the kernel.
// When HAVE_LINUX_IO_URING_H is defined, each cfa_* wrapper in this file maps to `true`
// exactly when its IORING_OP_* macro is defined at compile time. Any other function, or a
// build without io_uring, yields `false`.
// Only the cfa_* wrappers are recognized; the underlying libc functions are not.
bool has_user_level_blocking( fptr_t func ) {
        #if defined(HAVE_LINUX_IO_URING_H)
                // Marker macros required by IS_DEFINED, one per opcode tested below
                #define _CFA_IO_FEATURE_IORING_OP_READV ,
                #define _CFA_IO_FEATURE_IORING_OP_WRITEV ,
                #define _CFA_IO_FEATURE_IORING_OP_FSYNC ,
                #define _CFA_IO_FEATURE_IORING_OP_SYNC_FILE_RANGE ,
                #define _CFA_IO_FEATURE_IORING_OP_SENDMSG ,
                #define _CFA_IO_FEATURE_IORING_OP_RECVMSG ,
                #define _CFA_IO_FEATURE_IORING_OP_SEND ,
                #define _CFA_IO_FEATURE_IORING_OP_RECV ,
                #define _CFA_IO_FEATURE_IORING_OP_ACCEPT ,
                #define _CFA_IO_FEATURE_IORING_OP_CONNECT ,
                #define _CFA_IO_FEATURE_IORING_OP_FALLOCATE ,
                #define _CFA_IO_FEATURE_IORING_OP_FADVISE ,
                #define _CFA_IO_FEATURE_IORING_OP_MADVISE ,
                #define _CFA_IO_FEATURE_IORING_OP_OPENAT ,
                #define _CFA_IO_FEATURE_IORING_OP_CLOSE ,
                #define _CFA_IO_FEATURE_IORING_OP_STATX ,
                #define _CFA_IO_FEATURE_IORING_OP_READ ,
                #define _CFA_IO_FEATURE_IORING_OP_WRITE ,

                // Vectored I/O
                if( func == (fptr_t)cfa_preadv2 )         { return IS_DEFINED(IORING_OP_READV); }
                if( func == (fptr_t)cfa_pwritev2 )        { return IS_DEFINED(IORING_OP_WRITEV); }

                // Flushing
                if( func == (fptr_t)cfa_fsync )           { return IS_DEFINED(IORING_OP_FSYNC); }
                if( func == (fptr_t)cfa_sync_file_range ) { return IS_DEFINED(IORING_OP_SYNC_FILE_RANGE); }

                // Sockets
                if( func == (fptr_t)cfa_sendmsg )         { return IS_DEFINED(IORING_OP_SENDMSG); }
                if( func == (fptr_t)cfa_recvmsg )         { return IS_DEFINED(IORING_OP_RECVMSG); }
                if( func == (fptr_t)cfa_send )            { return IS_DEFINED(IORING_OP_SEND); }
                if( func == (fptr_t)cfa_recv )            { return IS_DEFINED(IORING_OP_RECV); }
                if( func == (fptr_t)cfa_accept4 )         { return IS_DEFINED(IORING_OP_ACCEPT); }
                if( func == (fptr_t)cfa_connect )         { return IS_DEFINED(IORING_OP_CONNECT); }

                // Allocation and advice
                if( func == (fptr_t)cfa_fallocate )       { return IS_DEFINED(IORING_OP_FALLOCATE); }
                if( func == (fptr_t)cfa_fadvise )         { return IS_DEFINED(IORING_OP_FADVISE); }
                if( func == (fptr_t)cfa_madvise )         { return IS_DEFINED(IORING_OP_MADVISE); }

                // Opening, closing and inspecting files
                if( func == (fptr_t)cfa_openat )          { return IS_DEFINED(IORING_OP_OPENAT); }
                if( func == (fptr_t)cfa_close )           { return IS_DEFINED(IORING_OP_CLOSE); }
                if( func == (fptr_t)cfa_statx )           { return IS_DEFINED(IORING_OP_STATX); }

                // Plain read and write
                if( func == (fptr_t)cfa_read )            { return IS_DEFINED(IORING_OP_READ); }
                if( func == (fptr_t)cfa_write )           { return IS_DEFINED(IORING_OP_WRITE); }
        #endif

        return false;
}
 | 
|---|