Diff [49cad9121c8b9a0edfa73c15ad542c41af9ab818:3a32b3ae8496c5197e67965c7a5c54f2eeb7d823] for / – Cforall

benchmark/io/batch-readv.c

-                      r49cad912
+                      r3a32b3a
 // Program to test the optimal batch size in a single-threaded process
 extern "C" {
+        #ifndef _GNU_SOURCE         /* See feature_test_macros(7) */
+        #define _GNU_SOURCE         /* See feature_test_macros(7) */
+        #endif
+        #include <errno.h>
+        #include <stdio.h>
+        #include <stdint.h>
+        #include <stdlib.h>
+        #include <string.h>
+        #include <getopt.h>
         #include <locale.h>
-        #include <getopt.h>
-        #include <unistd.h>
-        #include <sys/mman.h>
-        #include <sys/syscall.h>
-        #include <sys/uio.h>
-        #include <fcntl.h>
         #include <time.h>                                                                               // timespec
         #include <sys/time.h>                                                                   // timeval
-        #include <linux/io_uring.h>
+}
 enum { TIMEGRAN = 1000000000LL };                                       // nanosecond granularity, except for timeval
 …
 #include <omp.h>
+# ifndef __NR_io_uring_setup
+#  define __NR_io_uring_setup           425
+# endif
+# ifndef __NR_io_uring_enter
+#  define __NR_io_uring_enter           426
+# endif
+# ifndef __NR_io_uring_register
+#  define __NR_io_uring_register        427
+# endif
+#include "io_uring.h"
-struct io_uring_sq {
-        // Head and tail of the ring (associated with array)
-        volatile uint32_t * head;
-        volatile uint32_t * tail;
-        // The actual kernel ring which uses head/tail
-        // indexes into the sqes arrays
-        uint32_t * array;
-        // number of entries and mask to go with it
-        const uint32_t * num;
-        const uint32_t * mask;
-        // Submission flags (Not sure what for)
-        uint32_t * flags;
-        // number of sqes not submitted (whatever that means)
-        uint32_t * dropped;
-        // Like head/tail but not seen by the kernel
-        volatile uint32_t alloc;
-        // A buffer of sqes (not the actual ring)
-        struct io_uring_sqe * sqes;
-        // The location and size of the mmaped area
-        void * ring_ptr;
-        size_t ring_sz;
-};
-struct io_uring_cq {
-        // Head and tail of the ring
-        volatile uint32_t * head;
-        volatile uint32_t * tail;
-        // number of entries and mask to go with it
-        const uint32_t * mask;
-        const uint32_t * num;
-        // number of cqes not submitted (whatever that means)
-        uint32_t * overflow;
-        // the kernel ring
-        struct io_uring_cqe * cqes;
-        // The location and size of the mmaped area
-        void * ring_ptr;
-        size_t ring_sz;
-};
-struct io_ring {
-        struct io_uring_sq submit_q;
-        struct io_uring_cq completion_q;
-        uint32_t flags;
-        int fd;
-};
-struct fred {
-        io_ring io;
-};
-fred self;
 int myfd;
 …
         myfd = open(__FILE__, 0);
+        // Step 1 : call to setup
+        struct io_uring_params params;
+        memset(&params, 0, sizeof(params));
+        uint32_t nentries = 2048;
+        int fd = syscall(__NR_io_uring_setup, nentries, &params );
+        if(fd < 0) {
+                fprintf(stderr, "KERNEL ERROR: IO_URING SETUP - %s\n", strerror(errno));
+                abort();
+        }
+        // Step 2 : mmap result
+        memset(&self.io, 0, sizeof(struct io_ring));
+        struct io_uring_sq & sq = self.io.submit_q;
+        struct io_uring_cq & cq = self.io.completion_q;
+        // calculate the right ring size
+        sq.ring_sz = params.sq_off.array + (params.sq_entries * sizeof(unsigned)           );
+        cq.ring_sz = params.cq_off.cqes  + (params.cq_entries * sizeof(struct io_uring_cqe));
+        // Requires features
+        // // adjust the size according to the parameters
+        // if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
+        //      cq->ring_sz = sq->ring_sz = max(cq->ring_sz, sq->ring_sz);
+        // }
+        // mmap the Submit Queue into existence
+        sq.ring_ptr = mmap(0, sq.ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
+        if (sq.ring_ptr == (void*)MAP_FAILED) {
+                fprintf(stderr, "KERNEL ERROR: IO_URING MMAP1 - %s\n", strerror(errno));
+                abort();
+        }
+        // mmap the Completion Queue into existence (may or may not be needed)
+        // Requires features
+        // if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) {
+        //      cq->ring_ptr = sq->ring_ptr;
+        // }
+        // else {
+                // We need multiple call to MMAP
+                cq.ring_ptr = mmap(0, cq.ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
+                if (cq.ring_ptr == (void*)MAP_FAILED) {
+                        munmap(sq.ring_ptr, sq.ring_sz);
+                        fprintf(stderr, "KERNEL ERROR: IO_URING MMAP2 - %s\n", strerror(errno));
+                        abort();
+                }
+        // }
+        // mmap the submit queue entries
+        size_t size = params.sq_entries * sizeof(struct io_uring_sqe);
+        sq.sqes = (struct io_uring_sqe *)mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
+        if (sq.sqes == (struct io_uring_sqe *)MAP_FAILED) {
+                munmap(sq.ring_ptr, sq.ring_sz);
+                if (cq.ring_ptr != sq.ring_ptr) munmap(cq.ring_ptr, cq.ring_sz);
+                fprintf(stderr, "KERNEL ERROR: IO_URING MMAP3 - %s\n", strerror(errno));
+                abort();
+        }
+        // Get the pointers from the kernel to fill the structure
+        // submit queue
+        sq.head    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.head);
+        sq.tail    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.tail);
+        sq.mask    = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_mask);
+        sq.num     = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_entries);
+        sq.flags   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.flags);
+        sq.dropped = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.dropped);
+        sq.array   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);
+        sq.alloc = *sq.tail;
+        // completion queue
+        cq.head     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.head);
+        cq.tail     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.tail);
+        cq.mask     = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_mask);
+        cq.num      = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_entries);
+        cq.overflow = (         uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.overflow);
+        cq.cqes   = (struct io_uring_cqe *)(((intptr_t)cq.ring_ptr) + params.cq_off.cqes);
+        self.io.fd = fd;
+        init_uring(2048);
         // Allocate the sqe
 …
         printf("Took %'ld ms\n", to_miliseconds(end - start));
+        printf("Submitted       %'llu\n", submits);
+        printf("Completed       %'llu\n", completes);
+        printf("Submitted / sec %'.f\n", submits   / to_fseconds(end - start));
+        printf("Completed / sec %'.f\n", completes / to_fseconds(end - start));
+        printf("Submitted        %'llu\n", submits);
+        printf("Completed        %'llu\n", completes);
+        printf("Submitted / sec  %'.f\n", submits   / to_fseconds(end - start));
+        printf("Completed / sec  %'.f\n", completes / to_fseconds(end - start));
+        printf("ns per Submitted %'.f\n", 1000000000.0 * to_fseconds(end - start) / (submits   / batch) );
+        printf("ns per Completed %'.f\n", 1000000000.0 * to_fseconds(end - start) / (completes / batch) );
+}

benchmark/io/readv.cfa

-                      r49cad912
+                      r3a32b3a
                 static struct option options[] = {
                         BENCH_OPT_LONG
+                        {"bufsize",      required_argument, 0, 'b'},
+                        {"userthread",   no_argument      , 0, 'u'},
+                        {"submitthread", no_argument      , 0, 's'},
+                        {"submitlength", required_argument, 0, 'l'},
+                        {"bufsize",       required_argument, 0, 'b'},
+                        {"userthread",    no_argument      , 0, 'u'},
+                        {"submitthread",  no_argument      , 0, 's'},
+                        {"eagersubmit",   no_argument      , 0, 'e'},
+                        {"kpollsubmit",   no_argument      , 0, 'k'},
+                        {"kpollcomplete", no_argument      , 0, 'i'},
+                        {"submitlength",  required_argument, 0, 'l'},
                         {0, 0, 0, 0}
                 };
                 int idx = 0;
                 int opt = getopt_long(argc, argv, BENCH_OPT_SHORT "b:usl:", options, &idx);
+                int opt = getopt_long(argc, argv, BENCH_OPT_SHORT "b:usekil:", options, &idx);
                 const char * arg = optarg ? optarg : "";
 …
                                 flags |= CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS;
                                 break;
+                        case 'e':
+                                flags |= CFA_CLUSTER_IO_EAGER_SUBMITS;
+                                break;
+                        case 'k':
+                                flags |= CFA_CLUSTER_IO_KERNEL_POLL_SUBMITS;
+                                break;
+                        case 'i':
+                                flags |= CFA_CLUSTER_IO_KERNEL_POLL_COMPLETES;
+                                break;
                         case 'l':
                                 sublen = strtoul(arg, &end, 10);
 …
                                 fprintf( stderr, "  -u, --userthread         If set, cluster uses user-thread to poll I/O\n" );
                                 fprintf( stderr, "  -s, --submitthread       If set, cluster uses polling thread to submit I/O\n" );
+                                fprintf( stderr, "  -e, --eagersubmit        If set, cluster submits I/O eagerly but still aggregates submits\n" );
+                                fprintf( stderr, "  -k, --kpollsubmit        If set, cluster uses IORING_SETUP_SQPOLL\n" );
+                                fprintf( stderr, "  -i, --kpollcomplete      If set, cluster uses IORING_SETUP_IOPOLL\n" );
+                                fprintf( stderr, "  -l, --submitlength=LEN   Max number of submitions that can be submitted together\n" );
                                 exit(EXIT_FAILURE);
+                }

libcfa/src/concurrency/io.cfa

-                      r49cad912
+                      r3a32b3a
                 volatile uint32_t * head;
                 volatile uint32_t * tail;
+                volatile uint32_t prev_head;
                 // The actual kernel ring which uses head/tail
 …
                 __spinlock_t lock;
+                __spinlock_t release_lock;
                 // A buffer of sqes (not the actual ring)
 …
 //=============================================================================================
         void __kernel_io_startup( cluster & this, unsigned io_flags, bool main_cluster ) {
+                if( (io_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS) && (io_flags & CFA_CLUSTER_IO_EAGER_SUBMITS) ) {
+                        abort("CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS and CFA_CLUSTER_IO_EAGER_SUBMITS cannot be mixed\n");
+                }
                 this.io = malloc();
 …
                 struct io_uring_params params;
                 memset(&params, 0, sizeof(params));
+                if( io_flags & CFA_CLUSTER_IO_KERNEL_POLL_SUBMITS   ) params.flags |= IORING_SETUP_SQPOLL;
+                if( io_flags & CFA_CLUSTER_IO_KERNEL_POLL_COMPLETES ) params.flags |= IORING_SETUP_IOPOLL;
                 uint32_t nentries = entries_per_cluster();
 …
                 sq.dropped = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.dropped);
                 sq.array   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);
+                sq.prev_head = *sq.head;
+                {
 …
+                }
+                if( io_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
+                (sq.lock){};
+                (sq.release_lock){};
+                if( io_flags & ( CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS | CFA_CLUSTER_IO_EAGER_SUBMITS ) ) {
                         /* paranoid */ verify( is_pow2( io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET ) || ((io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET) < 8)  );
                         sq.ready_cnt = max(io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET, 8);
 …
 // I/O Polling
 //=============================================================================================
+        static unsigned __collect_submitions( struct __io_data & ring );
+        static uint32_t __release_consumed_submission( struct __io_data & ring );
         // Process a single completion message from the io_uring
         // This is NOT thread-safe
         static [int, bool] __drain_io( & struct __io_data ring, * sigset_t mask, int waitcnt, bool in_kernel ) {
+                /* paranoid */ verify( !kernelTLS.preemption_state.enabled );
+                const uint32_t smask = *ring.submit_q.mask;
                 unsigned to_submit = 0;
                 if( ring.cltr_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
                         // If the poller thread also submits, then we need to aggregate the submissions which are ready
+                        uint32_t tail = *ring.submit_q.tail;
+                        const uint32_t mask = *ring.submit_q.mask;
+                        // Go through the list of ready submissions
+                        for( i; ring.submit_q.ready_cnt ) {
+                                // replace any submission with the sentinel, to consume it.
+                                uint32_t idx = __atomic_exchange_n( &ring.submit_q.ready[i], -1ul32, __ATOMIC_RELAXED);
+                                // If it was already the sentinel, then we are done
+                                if( idx == -1ul32 ) continue;
+                                // If we got a real submission, append it to the list
+                                ring.submit_q.array[ (tail + to_submit) & mask ] = idx & mask;
+                                to_submit++;
+                        }
+                        // Increment the tail based on how many we are ready to submit
+                        __atomic_fetch_add(ring.submit_q.tail, to_submit, __ATOMIC_SEQ_CST);
+                }
+                const uint32_t smask = *ring.submit_q.mask;
+                uint32_t shead = *ring.submit_q.head;
+                int ret = syscall( __NR_io_uring_enter, ring.fd, to_submit, waitcnt, IORING_ENTER_GETEVENTS, mask, _NSIG / 8);
+                if( ret < 0 ) {
+                        switch((int)errno) {
+                        case EAGAIN:
+                        case EINTR:
+                                return -EAGAIN;
+                        default:
+                                abort( "KERNEL ERROR: IO_URING WAIT - %s\n", strerror(errno) );
+                        }
+                }
+                // Release the consumed SQEs
+                for( i; ret ) {
+                        uint32_t idx = ring.submit_q.array[ (i + shead) & smask ];
+                        ring.submit_q.sqes[ idx ].user_data = 0;
+                }
+                uint32_t avail = 0;
+                uint32_t sqe_num = *ring.submit_q.num;
+                for(i; sqe_num) {
+                        if( ring.submit_q.sqes[ i ].user_data == 0 ) avail++;
+                }
+                // update statistics
+                #if !defined(__CFA_NO_STATISTICS__)
+                        __tls_stats()->io.submit_q.submit_avg.rdy += to_submit;
+                        __tls_stats()->io.submit_q.submit_avg.csm += ret;
+                        __tls_stats()->io.submit_q.submit_avg.avl += avail;
+                        __tls_stats()->io.submit_q.submit_avg.cnt += 1;
+                #endif
+                        to_submit = __collect_submitions( ring );
+                }
+                if (to_submit > 0 || waitcnt > 0) {
+                        int ret = syscall( __NR_io_uring_enter, ring.fd, to_submit, waitcnt, IORING_ENTER_GETEVENTS, mask, _NSIG / 8);
+                        if( ret < 0 ) {
+                                switch((int)errno) {
+                                case EAGAIN:
+                                case EINTR:
+                                        return [0, true];
+                                default:
+                                        abort( "KERNEL ERROR: IO_URING WAIT - %s\n", strerror(errno) );
+                                }
+                        }
+                        // Release the consumed SQEs
+                        __release_consumed_submission( ring );
+                        // update statistics
+                        __STATS__( true,
+                                if( to_submit > 0 ) {
+                                        io.submit_q.submit_avg.rdy += to_submit;
+                                        io.submit_q.submit_avg.csm += ret;
+                                        io.submit_q.submit_avg.cnt += 1;
+                                }
+                        )
+                }
+                // Memory barrier
+                __atomic_thread_fence( __ATOMIC_SEQ_CST );
                 // Drain the queue
 …
                 const uint32_t mask = *ring.completion_q.mask;
-                // Memory barrier
-                __atomic_thread_fence( __ATOMIC_SEQ_CST );
                 // Nothing was new return 0
                 if (head == tail) {
                         return 0;
+                        return [0, to_submit > 0];
+                }
                 uint32_t count = tail - head;
+                /* paranoid */ verify( count != 0 );
                 for(i; count) {
                         unsigned idx = (head + i) & mask;
 …
                                 // Update statistics
                                 #if !defined(__CFA_NO_STATISTICS__)
                                         __tls_stats()->io.complete_q.completed_avg.val += count;
                                         __tls_stats()->io.complete_q.completed_avg.slow_cnt += 1;
                                 #endif
+                                __STATS__( true,
+                                        io.complete_q.completed_avg.val += count;
+                                        io.complete_q.completed_avg.slow_cnt += 1;
+                                )
                                 if(again) {
 …
                                 int count;
                                 bool again;
                                 [count, again] = __drain_io( ring, &mask, 0, true );
+                                [count, again] = __drain_io( ring, &mask, 1, true );
                                 // Update statistics
                                 #if !defined(__CFA_NO_STATISTICS__)
                                         __tls_stats()->io.complete_q.completed_avg.val += count;
                                         __tls_stats()->io.complete_q.completed_avg.slow_cnt += 1;
                                 #endif
+                                __STATS__( true,
+                                        io.complete_q.completed_avg.val += count;
+                                        io.complete_q.completed_avg.slow_cnt += 1;
+                                )
+                        }
+                }
 …
                                 // Update statistics
                                 #if !defined(__CFA_NO_STATISTICS__)
                                         __tls_stats()->io.complete_q.completed_avg.val += count;
                                         __tls_stats()->io.complete_q.completed_avg.fast_cnt += 1;
                                 #endif
+                                __STATS__( true,
+                                        io.complete_q.completed_avg.val += count;
+                                        io.complete_q.completed_avg.fast_cnt += 1;
+                                )
                         enable_interrupts( __cfaabi_dbg_ctx );
 …
 // Submission steps :
+// 1 - We need to make sure we don't overflow any of the buffer, P(ring.submit) to make sure
+//     entries are available. The semaphore make sure that there is no more operations in
+//     progress then the number of entries in the buffer. This probably limits concurrency
+//     more than necessary since submitted but not completed operations don't need any
+//     entries in user space. However, I don't know what happens if we overflow the buffers
+//     because too many requests completed at once. This is a safe approach in all cases.
+//     Furthermore, with hundreds of entries, this may be okay.
+//
+// 2 - Allocate a queue entry. The ring already has memory for all entries but only the ones
+// 1 - Allocate a queue entry. The ring already has memory for all entries but only the ones
 //     listed in sq.array are visible by the kernel. For those not listed, the kernel does not
 //     offer any assurance that an entry is not being filled by multiple flags. Therefore, we
 //     need to write an allocator that allows allocating concurrently.
 //
 // 3 - Actually fill the submit entry, this is the only simple and straightforward step.
+// 2 - Actually fill the submit entry, this is the only simple and straightforward step.
 //
 // 4 - Append the entry index to the array and adjust the tail accordingly. This operation
+// 3 - Append the entry index to the array and adjust the tail accordingly. This operation
 //     needs to reach consensus on two things at the same time:
 //     A - The order in which entries are listed in the array: no two threads must pick the
 …
         [* struct io_uring_sqe, uint32_t] __submit_alloc( struct __io_data & ring, uint64_t data ) {
+                verify( data != 0 );
+                /* paranoid */ verify( data != 0 );
                 // Prepare the data we need
 …
+                                {
                                         // update statistics
+                                        #if !defined(__CFA_NO_STATISTICS__)
+                                                disable_interrupts();
+                                                        __tls_stats()->io.submit_q.alloc_avg.val   += len;
+                                                        __tls_stats()->io.submit_q.alloc_avg.block += block;
+                                                        __tls_stats()->io.submit_q.alloc_avg.cnt   += 1;
+                                                enable_interrupts( __cfaabi_dbg_ctx );
+                                        #endif
+                                        __STATS__( false,
+                                                io.submit_q.alloc_avg.val   += len;
+                                                io.submit_q.alloc_avg.block += block;
+                                                io.submit_q.alloc_avg.cnt   += 1;
+                                        )
 …
                         block++;
+                        yield();
+                        if( try_lock(ring.submit_q.lock __cfaabi_dbg_ctx2) ) {
+                                __release_consumed_submission( ring );
+                                unlock( ring.submit_q.lock );
+                        }
+                        else {
+                                yield();
+                        }
+                }
                 // update statistics
+                #if !defined(__CFA_NO_STATISTICS__)
+                disable_interrupts();
+                        __tls_stats()->io.submit_q.look_avg.val   += len;
+                        __tls_stats()->io.submit_q.look_avg.block += block;
+                        __tls_stats()->io.submit_q.look_avg.cnt   += 1;
+                enable_interrupts( __cfaabi_dbg_ctx );
+                #endif
+                __STATS__( false,
+                        io.submit_q.look_avg.val   += len;
+                        io.submit_q.look_avg.block += block;
+                        io.submit_q.look_avg.cnt   += 1;
+                )
                 return picked;
 …
                 if( ring.cltr_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
                         // If the poller thread submits, then we just need to add this to the ready array
                         __submit_to_ready_array( ring, idx, mask );
 …
                         __cfadbg_print_safe( io, "Kernel I/O : Added %u to ready for %p\n", idx, active_thread() );
+                }
+                else if( ring.cltr_flags & CFA_CLUSTER_IO_EAGER_SUBMITS ) {
+                        uint32_t picked = __submit_to_ready_array( ring, idx, mask );
+                        for() {
+                                yield();
+                                // If some one else collected our index, we are done
+                                #warning ABA problem
+                                if( ring.submit_q.ready[picked] != idx ) {
+                                        __STATS__( false,
+                                                io.submit_q.helped += 1;
+                                        )
+                                        return;
+                                }
+                                if( try_lock(ring.submit_q.lock __cfaabi_dbg_ctx2) ) {
+                                        __STATS__( false,
+                                                io.submit_q.leader += 1;
+                                        )
+                                        break;
+                                }
+                                __STATS__( false,
+                                        io.submit_q.busy += 1;
+                                )
+                        }
+                        // We got the lock
+                        unsigned to_submit = __collect_submitions( ring );
+                        int ret = syscall( __NR_io_uring_enter, ring.fd, to_submit, 0, 0, 0p, _NSIG / 8);
+                        if( ret < 0 ) {
+                                switch((int)errno) {
+                                case EAGAIN:
+                                case EINTR:
+                                        unlock(ring.submit_q.lock);
+                                        return;
+                                default:
+                                        abort( "KERNEL ERROR: IO_URING WAIT - %s\n", strerror(errno) );
+                                }
+                        }
+                        /* paranoid */ verify( ret > 0 );
+                        // Release the consumed SQEs
+                        __release_consumed_submission( ring );
+                        // update statistics
+                        __STATS__( true,
+                                io.submit_q.submit_avg.rdy += to_submit;
+                                io.submit_q.submit_avg.csm += ret;
+                                io.submit_q.submit_avg.cnt += 1;
+                        )
+                        unlock(ring.submit_q.lock);
+                }
                 else {
 …
                         ring.submit_q.array[ (*tail) & mask ] = idx & mask;
                         __atomic_fetch_add(tail, 1ul32, __ATOMIC_SEQ_CST);
+                        /* paranoid */ verify( ring.submit_q.sqes[ idx ].user_data != 0 );
                         // Submit however, many entries need to be submitted
 …
                         // update statistics
+                        #if !defined(__CFA_NO_STATISTICS__)
+                                __tls_stats()->io.submit_q.submit_avg.csm += 1;
+                                __tls_stats()->io.submit_q.submit_avg.cnt += 1;
+                        #endif
+                        ring.submit_q.sqes[ idx & mask ].user_data = 0;
+                        __STATS__( false,
+                                io.submit_q.submit_avg.csm += 1;
+                                io.submit_q.submit_avg.cnt += 1;
+                        )
+                        // Release the consumed SQEs
+                        __release_consumed_submission( ring );
                         unlock(ring.submit_q.lock);
 …
+                }
+        }
+        static unsigned __collect_submitions( struct __io_data & ring ) { // Drains the ready array into the submit array, advances the submit tail, and returns the number of entries moved.
+                /* paranoid */ verify( ring.submit_q.ready != 0p ); // the ready array must exist
+                /* paranoid */ verify( ring.submit_q.ready_cnt > 0 ); // and must have at least one slot
+                unsigned to_submit = 0; // number of indices moved so far
+                uint32_t tail = *ring.submit_q.tail; // plain snapshot of the tail; NOTE(review): the eager-submit caller holds submit_q.lock here, presumably all callers must be serialized — confirm
+                const uint32_t mask = *ring.submit_q.mask;
+                // Scan every slot of the ready array (all ready_cnt of them, not just up to the first empty one)
+                for( i; ring.submit_q.ready_cnt ) {
+                        // Atomically swap the slot with the sentinel (-1ul32) so each pending index is consumed exactly once.
+                        uint32_t idx = __atomic_exchange_n( &ring.submit_q.ready[i], -1ul32, __ATOMIC_RELAXED);
+                        // If the slot already held the sentinel it was empty: skip it and keep scanning
+                        if( idx == -1ul32 ) continue;
+                        // Otherwise we got a real submission: write it at the next free position after the snapshot tail
+                        ring.submit_q.array[ (tail + to_submit) & mask ] = idx & mask;
+                        to_submit++;
+                }
+                // Publish all collected entries at once by advancing the tail by the number collected (may be 0)
+                __atomic_fetch_add(ring.submit_q.tail, to_submit, __ATOMIC_SEQ_CST);
+                return to_submit;
+        }
+        static uint32_t __release_consumed_submission( struct __io_data & ring ) { // Zeroes user_data on every SQE listed between the previously recorded head and the current head; returns how many were cleared, or 0 if release_lock could not be taken.
+                const uint32_t smask = *ring.submit_q.mask;
+                if( !try_lock(ring.submit_q.release_lock __cfaabi_dbg_ctx2) ) return 0; // never blocks: if the lock is busy, give up and report nothing released
+                uint32_t chead = *ring.submit_q.head; // head as currently published in the ring
+                uint32_t phead = ring.submit_q.prev_head; // head value recorded by the previous successful call
+                ring.submit_q.prev_head = chead; // record the new head under the lock so the range [phead, chead) is handed to exactly one caller
+                unlock(ring.submit_q.release_lock); // the clearing loop below runs after the lock is dropped
+                uint32_t count = chead - phead; // uint32_t subtraction, so the distance is still right if head wrapped around
+                for( i; count ) {
+                        uint32_t idx = ring.submit_q.array[ (phead + i) & smask ]; // SQE index stored at this ring position
+                        ring.submit_q.sqes[ idx ].user_data = 0; // NOTE(review): presumably user_data == 0 marks the SQE as free for the allocator — confirm against __submit_alloc
+                }
+                return count;
+        }
 #endif

libcfa/src/concurrency/kernel.hfa

-                      r49cad912
+                      r3a32b3a
 struct __io_data;
+#define CFA_CLUSTER_IO_POLLER_USER_THREAD    1 << 0 // 0x1
+#define CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS 1 << 1 // 0x2
+// #define CFA_CLUSTER_IO_POLLER_KERNEL_SIDE 1 << 2 // 0x4
+#define CFA_CLUSTER_IO_POLLER_USER_THREAD    (1 << 0) // 0x01
+#define CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS (1 << 1) // 0x02
+#define CFA_CLUSTER_IO_EAGER_SUBMITS         (1 << 2) // 0x04
+#define CFA_CLUSTER_IO_KERNEL_POLL_SUBMITS   (1 << 3) // 0x08
+#define CFA_CLUSTER_IO_KERNEL_POLL_COMPLETES (1 << 4) // 0x10
 #define CFA_CLUSTER_IO_BUFFLEN_OFFSET        16

libcfa/src/concurrency/kernel_private.hfa

-                      r49cad912
+                      r3a32b3a
 // Statics call at the end of each thread to register statistics
 #if !defined(__CFA_NO_STATISTICS__)
+static inline struct __stats_t * __tls_stats() { // NOTE(review): identical to the indented copy below; looks like the pre-change lines of this diff -- confirm
+        /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+        /* paranoid */ verify( kernelTLS.this_stats );
+        return kernelTLS.this_stats;
+}
+        static inline struct __stats_t * __tls_stats() { // Returns the statistics pointer stored in kernelTLS
+                /* paranoid */ verify( ! kernelTLS.preemption_state.enabled ); // must be called with preemption already disabled
+                /* paranoid */ verify( kernelTLS.this_stats ); // the stats pointer must be set
+                return kernelTLS.this_stats;
+        }
+        #define __STATS__(in_kernel, ...) { /* runs the given statements with the fields of *__tls_stats() opened in scope */ \
+                if( !(in_kernel) ) disable_interrupts(); /* presumably satisfies the preemption-disabled check in __tls_stats -- confirm */ \
+                with( *__tls_stats() ) { \
+                        __VA_ARGS__ \
+                } \
+                if( !(in_kernel) ) enable_interrupts( __cfaabi_dbg_ctx ); /* re-enabled under the same condition; note in_kernel is evaluated twice */ \
+        }
+#else
+        #define __STATS__(in_kernel, ...) // statistics compiled out: the macro expands to nothing
 #endif

libcfa/src/concurrency/preemption.cfa

r49cad912	r3a32b3a
186	186	void enable_interrupts( __cfaabi_dbg_ctx_param ) {
187	187	processor * proc = kernelTLS.this_processor; // Cache the processor now since interrupts can start happening after the atomic store
	188	/* paranoid */ verify( proc );
188	189
189	190	with( kernelTLS.preemption_state ){

libcfa/src/concurrency/stats.cfa

-                      r49cad912
+                      r3a32b3a
                         stats->io.submit_q.submit_avg.rdy = 0;
                         stats->io.submit_q.submit_avg.csm = 0;
-                        stats->io.submit_q.submit_avg.avl = 0;
                         stats->io.submit_q.submit_avg.cnt = 0;
                         stats->io.submit_q.look_avg.val   = 0;
 …
                         stats->io.submit_q.alloc_avg.cnt   = 0;
                         stats->io.submit_q.alloc_avg.block = 0;
+                        stats->io.submit_q.helped = 0;
+                        stats->io.submit_q.leader = 0;
+                        stats->io.submit_q.busy   = 0;
                         stats->io.complete_q.completed_avg.val = 0;
                         stats->io.complete_q.completed_avg.slow_cnt = 0;
 …
                         __atomic_fetch_add( &cltr->io.submit_q.alloc_avg.cnt           , proc->io.submit_q.alloc_avg.cnt           , __ATOMIC_SEQ_CST );
                         __atomic_fetch_add( &cltr->io.submit_q.alloc_avg.block         , proc->io.submit_q.alloc_avg.block         , __ATOMIC_SEQ_CST );
+                        __atomic_fetch_add( &cltr->io.submit_q.helped                  , proc->io.submit_q.helped                  , __ATOMIC_SEQ_CST );
+                        __atomic_fetch_add( &cltr->io.submit_q.leader                  , proc->io.submit_q.leader                  , __ATOMIC_SEQ_CST );
+                        __atomic_fetch_add( &cltr->io.submit_q.busy                    , proc->io.submit_q.busy                    , __ATOMIC_SEQ_CST );
                         __atomic_fetch_add( &cltr->io.complete_q.completed_avg.val     , proc->io.complete_q.completed_avg.val     , __ATOMIC_SEQ_CST );
                         __atomic_fetch_add( &cltr->io.complete_q.completed_avg.slow_cnt, proc->io.complete_q.completed_avg.slow_cnt, __ATOMIC_SEQ_CST );
 …
                                 double avgrdy = ((double)io.submit_q.submit_avg.rdy) / io.submit_q.submit_avg.cnt;
                                 double avgcsm = ((double)io.submit_q.submit_avg.csm) / io.submit_q.submit_avg.cnt;
-                                double avgavl = ((double)io.submit_q.submit_avg.avl) / io.submit_q.submit_avg.cnt;
                                 double lavgv = 0;
 …
                                         "- avg ready entries      : %'18.2lf\n"
                                         "- avg submitted entries  : %'18.2lf\n"
+                                        "- avg available entries  : %'18.2lf\n"
+                                        "- total helped entries   : %'15" PRIu64 "\n"
+                                        "- total leader entries   : %'15" PRIu64 "\n"
+                                        "- total busy submit      : %'15" PRIu64 "\n"
                                         "- total ready search     : %'15" PRIu64 "\n"
                                         "- avg ready search len   : %'18.2lf\n"
 …
                                         , cluster ? "Cluster" : "Processor",  name, id
                                         , io.submit_q.submit_avg.cnt
+                                        , avgrdy, avgcsm, avgavl
+                                        , avgrdy, avgcsm
+                                        , io.submit_q.helped, io.submit_q.leader, io.submit_q.busy
                                         , io.submit_q.look_avg.cnt
                                         , lavgv, lavgb

libcfa/src/concurrency/stats.hfa

-                      r49cad912
+                      r3a32b3a
                                         volatile uint64_t block;
                                 } alloc_avg;
+                                volatile uint64_t helped;
+                                volatile uint64_t leader;
+                                volatile uint64_t busy;
                         } submit_q;
                         struct {

Context Navigation

Changes in / [49cad912:3a32b3a]

Legend:

benchmark/io/batch-readv.c

benchmark/io/readv.cfa

libcfa/src/concurrency/io.cfa

libcfa/src/concurrency/kernel.hfa

libcfa/src/concurrency/kernel_private.hfa

libcfa/src/concurrency/preemption.cfa

libcfa/src/concurrency/stats.cfa

libcfa/src/concurrency/stats.hfa

Download in other formats: