Diff [2802824140076a430c44546624325795e3ff40dd:33e62f1b67260ca777fa767e6ee88452df8e052c] for / – Cforall

benchmark/io/readv.cfa

-              r2802824
+              r33e62f1b
         unsigned long int nthreads = 2;
         unsigned long int nprocs   = 1;
+        unsigned flags = 0;
+        unsigned sublen = 16;
+        int flags = 0;
         arg_loop:
         for(;;) {
                 static struct option options[] = {
+                        {"duration",     required_argument, 0, 'd'},
+                        {"nthreads",     required_argument, 0, 't'},
+                        {"nprocs",       required_argument, 0, 'p'},
+                        {"bufsize",      required_argument, 0, 'b'},
+                        {"userthread",   no_argument      , 0, 'u'},
+                        {"submitthread", no_argument      , 0, 's'},
+                        {"submitlength", required_argument, 0, 'l'},
+                        {"duration",   required_argument, 0, 'd'},
+                        {"nthreads",   required_argument, 0, 't'},
+                        {"nprocs",     required_argument, 0, 'p'},
+                        {"bufsize",    required_argument, 0, 'b'},
+                        {"userthread", no_argument      , 0, 'u'},
                         {0, 0, 0, 0}
                 };
                 int idx = 0;
                 int opt = getopt_long(argc, argv, "d:t:p:b:usl:", options, &idx);
+                int opt = getopt_long(argc, argv, "d:t:p:b:u", options, &idx);
                 const char * arg = optarg ? optarg : "";
 …
                                 flags |= CFA_CLUSTER_IO_POLLER_USER_THREAD;
                                 break;
-                        case 's':
-                                flags |= CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS;
-                                break;
-                        case 'l':
-                                sublen = strtoul(arg, &end, 10);
-                                if(*end != '\0' && sublen < 16) {
-                                        fprintf(stderr, "Submit length must be at least 16, was %s\n", arg);
-                                        goto usage;
+                                }
-                                flags |= (sublen << CFA_CLUSTER_IO_BUFFLEN_OFFSET);
-                                break;
                         // Other cases
                         default: /* ? */
 …
                                 fprintf(stderr, "  -p, --nprocs=NPROCS      Number of kernel threads\n");
                                 fprintf(stderr, "  -b, --buflen=SIZE        Number of bytes to read per request\n");
-                                fprintf(stderr, "  -u, --userthread         If set, cluster uses user-thread to poll I/O\n");
-                                fprintf(stderr, "  -s, --submitthread       If set, cluster uses polling thread to submit I/O\n");
                                 exit(EXIT_FAILURE);
+                }

libcfa/src/concurrency/io.cfa

-              r2802824
+              r33e62f1b
 #include "kernel.hfa"
-#include "bitmanip.hfa"
 #if !defined(HAVE_LINUX_IO_URING_H)
         void __kernel_io_startup( cluster &, unsigned, bool ) {
+        void __kernel_io_startup( cluster &, int, bool ) {
                 // Nothing to do without io_uring
+        }
 …
         struct __io_poller_fast {
                 struct __io_data * ring;
+                bool waiting;
                 $thread thrd;
         };
 …
         void ?{}( __io_poller_fast & this, struct cluster & cltr ) {
                 this.ring = cltr.io;
+                this.waiting = true;
                 (this.thrd){ "Fast I/O Poller", cltr };
+        }
 …
                 // Like head/tail but not seen by the kernel
                 volatile uint32_t alloc;
+                volatile uint32_t * ready;
+                uint32_t ready_cnt;
+                volatile uint32_t ready;
                 __spinlock_t lock;
 …
                                         volatile unsigned long long int block;
                                 } submit_avg;
-                                struct {
-                                        volatile unsigned long long int val;
-                                        volatile unsigned long long int cnt;
-                                        volatile unsigned long long int block;
-                                } look_avg;
                         } stats;
                 #endif
 …
                                 void * stack;
                                 pthread_t kthrd;
-                                volatile bool blocked;
                         } slow;
                         __io_poller_fast fast;
 …
 // I/O Startup / Shutdown logic
 //=============================================================================================
         void __kernel_io_startup( cluster & this, unsigned io_flags, bool main_cluster ) {
+        void __kernel_io_startup( cluster & this, int io_flags, bool main_cluster ) {
                 this.io = malloc();
 …
                 sq.array   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);
                 sq.alloc = *sq.tail;
+                if( io_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
+                        /* paranoid */ verify( is_pow2( io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET ) || ((io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET) < 8)  );
+                        sq.ready_cnt = max(io_flags >> CFA_CLUSTER_IO_BUFFLEN_OFFSET, 8);
+                        sq.ready = alloc_align( 64, sq.ready_cnt );
+                        for(i; sq.ready_cnt) {
+                                sq.ready[i] = -1ul32;
+                        }
+                }
+                else {
+                        sq.ready_cnt = 0;
+                        sq.ready = 0p;
+                }
+                sq.ready = *sq.tail;
                 // completion queue
 …
                         this.io->submit_q.stats.submit_avg.cnt   = 0;
                         this.io->submit_q.stats.submit_avg.block = 0;
-                        this.io->submit_q.stats.look_avg.val   = 0;
-                        this.io->submit_q.stats.look_avg.cnt   = 0;
-                        this.io->submit_q.stats.look_avg.block = 0;
                         this.io->completion_q.stats.completed_avg.val = 0;
                         this.io->completion_q.stats.completed_avg.slow_cnt = 0;
 …
                 // Create the poller thread
                 __cfadbg_print_safe(io_core, "Kernel I/O : Creating slow poller for cluter %p\n", &this);
-                this.io->poller.slow.blocked = false;
                 this.io->poller.slow.stack = __create_pthread( &this.io->poller.slow.kthrd, __io_poller_slow, &this );
+        }
 …
                 if( this.io->cltr_flags & CFA_CLUSTER_IO_POLLER_USER_THREAD ) {
                         with( this.io->poller.fast ) {
+                                /* paranoid */ verify( waiting ); // The thread shouldn't be in a system call
                                 /* paranoid */ verify( this.procs.head == 0p || &this == mainCluster );
                                 /* paranoid */ verify( this.idles.head == 0p || &this == mainCluster );
                                 // We need to adjust the clean-up based on where the thread is
                                 if( thrd.state == Ready || thrd.preempted != __NO_PREEMPTION ) {
+                                if( thrd.preempted != __NO_PREEMPTION ) {
                                         // This is the tricky case
                                         // The thread was preempted and now it is on the ready queue
+                                        /* paranoid */ verify( thrd.state == Active );           // The thread better be in this state
                                         /* paranoid */ verify( thrd.next != 0p );                // The thread should be the last on the list
                                         /* paranoid */ verify( this.ready_queue.head == &thrd ); // The thread should be the only thing on the list
 …
                         if(this.print_stats) {
                                 with(this.io->submit_q.stats, this.io->completion_q.stats) {
+                                        double lavgv = 0;
+                                        double lavgb = 0;
+                                        if(look_avg.cnt != 0) {
+                                                lavgv = ((double)look_avg.val  ) / look_avg.cnt;
+                                                lavgb = ((double)look_avg.block) / look_avg.cnt;
+                                        }
+                                        __cfaabi_bits_print_safe( STDOUT_FILENO,
+                                        __cfaabi_bits_print_safe( STDERR_FILENO,
                                                 "----- I/O uRing Stats -----\n"
+                                                "- total submit calls     : %'15llu\n"
+                                                "- avg submit             : %'18.2lf\n"
+                                                "- pre-submit block %%     : %'18.2lf\n"
+                                                "- total ready search     : %'15llu\n"
+                                                "- avg ready search len   : %'18.2lf\n"
+                                                "- avg ready search block : %'18.2lf\n"
+                                                "- total wait calls       : %'15llu   (%'llu slow, %'llu fast)\n"
+                                                "- avg completion/wait    : %'18.2lf\n",
+                                                "- total submit calls  : %'15llu\n"
+                                                "- avg submit          : %'18.2lf\n"
+                                                "- pre-submit block %%  : %'18.2lf\n"
+                                                "- total wait calls    : %'15llu   (%'llu slow, %'llu fast)\n"
+                                                "- avg completion/wait : %'18.2lf\n",
                                                 submit_avg.cnt,
                                                 ((double)submit_avg.val) / submit_avg.cnt,
                                                 (100.0 * submit_avg.block) / submit_avg.cnt,
-                                                look_avg.cnt,
-                                                lavgv,
-                                                lavgb,
                                                 completed_avg.slow_cnt + completed_avg.fast_cnt,
                                                 completed_avg.slow_cnt,  completed_avg.fast_cnt,
 …
                 close(this.io->fd);
-                free( this.io->submit_q.ready ); // Maybe null, doesn't matter
                 free( this.io );
+        }
 …
         // Process a single completion message from the io_uring
         // This is NOT thread-safe
+        static [int, bool] __drain_io( & struct __io_data ring, * sigset_t mask, int waitcnt, bool in_kernel ) {
+                unsigned to_submit = 0;
+                if( ring.cltr_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
+                        // If the poller thread also submits, then we need to aggregate the submissions which are ready
+                        uint32_t * tail = ring.submit_q.tail;
+                        const uint32_t mask = *ring.submit_q.mask;
+                        // Go through the list of ready submissions
+                        for( i; ring.submit_q.ready_cnt ) {
+                                // replace any submission with the sentinel, to consume it.
+                                uint32_t idx = __atomic_exchange_n( &ring.submit_q.ready[i], -1ul32, __ATOMIC_RELAXED);
+                                // If it was already the sentinel, then we are done
+                                if( idx == -1ul32 ) continue;
+                                // If we got a real submission, append it to the list
+                                ring.submit_q.array[ ((*tail) + to_submit) & mask ] = idx & mask;
+                                to_submit++;
+                        }
+                        // Increment the tail based on how many we are ready to submit
+                        __atomic_fetch_add(tail, to_submit, __ATOMIC_SEQ_CST);
+                        // update statistics
+                        #if !defined(__CFA_NO_STATISTICS__)
+                                ring.submit_q.stats.submit_avg.val += to_submit;
+                                ring.submit_q.stats.submit_avg.cnt += 1;
+                        #endif
+                }
+                int ret = syscall( __NR_io_uring_enter, ring.fd, to_submit, waitcnt, IORING_ENTER_GETEVENTS, mask, _NSIG / 8);
+        static int __drain_io( struct __io_data & ring, sigset_t * mask, int waitcnt, bool in_kernel ) {
+                int ret = syscall( __NR_io_uring_enter, ring.fd, 0, waitcnt, IORING_ENTER_GETEVENTS, mask, _NSIG / 8);
                 if( ret < 0 ) {
                         switch((int)errno) {
 …
                 __atomic_fetch_add( ring.completion_q.head, count, __ATOMIC_RELAXED );
                 return [count, count > 0 || to_submit > 0];
+                return count;
+        }
 …
                 if( ring.cltr_flags & CFA_CLUSTER_IO_POLLER_USER_THREAD ) {
                         while(!__atomic_load_n(&ring.done, __ATOMIC_SEQ_CST)) {
-                                __atomic_store_n( &ring.poller.slow.blocked, true, __ATOMIC_SEQ_CST );
                                 // In the user-thread approach drain and if anything was drained,
                                 // baton pass to the user-thread
+                                int count;
+                                bool again;
+                                [count, again] = __drain_io( ring, &mask, 1, true );
+                                __atomic_store_n( &ring.poller.slow.blocked, false, __ATOMIC_SEQ_CST );
+                                int count = __drain_io( ring, &mask, 1, true );
                                 // Update statistics
 …
                                 #endif
                                 if(again) {
+                                if(count > 0) {
                                         __cfadbg_print_safe(io_core, "Kernel I/O : Moving to ring %p to fast poller\n", &ring);
                                         __unpark( &ring.poller.fast.thrd __cfaabi_dbg_ctx2 );
 …
                         while(!__atomic_load_n(&ring.done, __ATOMIC_SEQ_CST)) {
                                 //In the naive approach, just poll the io completion queue directly
+                                int count;
+                                bool again;
+                                [count, again] = __drain_io( ring, &mask, 1, true );
+                                int count = __drain_io( ring, &mask, 1, true );
                                 // Update statistics
 …
                 // Then loop until we need to start
                 while(!__atomic_load_n(&this.ring->done, __ATOMIC_SEQ_CST)) {
                         // Drain the io
+                        int count;
+                        bool again;
+                        [count, again] = __drain_io( *this.ring, 0p, 0, false );
+                        if(!again) reset++;
+                        this.waiting = false;
+                        int count = __drain_io( *this.ring, 0p, 0, false );
+                        reset += count > 0 ? 1 : 0;
                         // Update statistics
 …
                         #endif
                         // If we got something, just yield and check again
+                        this.waiting = true;
                         if(reset < 5) {
+                                // If we got something, just yield and check again
                                 yield();
+                        }
-                        // We didn't get anything baton pass to the slow poller
                         else {
+                                // We didn't get anything baton pass to the slow poller
                                 __cfadbg_print_safe(io_core, "Kernel I/O : Moving to ring %p to slow poller\n", &this.ring);
+                                post( this.ring->poller.sem );
+                                park( __cfaabi_dbg_ctx );
                                 reset = 0;
-                                // wake up the slow poller
-                                post( this.ring->poller.sem );
-                                // park this thread
-                                park( __cfaabi_dbg_ctx );
+                        }
+                }
                 __cfadbg_print_safe(io_core, "Kernel I/O : Fast poller for ring %p stopping\n", &this.ring);
+        }
-        static inline void __wake_poller( struct __io_data & ring ) __attribute__((artificial));
-        static inline void __wake_poller( struct __io_data & ring ) {
-                if(!__atomic_load_n( &ring.poller.slow.blocked, __ATOMIC_SEQ_CST)) return;
-                sigval val = { 1 };
-                pthread_sigqueue( ring.poller.slow.kthrd, SIGUSR1, val );
+        }
 …
                 uint32_t idx = __atomic_fetch_add(&ring.submit_q.alloc, 1ul32, __ATOMIC_SEQ_CST);
+                // Mask the idx now to make everything easier to check
+                idx &= *ring.submit_q.mask;
+                // Validate that we didn't overflow anything
+                // Check that nothing overflowed
+                /* paranoid */ verify( true );
+                // Check that it goes head -> tail -> alloc and never head -> alloc -> tail
+                /* paranoid */ verify( true );
                 // Return the sqe
                 return [&ring.submit_q.sqes[ idx ], idx];
+                return [&ring.submit_q.sqes[ idx & (*ring.submit_q.mask)], idx];
+        }
         static inline void __submit( struct __io_data & ring, uint32_t idx ) {
+                // Get the data we definitely need now
+                uint32_t * const tail = ring.submit_q.tail;
+                // get mutual exclusion
+                lock(ring.submit_q.lock __cfaabi_dbg_ctx2);
+                // Append to the list of ready entries
+                uint32_t * tail = ring.submit_q.tail;
                 const uint32_t mask = *ring.submit_q.mask;
+                // There are 2 submission schemes, check which one we are using
+                if( ring.cltr_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
+                        // If the poller thread submits, then we just need to add this to the ready array
+                        /* paranoid */ verify( idx <= mask   );
+                        /* paranoid */ verify( idx != -1ul32 );
+                        // We need to find a spot in the ready array
+                        __attribute((unused)) int len   = 0;
+                        __attribute((unused)) int block = 0;
+                        uint32_t expected = -1ul32;
+                        uint32_t ready_mask = ring.submit_q.ready_cnt - 1;
+                        uint32_t off = __tls_rand();
+                        LOOKING: for() {
+                                for(i; ring.submit_q.ready_cnt) {
+                                        uint32_t ii = (i + off) & ready_mask;
+                                        if( __atomic_compare_exchange_n( &ring.submit_q.ready[ii], &expected, idx, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED ) ) {
+                                                break LOOKING;
+                                        }
+                                        len ++;
+                                }
+                                block++;
+                                yield();
+                ring.submit_q.array[ (*tail) & mask ] = idx & mask;
+                __atomic_fetch_add(tail, 1ul32, __ATOMIC_SEQ_CST);
+                // Submit however many entries need to be submitted
+                int ret = syscall( __NR_io_uring_enter, ring.fd, 1, 0, 0, 0p, 0);
+                if( ret < 0 ) {
+                        switch((int)errno) {
+                        default:
+                                abort( "KERNEL ERROR: IO_URING SUBMIT - %s\n", strerror(errno) );
+                        }
+                        __wake_poller( ring );
+                        // update statistics
+                        #if !defined(__CFA_NO_STATISTICS__)
+                                __atomic_fetch_add( &ring.submit_q.stats.look_avg.val,   len,   __ATOMIC_RELAXED );
+                                __atomic_fetch_add( &ring.submit_q.stats.look_avg.block, block, __ATOMIC_RELAXED );
+                                __atomic_fetch_add( &ring.submit_q.stats.look_avg.cnt,   1,     __ATOMIC_RELAXED );
+                        #endif
+                        __cfadbg_print_safe( io, "Kernel I/O : Added %u to ready for %p\n", idx, active_thread() );
+                }
+                else {
+                        // get mutual exclusion
+                        lock(ring.submit_q.lock __cfaabi_dbg_ctx2);
+                        // Append to the list of ready entries
+                        /* paranoid */ verify( idx <= mask );
+                        ring.submit_q.array[ (*tail) & mask ] = idx & mask;
+                        __atomic_fetch_add(tail, 1ul32, __ATOMIC_SEQ_CST);
+                        // Submit however many entries need to be submitted
+                        int ret = syscall( __NR_io_uring_enter, ring.fd, 1, 0, 0, 0p, 0);
+                        if( ret < 0 ) {
+                                switch((int)errno) {
+                                default:
+                                        abort( "KERNEL ERROR: IO_URING SUBMIT - %s\n", strerror(errno) );
+                                }
+                        }
+                        // update statistics
+                        #if !defined(__CFA_NO_STATISTICS__)
+                                ring.submit_q.stats.submit_avg.val += 1;
+                                ring.submit_q.stats.submit_avg.cnt += 1;
+                        #endif
+                        unlock(ring.submit_q.lock);
+                        __cfadbg_print_safe( io, "Kernel I/O : Performed io_submit for %p, returned %d\n", active_thread(), ret );
+                }
+                }
+                // update statistics
+                #if !defined(__CFA_NO_STATISTICS__)
+                        ring.submit_q.stats.submit_avg.val += 1;
+                        ring.submit_q.stats.submit_avg.cnt += 1;
+                #endif
+                unlock(ring.submit_q.lock);
+                // Make sure that idx was submitted
+                // Be careful to not get false positive if we cycled the entire list or that someone else submitted for us
+                __cfadbg_print_safe( io, "Kernel I/O : Performed io_submit for %p, returned %d\n", active_thread(), ret );
+        }

libcfa/src/concurrency/kernel.cfa

r2802824	r33e62f1b
258	258	}
259	259
260		void ?{}(cluster & this, const char name[], Duration preemption_rate, ~~unsigned~~ io_flags) with( this ) {
	260	void ?{}(cluster & this, const char name[], Duration preemption_rate, int io_flags) with( this ) {
261	261	this.name = name;
262	262	this.preemption_rate = preemption_rate;

libcfa/src/concurrency/kernel.hfa

-              r2802824
+              r33e62f1b
 struct __io_data;
+#define CFA_CLUSTER_IO_POLLER_USER_THREAD    1 << 0 // 0x1
+#define CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS 1 << 1 // 0x2
+// #define CFA_CLUSTER_IO_POLLER_KERNEL_SIDE 1 << 2 // 0x4
+#define CFA_CLUSTER_IO_BUFFLEN_OFFSET        16
+#define CFA_CLUSTER_IO_POLLER_USER_THREAD 1 << 0
+// #define CFA_CLUSTER_IO_POLLER_KERNEL_SIDE 1 << 1
 …
 extern Duration default_preemption();
 void ?{} (cluster & this, const char name[], Duration preemption_rate, unsigned flags);
+void ?{} (cluster & this, const char name[], Duration preemption_rate, int flags);
 void ^?{}(cluster & this);
 static inline void ?{} (cluster & this)                                           { this{"Anonymous Cluster", default_preemption(), 0}; }
 static inline void ?{} (cluster & this, Duration preemption_rate)                 { this{"Anonymous Cluster", preemption_rate, 0}; }
 static inline void ?{} (cluster & this, const char name[])                        { this{name, default_preemption(), 0}; }
 static inline void ?{} (cluster & this, unsigned flags)                           { this{"Anonymous Cluster", default_preemption(), flags}; }
 static inline void ?{} (cluster & this, Duration preemption_rate, unsigned flags) { this{"Anonymous Cluster", preemption_rate, flags}; }
 static inline void ?{} (cluster & this, const char name[], unsigned flags)        { this{name, default_preemption(), flags}; }
+static inline void ?{} (cluster & this)                                      { this{"Anonymous Cluster", default_preemption(), 0}; }
+static inline void ?{} (cluster & this, Duration preemption_rate)            { this{"Anonymous Cluster", preemption_rate, 0}; }
+static inline void ?{} (cluster & this, const char name[])                   { this{name, default_preemption(), 0}; }
+static inline void ?{} (cluster & this, int flags)                           { this{"Anonymous Cluster", default_preemption(), flags}; }
+static inline void ?{} (cluster & this, Duration preemption_rate, int flags) { this{"Anonymous Cluster", preemption_rate, flags}; }
+static inline void ?{} (cluster & this, const char name[], int flags)        { this{name, default_preemption(), flags}; }
 static inline [cluster *&, cluster *& ] __get( cluster & this ) __attribute__((const)) { return this.node.[next, prev]; }

libcfa/src/concurrency/kernel_private.hfa

r2802824	r33e62f1b
77	77	//-----------------------------------------------------------------------------
78	78	// I/O
79		void __kernel_io_startup ( cluster &, ~~unsigned~~, bool );
	79	void __kernel_io_startup ( cluster &, int, bool );
80	80	void __kernel_io_finish_start( cluster & );
81	81	void __kernel_io_prepare_stop( cluster & );

Context Navigation

Changes in / [2802824:33e62f1b]

Legend:

benchmark/io/readv.cfa

libcfa/src/concurrency/io.cfa

libcfa/src/concurrency/kernel.cfa

libcfa/src/concurrency/kernel.hfa

libcfa/src/concurrency/kernel_private.hfa

Download in other formats: