Context Navigation

← Previous Change
Next Change →

concurrency

Timestamp:

May 22, 2020, 11:49:29 AM (6 years ago)

Author:

Thierry Delisle <tdelisle@…>

Branches:

ADT, arm-eh, ast-experimental, enum, forall-pointer-decay, jacob/cs343-translation, master, new-ast, new-ast-unique-expr, pthread-emulation, qualifiedEnum, stuck-waitfor-destruct

Children:

95cb63b

Parents:

2802824 (diff), 99fea48 (diff)
Note: this is a merge changeset, the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.

Message:

Merge branch 'master' into relaxed_ready

Location:

libcfa/src/concurrency

Files:

2 edited

io.cfa (modified) (18 diffs)
kernel.cfa (modified) (1 diff)

Legend:

Unmodified
Added
Removed

libcfa/src/concurrency/io.cfa

-              r2802824
+              r0e4df2e
                 // Like head/tail but not seen by the kernel
-                volatile uint32_t alloc;
                 volatile uint32_t * ready;
                 uint32_t ready_cnt;
 …
                         struct {
                                 struct {
+                                        volatile unsigned long long int val;
+                                        volatile unsigned long long int rdy;
+                                        volatile unsigned long long int csm;
+                                        volatile unsigned long long int avl;
                                         volatile unsigned long long int cnt;
-                                        volatile unsigned long long int block;
                                 } submit_avg;
                                 struct {
 …
                                         volatile unsigned long long int block;
                                 } look_avg;
+                                struct {
+                                        volatile unsigned long long int val;
+                                        volatile unsigned long long int cnt;
+                                        volatile unsigned long long int block;
+                                } alloc_avg;
                         } stats;
                 #endif
 …
                 sq.dropped = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.dropped);
                 sq.array   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);
+                sq.alloc = *sq.tail;
+                {
+                        const uint32_t num = *sq.num;
+                        for( i; num ) {
+                                sq.sqes[i].user_data = 0ul64;
+                        }
+                }
                 if( io_flags & CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS ) {
 …
                 // Initialize statistics
                 #if !defined(__CFA_NO_STATISTICS__)
+                        this.io->submit_q.stats.submit_avg.val   = 0;
+                        this.io->submit_q.stats.submit_avg.cnt   = 0;
+                        this.io->submit_q.stats.submit_avg.block = 0;
+                        this.io->submit_q.stats.submit_avg.rdy = 0;
+                        this.io->submit_q.stats.submit_avg.csm = 0;
+                        this.io->submit_q.stats.submit_avg.avl = 0;
+                        this.io->submit_q.stats.submit_avg.cnt = 0;
                         this.io->submit_q.stats.look_avg.val   = 0;
                         this.io->submit_q.stats.look_avg.cnt   = 0;
                         this.io->submit_q.stats.look_avg.block = 0;
+                        this.io->submit_q.stats.alloc_avg.val   = 0;
+                        this.io->submit_q.stats.alloc_avg.cnt   = 0;
+                        this.io->submit_q.stats.alloc_avg.block = 0;
                         this.io->completion_q.stats.completed_avg.val = 0;
                         this.io->completion_q.stats.completed_avg.slow_cnt = 0;
 …
                                         this.ready_queue.head = 1p;
                                         thrd.next = 0p;
+                                        __cfaabi_dbg_debug_do( thrd.unpark_stale = true );
                                         // Fixup the thread state
 …
                         if(this.print_stats) {
                                 with(this.io->submit_q.stats, this.io->completion_q.stats) {
+                                        double avgrdy = ((double)submit_avg.rdy) / submit_avg.cnt;
+                                        double avgcsm = ((double)submit_avg.csm) / submit_avg.cnt;
+                                        double avgavl = ((double)submit_avg.avl) / submit_avg.cnt;
                                         double lavgv = 0;
                                         double lavgb = 0;
 …
+                                        }
+                                        double aavgv = 0;
+                                        double aavgb = 0;
+                                        if(alloc_avg.cnt != 0) {
+                                                aavgv = ((double)alloc_avg.val  ) / alloc_avg.cnt;
+                                                aavgb = ((double)alloc_avg.block) / alloc_avg.cnt;
+                                        }
                                         __cfaabi_bits_print_safe( STDOUT_FILENO,
                                                 "----- I/O uRing Stats -----\n"
                                                 "- total submit calls     : %'15llu\n"
+                                                "- avg submit             : %'18.2lf\n"
+                                                "- pre-submit block %%     : %'18.2lf\n"
+                                                "- avg ready entries      : %'18.2lf\n"
+                                                "- avg submitted entries  : %'18.2lf\n"
+                                                "- avg available entries  : %'18.2lf\n"
                                                 "- total ready search     : %'15llu\n"
                                                 "- avg ready search len   : %'18.2lf\n"
                                                 "- avg ready search block : %'18.2lf\n"
+                                                "- total alloc search     : %'15llu\n"
+                                                "- avg alloc search len   : %'18.2lf\n"
+                                                "- avg alloc search block : %'18.2lf\n"
                                                 "- total wait calls       : %'15llu   (%'llu slow, %'llu fast)\n"
                                                 "- avg completion/wait    : %'18.2lf\n",
                                                 submit_avg.cnt,
+                                                ((double)submit_avg.val) / submit_avg.cnt,
+                                                (100.0 * submit_avg.block) / submit_avg.cnt,
+                                                avgrdy,
+                                                avgcsm,
+                                                avgavl,
                                                 look_avg.cnt,
                                                 lavgv,
                                                 lavgb,
+                                                alloc_avg.cnt,
+                                                aavgv,
+                                                aavgb,
                                                 completed_avg.slow_cnt + completed_avg.fast_cnt,
                                                 completed_avg.slow_cnt,  completed_avg.fast_cnt,
 …
                         // If the poller thread also submits, then we need to aggregate the submissions which are ready
                         uint32_t * tail = ring.submit_q.tail;
+                        uint32_t tail = *ring.submit_q.tail;
                         const uint32_t mask = *ring.submit_q.mask;
 …
                                 // If we got a real submission, append it to the list
                                 ring.submit_q.array[ ((*tail) + to_submit) & mask ] = idx & mask;
+                                ring.submit_q.array[ (tail + to_submit) & mask ] = idx & mask;
                                 to_submit++;
+                        }
                         // Increment the tail based on how many we are ready to submit
+                        __atomic_fetch_add(tail, to_submit, __ATOMIC_SEQ_CST);
+                        // update statistics
+                        #if !defined(__CFA_NO_STATISTICS__)
+                                ring.submit_q.stats.submit_avg.val += to_submit;
+                                ring.submit_q.stats.submit_avg.cnt += 1;
+                        #endif
+                }
+                        __atomic_fetch_add(ring.submit_q.tail, to_submit, __ATOMIC_SEQ_CST);
+                }
+                const uint32_t smask = *ring.submit_q.mask;
+                uint32_t shead = *ring.submit_q.head;
                 int ret = syscall( __NR_io_uring_enter, ring.fd, to_submit, waitcnt, IORING_ENTER_GETEVENTS, mask, _NSIG / 8);
                 if( ret < 0 ) {
 …
+                }
+                verify( (shead + ret) == *ring.submit_q.head );
+                // Release the consumed SQEs
+                for( i; ret ) {
+                        uint32_t idx = ring.submit_q.array[ (i + shead) & smask ];
+                        ring.submit_q.sqes[ idx ].user_data = 0;
+                }
+                uint32_t avail = 0;
+                uint32_t sqe_num = *ring.submit_q.num;
+                for(i; sqe_num) {
+                        if( ring.submit_q.sqes[ i ].user_data == 0 ) avail++;
+                }
+                // update statistics
+                #if !defined(__CFA_NO_STATISTICS__)
+                        ring.submit_q.stats.submit_avg.rdy += to_submit;
+                        ring.submit_q.stats.submit_avg.csm += ret;
+                        ring.submit_q.stats.submit_avg.avl += avail;
+                        ring.submit_q.stats.submit_avg.cnt += 1;
+                #endif
                 // Drain the queue
                 unsigned head = *ring.completion_q.head;
+                unsigned tail = __atomic_load_n(ring.completion_q.tail, __ATOMIC_ACQUIRE);
+                unsigned tail = *ring.completion_q.tail;
+                const uint32_t mask = *ring.completion_q.mask;
+                // Memory barrier
+                __atomic_thread_fence( __ATOMIC_SEQ_CST );
                 // Nothing was new return 0
 …
                 uint32_t count = tail - head;
                 for(i; count) {
                         unsigned idx = (head + i) & (*ring.completion_q.mask);
+                        unsigned idx = (head + i) & mask;
                         struct io_uring_cqe & cqe = ring.completion_q.cqes[idx];
 …
                 // Allow new submissions to happen
                 V(ring.submit, count);
+                // V(ring.submit, count);
                 // Mark to the kernel that the cqe has been seen
                 // Ensure that the kernel only sees the new value of the head index after the CQEs have been read.
+                __atomic_thread_fence( __ATOMIC_SEQ_CST );
                 __atomic_fetch_add( ring.completion_q.head, count, __ATOMIC_RELAXED );
 …
 //
+        static inline [* struct io_uring_sqe, uint32_t] __submit_alloc( struct __io_data & ring ) {
+                // Wait for a spot to be available
+                __attribute__((unused)) bool blocked = P(ring.submit);
+                #if !defined(__CFA_NO_STATISTICS__)
+                        __atomic_fetch_add( &ring.submit_q.stats.submit_avg.block, blocked ? 1ul64 : 0ul64, __ATOMIC_RELAXED );
+                #endif
+                // Allocate the sqe
+                uint32_t idx = __atomic_fetch_add(&ring.submit_q.alloc, 1ul32, __ATOMIC_SEQ_CST);
+                // Mask the idx now to allow make everything easier to check
+                idx &= *ring.submit_q.mask;
+                // Return the sqe
+                return [&ring.submit_q.sqes[ idx ], idx];
+        static inline [* struct io_uring_sqe, uint32_t] __submit_alloc( struct __io_data & ring, uint64_t data ) {
+                verify( data != 0 );
+                // Prepare the data we need
+                __attribute((unused)) int len   = 0;
+                __attribute((unused)) int block = 0;
+                uint32_t cnt = *ring.submit_q.num;
+                uint32_t mask = *ring.submit_q.mask;
+                uint32_t off = __tls_rand();
+                // Loop around looking for an available spot
+                LOOKING: for() {
+                        // Look through the list starting at some offset
+                        for(i; cnt) {
+                                uint64_t expected = 0;
+                                uint32_t idx = (i + off) & mask;
+                                struct io_uring_sqe * sqe = &ring.submit_q.sqes[idx];
+                                volatile uint64_t * udata = &sqe->user_data;
+                                if( *udata == expected &&
+                                        __atomic_compare_exchange_n( udata, &expected, data, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED ) )
+                                {
+                                        // update statistics
+                                        #if !defined(__CFA_NO_STATISTICS__)
+                                                __atomic_fetch_add( &ring.submit_q.stats.alloc_avg.val,   len,   __ATOMIC_RELAXED );
+                                                __atomic_fetch_add( &ring.submit_q.stats.alloc_avg.block, block, __ATOMIC_RELAXED );
+                                                __atomic_fetch_add( &ring.submit_q.stats.alloc_avg.cnt,   1,     __ATOMIC_RELAXED );
+                                        #endif
+                                        // Success return the data
+                                        return [sqe, idx];
+                                }
+                                verify(expected != data);
+                                len ++;
+                        }
+                        block++;
+                        yield();
+                }
+        }
 …
                         __attribute((unused)) int len   = 0;
                         __attribute((unused)) int block = 0;
-                        uint32_t expected = -1ul32;
                         uint32_t ready_mask = ring.submit_q.ready_cnt - 1;
                         uint32_t off = __tls_rand();
 …
                                 for(i; ring.submit_q.ready_cnt) {
                                         uint32_t ii = (i + off) & ready_mask;
+                                        uint32_t expected = -1ul32;
                                         if( __atomic_compare_exchange_n( &ring.submit_q.ready[ii], &expected, idx, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED ) ) {
                                                 break LOOKING;
+                                        }
+                                        verify(expected != idx);
                                         len ++;
 …
                         // update statistics
                         #if !defined(__CFA_NO_STATISTICS__)
                                 ring.submit_q.stats.submit_avg.val += 1;
+                                ring.submit_q.stats.submit_avg.csm += 1;
                                 ring.submit_q.stats.submit_avg.cnt += 1;
                         #endif
 …
         #define __submit_prelude \
+                struct __io_data & ring = *active_cluster()->io; \
+                io_user_data data = { 0, active_thread() }; \
+                struct __io_data & ring = *data.thrd->curr_cluster->io; \
                 struct io_uring_sqe * sqe; \
                 uint32_t idx; \
                 [sqe, idx] = __submit_alloc( ring );
+                [sqe, idx] = __submit_alloc( ring, (uint64_t)&data );
         #define __submit_wait \
-                io_user_data data = { 0, active_thread() }; \
                 /*__cfaabi_bits_print_safe( STDERR_FILENO, "Preparing user data %p for %p\n", &data, data.thrd );*/ \
                 sqe->user_data = (uint64_t)&data; \
+                verify( sqe->user_data == (uint64_t)&data ); \
                 __submit( ring, idx ); \
                 park( __cfaabi_dbg_ctx ); \

libcfa/src/concurrency/kernel.cfa

r2802824	r0e4df2e
648	648
649	649	// record activity
	650	__cfaabi_dbg_debug_do( char * old_caller = thrd->unpark_caller; )
650	651	__cfaabi_dbg_record_thrd( *thrd, false, caller );
651	652

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 0e4df2e for libcfa/src/concurrency

Legend:

libcfa/src/concurrency/io.cfa

libcfa/src/concurrency/kernel.cfa

Download in other formats: