Context Navigation

← Previous Change
Next Change →

Changeset 13d33a75 for libcfa/src/concurrency

Timestamp:

Aug 18, 2020, 4:31:19 PM (6 years ago)

Author:

Thierry Delisle <tdelisle@…>

Branches:

ADT, arm-eh, ast-experimental, enum, forall-pointer-decay, jacob/cs343-translation, master, new-ast, new-ast-unique-expr, pthread-emulation, qualifiedEnum, stuck-waitfor-destruct

Children:

Parents:

ef9988b (diff), f2384c9a (diff)
Note: this is a merge changeset, the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.

Message:

Merge branch 'master' into new-ast

Location:

libcfa/src/concurrency

Files:

: 13 edited

coroutine.cfa (modified) (1 diff)
invoke.c (modified) (2 diffs)
invoke.h (modified) (3 diffs)
io.cfa (modified) (21 diffs)
io/setup.cfa (modified) (5 diffs)
io/types.hfa (modified) (5 diffs)
iocall.cfa (modified) (19 diffs)
kernel.cfa (modified) (2 diffs)
kernel/fwd.hfa (modified) (2 diffs)
kernel/startup.cfa (modified) (5 diffs)
ready_queue.cfa (modified) (3 diffs)
stats.cfa (modified) (4 diffs)
stats.hfa (modified) (1 diff)

Legend:

: Unmodified
: Added
: Removed

libcfa/src/concurrency/coroutine.cfa

ref9988b	r13d33a75
215	215	return cor;
216	216	}
	217
	218	struct $coroutine * __cfactx_cor_active(void) {
	219	return active_coroutine();
	220	}
217	221	}
218	222

libcfa/src/concurrency/invoke.c

-              ref9988b
+              r13d33a75
 // Called from the kernel when starting a coroutine or task so must switch back to user mode.
+extern struct $coroutine * __cfactx_cor_active(void);
 extern struct $coroutine * __cfactx_cor_finish(void);
 extern void __cfactx_cor_leave ( struct $coroutine * );
 …
 extern void disable_interrupts() OPTIONAL_THREAD;
 extern void enable_interrupts( __cfaabi_dbg_ctx_param );
+struct exception_context_t * this_exception_context() { // Returns the address of the exception_context member of the stack that __get_stack yields for the coroutine returned by __cfactx_cor_active().
+        return &__get_stack( __cfactx_cor_active() )->exception_context;
+}
 void __cfactx_invoke_coroutine(

libcfa/src/concurrency/invoke.h

-              ref9988b
+              r13d33a75
 #ifndef _INVOKE_H_
 #define _INVOKE_H_
+        struct __cfaehm_try_resume_node;
+        struct __cfaehm_base_exception_t;
+        struct exception_context_t { // Exception-handling state; reached through __get_stack(...)->exception_context in this_exception_context().
+                struct __cfaehm_try_resume_node * top_resume; // presumably the head of the try-resume node chain — TODO confirm
+                struct __cfaehm_base_exception_t * current_exception; // presumably the exception currently being handled — TODO confirm
+        };
         struct __stack_context_t {
 …
                 // base of stack
                 void * base;
+                // Information for exception handling.
+                struct exception_context_t exception_context;
         };
 …
         };
+        static inline struct __stack_t * __get_stack( struct $coroutine * cor ) { return (struct __stack_t*)(((uintptr_t)cor->stack.storage) & ((uintptr_t)-2)); }
+        static inline struct __stack_t * __get_stack( struct $coroutine * cor ) {
+                return (struct __stack_t*)(((uintptr_t)cor->stack.storage) & ((uintptr_t)-2));
+        }
+        struct exception_context_t * this_exception_context();
         // struct which calls the monitor is accepting

libcfa/src/concurrency/io.cfa

-              ref9988b
+              r13d33a75
         #include "kernel/fwd.hfa"
         #include "io/types.hfa"
+        // Returns true if acquired as leader or as next (second) leader; returns false (interrupts re-enabled) if a next leader is already recorded. On a true return interrupts are left disabled.
+        static inline bool try_lock( __leaderlock_t & this ) {
+                const uintptr_t thrd = 1z | (uintptr_t)active_thread(); // current thread pointer with the low bit set
+                bool block; // set to true when we registered as next leader and must park
+                disable_interrupts();
+                for() {
+                        struct $thread * expected = this.value;
+                        if( 1p != expected && 0p != expected ) { // value is neither 0p (unlocked) nor 1p (locked, no successor)
+                                /* paranoid */ verify( thrd != (uintptr_t)expected ); // We better not already be the next leader
+                                enable_interrupts( __cfaabi_dbg_ctx );
+                                return false;
+                        }
+                        struct $thread * desired;
+                        if( 0p == expected ) {
+                                // If the lock isn't locked acquire it, no need to block
+                                desired = 1p;
+                                block = false;
+                        }
+                        else {
+                                // If the lock is already locked try becoming the next leader
+                                desired = (struct $thread *)thrd;
+                                block = true;
+                        }
+                        if( __atomic_compare_exchange_n(&this.value, &expected, desired, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) ) break; // on failure, loop and re-read this.value
+                }
+                if( block ) {
+                        enable_interrupts( __cfaabi_dbg_ctx );
+                        park( __cfaabi_dbg_ctx ); // presumably woken by the unpark in next() — TODO confirm
+                        disable_interrupts();
+                }
+                return true; // interrupts are disabled on every path that reaches here
+        }
+        static inline bool next( __leaderlock_t & this ) { // Releases leadership: unparks the recorded next leader and returns true, or unlocks and returns false; interrupts are re-enabled on both paths.
+                /* paranoid */ verify( ! kernelTLS.preemption_state.enabled ); // must be entered with preemption disabled
+                struct $thread * nextt; // thread to wake, or 0p if there is none
+                for() {
+                        struct $thread * expected = this.value;
+                        /* paranoid */ verify( (1 & (uintptr_t)expected) == 1 ); // The lock better be locked
+                        struct $thread * desired;
+                        if( 1p == expected ) {
+                                // No next leader, just unlock
+                                desired = 0p;
+                                nextt   = 0p;
+                        }
+                        else {
+                                // There is a next leader, remove but keep locked
+                                desired = 1p;
+                                nextt   = (struct $thread *)(~1z & (uintptr_t)expected); // clear the low bit to recover the thread pointer
+                        }
+                        if( __atomic_compare_exchange_n(&this.value, &expected, desired, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) ) break; // on failure, loop and re-read this.value
+                }
+                if(nextt) {
+                        unpark( nextt __cfaabi_dbg_ctx2 );
+                        enable_interrupts( __cfaabi_dbg_ctx );
+                        return true;
+                }
+                enable_interrupts( __cfaabi_dbg_ctx );
+                return false;
+        }
 //=============================================================================================
 …
 //=============================================================================================
         static unsigned __collect_submitions( struct __io_data & ring );
         static uint32_t __release_consumed_submission( struct __io_data & ring );
+        static __u32 __release_consumed_submission( struct __io_data & ring );
         static inline void process(struct io_uring_cqe & cqe ) {
 …
                 data->result = cqe.res;
                 unpark( data->thrd __cfaabi_dbg_ctx2 );
+                post( data->sem );
+        }
 …
                 unsigned head = *ring.completion_q.head;
                 unsigned tail = *ring.completion_q.tail;
                 const uint32_t mask = *ring.completion_q.mask;
+                const __u32 mask = *ring.completion_q.mask;
                 // Nothing was new return 0
 …
+                }
                 uint32_t count = tail - head;
+                __u32 count = tail - head;
                 /* paranoid */ verify( count != 0 );
                 for(i; count) {
 …
                                 __STATS__( true,
                                         io.complete_q.completed_avg.val += count;
                                         io.complete_q.completed_avg.fast_cnt += 1;
+                                        io.complete_q.completed_avg.cnt += 1;
+                                )
                         enable_interrupts( __cfaabi_dbg_ctx );
 …
                         // We didn't get anything, so baton pass to the slow poller
                         else {
+                                __STATS__( false,
+                                        io.complete_q.blocks += 1;
+                                )
                                 __cfadbg_print_safe(io_core, "Kernel I/O : Parking io poller %p\n", &this.self);
                                 reset = 0;
 …
 //
         [* struct io_uring_sqe, uint32_t] __submit_alloc( struct __io_data & ring, uint64_t data ) {
+        [* struct io_uring_sqe, __u32] __submit_alloc( struct __io_data & ring, __u64 data ) {
                 /* paranoid */ verify( data != 0 );
 …
                 __attribute((unused)) int len   = 0;
                 __attribute((unused)) int block = 0;
                 uint32_t cnt = *ring.submit_q.num;
                 uint32_t mask = *ring.submit_q.mask;
+                __u32 cnt = *ring.submit_q.num;
+                __u32 mask = *ring.submit_q.mask;
                 disable_interrupts();
                         uint32_t off = __tls_rand();
+                        __u32 off = __tls_rand();
                 enable_interrupts( __cfaabi_dbg_ctx );
 …
                         // Look through the list starting at some offset
                         for(i; cnt) {
                                 uint64_t expected = 0;
                                 uint32_t idx = (i + off) & mask;
+                                __u64 expected = 0;
+                                __u32 idx = (i + off) & mask;
                                 struct io_uring_sqe * sqe = &ring.submit_q.sqes[idx];
                                 volatile uint64_t * udata = (volatile uint64_t *)&sqe->user_data;
+                                volatile __u64 * udata = &sqe->user_data;
                                 if( *udata == expected &&
 …
+        }
         static inline uint32_t __submit_to_ready_array( struct __io_data & ring, uint32_t idx, const uint32_t mask ) {
+        static inline __u32 __submit_to_ready_array( struct __io_data & ring, __u32 idx, const __u32 mask ) {
                 /* paranoid */ verify( idx <= mask   );
                 /* paranoid */ verify( idx != -1ul32 );
 …
                 __attribute((unused)) int len   = 0;
                 __attribute((unused)) int block = 0;
                 uint32_t ready_mask = ring.submit_q.ready_cnt - 1;
+                __u32 ready_mask = ring.submit_q.ready_cnt - 1;
                 disable_interrupts();
                         uint32_t off = __tls_rand();
+                        __u32 off = __tls_rand();
                 enable_interrupts( __cfaabi_dbg_ctx );
                 uint32_t picked;
+                __u32 picked;
                 LOOKING: for() {
                         for(i; ring.submit_q.ready_cnt) {
                                 picked = (i + off) & ready_mask;
                                 uint32_t expected = -1ul32;
+                                __u32 expected = -1ul32;
                                 if( __atomic_compare_exchange_n( &ring.submit_q.ready[picked], &expected, idx, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED ) ) {
                                         break LOOKING;
 …
                         block++;
+                        if( try_lock(ring.submit_q.lock __cfaabi_dbg_ctx2) ) {
+                                __release_consumed_submission( ring );
+                                unlock( ring.submit_q.lock );
+                        }
+                        else {
+                        __u32 released = __release_consumed_submission( ring );
+                        if( released == 0 ) {
                                 yield();
+                        }
 …
+        }
         void __submit( struct io_context * ctx, uint32_t idx ) __attribute__((nonnull (1))) {
+        void __submit( struct io_context * ctx, __u32 idx ) __attribute__((nonnull (1))) {
                 __io_data & ring = *ctx->thrd.ring;
                 // Get now the data we definitely need
                 volatile uint32_t * const tail = ring.submit_q.tail;
                 const uint32_t mask  = *ring.submit_q.mask;
+                volatile __u32 * const tail = ring.submit_q.tail;
+                const __u32 mask  = *ring.submit_q.mask;
                 // There are 2 submission schemes, check which one we are using
 …
+                }
                 else if( ring.eager_submits ) {
+                        uint32_t picked = __submit_to_ready_array( ring, idx, mask );
+                        for() {
+                                yield();
+                                // If some one else collected our index, we are done
+                                #warning ABA problem
+                                if( ring.submit_q.ready[picked] != idx ) {
+                        __u32 picked = __submit_to_ready_array( ring, idx, mask );
+                        #if defined(LEADER_LOCK)
+                                if( !try_lock(ring.submit_q.submit_lock) ) {
                                         __STATS__( false,
                                                 io.submit_q.helped += 1;
 …
                                         return;
+                                }
+                                if( try_lock(ring.submit_q.lock __cfaabi_dbg_ctx2) ) {
+                                /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+                                __STATS__( true,
+                                        io.submit_q.leader += 1;
+                                )
+                        #else
+                                for() {
+                                        yield();
+                                        if( try_lock(ring.submit_q.submit_lock __cfaabi_dbg_ctx2) ) {
+                                                __STATS__( false,
+                                                        io.submit_q.leader += 1;
+                                                )
+                                                break;
+                                        }
+                                        // If some one else collected our index, we are done
+                                        #warning ABA problem
+                                        if( ring.submit_q.ready[picked] != idx ) {
+                                                __STATS__( false,
+                                                        io.submit_q.helped += 1;
+                                                )
+                                                return;
+                                        }
                                         __STATS__( false,
                                                 io.submit_q.leader += 1;
+                                                io.submit_q.busy += 1;
+                                        )
+                                        break;
+                                }
+                                __STATS__( false,
+                                        io.submit_q.busy += 1;
+                                )
+                        }
+                                }
+                        #endif
                         // We got the lock
+                        // Collect the submissions
                         unsigned to_submit = __collect_submitions( ring );
+                        // Actually submit
                         int ret = __io_uring_enter( ring, to_submit, false );
+                        if( ret < 0 ) {
+                                unlock(ring.submit_q.lock);
+                                return;
+                        }
+                        /* paranoid */ verify( ret > 0 || to_submit == 0 || (ring.ring_flags & IORING_SETUP_SQPOLL) );
+                        #if defined(LEADER_LOCK)
+                                /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+                                next(ring.submit_q.submit_lock);
+                        #else
+                                unlock(ring.submit_q.submit_lock);
+                        #endif
+                        if( ret < 0 ) return;
                         // Release the consumed SQEs
 …
                         // update statistics
                         __STATS__( true,
+                        __STATS__( false,
                                 io.submit_q.submit_avg.rdy += to_submit;
                                 io.submit_q.submit_avg.csm += ret;
                                 io.submit_q.submit_avg.cnt += 1;
+                        )
-                        unlock(ring.submit_q.lock);
+                }
                 else {
                         // get mutual exclusion
+                        lock(ring.submit_q.lock __cfaabi_dbg_ctx2);
+                        #if defined(LEADER_LOCK)
+                                while(!try_lock(ring.submit_q.submit_lock));
+                        #else
+                                lock(ring.submit_q.submit_lock __cfaabi_dbg_ctx2);
+                        #endif
                         /* paranoid */ verifyf( ring.submit_q.sqes[ idx ].user_data != 0,
 …
                         __release_consumed_submission( ring );
+                        unlock(ring.submit_q.lock);
+                        #if defined(LEADER_LOCK)
+                                next(ring.submit_q.submit_lock);
+                        #else
+                                unlock(ring.submit_q.submit_lock);
+                        #endif
                         __cfadbg_print_safe( io, "Kernel I/O : Performed io_submit for %p, returned %d\n", active_thread(), ret );
 …
+        }
+        // #define PARTIAL_SUBMIT 32
         static unsigned __collect_submitions( struct __io_data & ring ) {
                 /* paranoid */ verify( ring.submit_q.ready != 0p );
 …
                 unsigned to_submit = 0;
+                uint32_t tail = *ring.submit_q.tail;
+                const uint32_t mask = *ring.submit_q.mask;
+                __u32 tail = *ring.submit_q.tail;
+                const __u32 mask = *ring.submit_q.mask;
+                #if defined(PARTIAL_SUBMIT)
+                        #if defined(LEADER_LOCK)
+                                #error PARTIAL_SUBMIT and LEADER_LOCK cannot co-exist
+                        #endif
+                        const __u32 cnt = ring.submit_q.ready_cnt > PARTIAL_SUBMIT ? PARTIAL_SUBMIT : ring.submit_q.ready_cnt;
+                        const __u32 offset = ring.submit_q.prev_ready;
+                        ring.submit_q.prev_ready += cnt;
+                #else
+                        const __u32 cnt = ring.submit_q.ready_cnt;
+                        const __u32 offset = 0;
+                #endif
                 // Go through the list of ready submissions
+                for( i; ring.submit_q.ready_cnt ) {
+                for( c; cnt ) {
+                        __u32 i = (offset + c) % ring.submit_q.ready_cnt;
                         // replace any submission with the sentinel, to consume it.
                         uint32_t idx = __atomic_exchange_n( &ring.submit_q.ready[i], -1ul32, __ATOMIC_RELAXED);
+                        __u32 idx = __atomic_exchange_n( &ring.submit_q.ready[i], -1ul32, __ATOMIC_RELAXED);
                         // If it was already the sentinel, then we are done
 …
+        }
         static uint32_t __release_consumed_submission( struct __io_data & ring ) {
                 const uint32_t smask = *ring.submit_q.mask;
+        static __u32 __release_consumed_submission( struct __io_data & ring ) {
+                const __u32 smask = *ring.submit_q.mask;
                 if( !try_lock(ring.submit_q.release_lock __cfaabi_dbg_ctx2) ) return 0;
                 uint32_t chead = *ring.submit_q.head;
                 uint32_t phead = ring.submit_q.prev_head;
+                __u32 chead = *ring.submit_q.head;
+                __u32 phead = ring.submit_q.prev_head;
                 ring.submit_q.prev_head = chead;
                 unlock(ring.submit_q.release_lock);
                 uint32_t count = chead - phead;
+                __u32 count = chead - phead;
                 for( i; count ) {
                         uint32_t idx = ring.submit_q.array[ (phead + i) & smask ];
+                        __u32 idx = ring.submit_q.array[ (phead + i) & smask ];
                         ring.submit_q.sqes[ idx ].user_data = 0;
+                }

libcfa/src/concurrency/io/setup.cfa

-              ref9988b
+              r13d33a75
                 if( params_in.poll_complete ) params.flags |= IORING_SETUP_IOPOLL;
+                uint32_t nentries = params_in.num_entries;
+                __u32 nentries = params_in.num_entries != 0 ? params_in.num_entries : 256;
+                if( !is_pow2(nentries) ) {
+                        abort("ERROR: I/O setup 'num_entries' must be a power of 2\n");
+                }
+                if( params_in.poller_submits && params_in.eager_submits ) {
+                        abort("ERROR: I/O setup 'poller_submits' and 'eager_submits' cannot be used together\n");
+                }
                 int fd = syscall(__NR_io_uring_setup, nentries, &params );
 …
                 // Get the pointers from the kernel to fill the structure
                 // submit queue
                 sq.head    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.head);
                 sq.tail    = (volatile uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.tail);
                 sq.mask    = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_mask);
                 sq.num     = (   const uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_entries);
                 sq.flags   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.flags);
                 sq.dropped = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.dropped);
                 sq.array   = (         uint32_t *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);
+                sq.head    = (volatile __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.head);
+                sq.tail    = (volatile __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.tail);
+                sq.mask    = (   const __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_mask);
+                sq.num     = (   const __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_entries);
+                sq.flags   = (         __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.flags);
+                sq.dropped = (         __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.dropped);
+                sq.array   = (         __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);
                 sq.prev_head = *sq.head;
+                {
                         const uint32_t num = *sq.num;
+                        const __u32 num = *sq.num;
                         for( i; num ) {
                                 sq.sqes[i].user_data = 0ul64;
 …
+                }
                 (sq.lock){};
+                (sq.submit_lock){};
                 (sq.release_lock){};
 …
                                 sq.ready[i] = -1ul32;
+                        }
+                        sq.prev_ready = 0;
+                }
                 else {
                         sq.ready_cnt = 0;
                         sq.ready = 0p;
+                        sq.prev_ready = 0;
+                }
                 // completion queue
                 cq.head     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.head);
                 cq.tail     = (volatile uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.tail);
                 cq.mask     = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_mask);
                 cq.num      = (   const uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_entries);
                 cq.overflow = (         uint32_t *)(((intptr_t)cq.ring_ptr) + params.cq_off.overflow);
                 cq.cqes   = (struct io_uring_cqe *)(((intptr_t)cq.ring_ptr) + params.cq_off.cqes);
+                cq.head      = (volatile __u32 *)(((intptr_t)cq.ring_ptr) + params.cq_off.head);
+                cq.tail      = (volatile __u32 *)(((intptr_t)cq.ring_ptr) + params.cq_off.tail);
+                cq.mask      = (   const __u32 *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_mask);
+                cq.num       = (   const __u32 *)(((intptr_t)cq.ring_ptr) + params.cq_off.ring_entries);
+                cq.overflow  = (         __u32 *)(((intptr_t)cq.ring_ptr) + params.cq_off.overflow);
+                cq.cqes = (struct io_uring_cqe *)(((intptr_t)cq.ring_ptr) + params.cq_off.cqes);
                 // some paranoid checks
 …
         void __ioctx_register($io_ctx_thread & ctx, struct epoll_event & ev) {
                 ev.events = EPOLLIN | EPOLLONESHOT;
                 ev.data.u64 = (uint64_t)&ctx;
+                ev.data.u64 = (__u64)&ctx;
                 int ret = epoll_ctl(iopoll.epollfd, EPOLL_CTL_ADD, ctx.ring->fd, &ev);
                 if (ret < 0) {

libcfa/src/concurrency/io/types.hfa

-              ref9988b
+              r13d33a75
 #if defined(CFA_HAVE_LINUX_IO_URING_H)
+        extern "C" {
+                #include <linux/types.h>
+        }
       #include "bits/locks.hfa"
+        #define LEADER_LOCK
+        struct __leaderlock_t {
+                struct $thread * volatile value;        // ($thread) next_leader | (bool:1) is_locked
+        };
+        static inline void ?{}( __leaderlock_t & this ) { this.value = 0p; }
         //-----------------------------------------------------------------------
 …
       struct __submition_data {
                 // Head and tail of the ring (associated with array)
                 volatile uint32_t * head;
                 volatile uint32_t * tail;
                 volatile uint32_t prev_head;
+                volatile __u32 * head;
+                volatile __u32 * tail;
+                volatile __u32 prev_head;
                 // The actual kernel ring which uses head/tail
                 // indexes into the sqes arrays
                 uint32_t * array;
+                __u32 * array;
                 // number of entries and mask to go with it
                 const uint32_t * num;
                 const uint32_t * mask;
+                const __u32 * num;
+                const __u32 * mask;
                 // Submission flags (Not sure what for)
                 uint32_t * flags;
+                __u32 * flags;
                 // number of sqes not submitted (whatever that means)
                 uint32_t * dropped;
+                __u32 * dropped;
                 // Like head/tail but not seen by the kernel
+                volatile uint32_t * ready;
+                uint32_t ready_cnt;
+                volatile __u32 * ready;
+                __u32 ready_cnt;
+                __u32 prev_ready;
+                __spinlock_t lock;
+                __spinlock_t release_lock;
+                #if defined(LEADER_LOCK)
+                        __leaderlock_t submit_lock;
+                #else
+                        __spinlock_t submit_lock;
+                #endif
+                __spinlock_t  release_lock;
                 // A buffer of sqes (not the actual ring)
 …
         struct __completion_data {
                 // Head and tail of the ring
                 volatile uint32_t * head;
                 volatile uint32_t * tail;
+                volatile __u32 * head;
+                volatile __u32 * tail;
                 // number of entries and mask to go with it
                 const uint32_t * mask;
                 const uint32_t * num;
+                const __u32 * mask;
+                const __u32 * num;
                 // number of cqes not submitted (whatever that means)
                 uint32_t * overflow;
+                __u32 * overflow;
                 // the kernel ring
 …
                 struct __submition_data submit_q;
                 struct __completion_data completion_q;
                 uint32_t ring_flags;
+                __u32 ring_flags;
                 int fd;
                 bool eager_submits:1;
 …
         // IO user data
         struct __io_user_data_t {
                 int32_t result;
                 $thread * thrd;
+                __s32 result;
+                oneshot sem;
         };

libcfa/src/concurrency/iocall.cfa

-              ref9988b
+              r13d33a75
         #include "io/types.hfa"
         extern [* struct io_uring_sqe, uint32_t] __submit_alloc( struct __io_data & ring, uint64_t data );
         extern void __submit( struct io_context * ctx, uint32_t idx ) __attribute__((nonnull (1)));
         static inline void ?{}(struct io_uring_sqe & this, uint8_t opcode, int fd) {
+        extern [* struct io_uring_sqe, __u32] __submit_alloc( struct __io_data & ring, __u64 data );
+        extern void __submit( struct io_context * ctx, __u32 idx ) __attribute__((nonnull (1)));
+        static inline void ?{}(struct io_uring_sqe & this, __u8 opcode, int fd) {
                 this.opcode = opcode;
                 #if !defined(IOSQE_ASYNC)
 …
+        }
         static inline void ?{}(struct io_uring_sqe & this, uint8_t opcode, int fd, void * addr, uint32_t len, uint64_t off ) {
+        static inline void ?{}(struct io_uring_sqe & this, __u8 opcode, int fd, void * addr, __u32 len, __u64 off ) {
                 (this){ opcode, fd };
                 this.off = off;
                 this.addr = (uint64_t)(uintptr_t)addr;
+                this.addr = (__u64)(uintptr_t)addr;
                 this.len = len;
+        }
 …
         #endif
         #define __submit_prelude \
                 if( 0 != (submit_flags & LINK_FLAGS) ) { errno = ENOTSUP; return -1; } \
                 (void)timeout; (void)cancellation; \
                 if( !context ) context = __get_io_context(); \
                 __io_user_data_t data = { 0, active_thread() }; \
+                __io_user_data_t data = { 0 }; \
                 struct __io_data & ring = *context->thrd.ring; \
                 struct io_uring_sqe * sqe; \
+                uint32_t idx; \
+                [sqe, idx] = __submit_alloc( ring, (uint64_t)(uintptr_t)&data ); \
+                sqe->flags = REGULAR_FLAGS & submit_flags;
+                __u32 idx; \
+                __u8 sflags = REGULAR_FLAGS & submit_flags; \
+                [sqe, idx] = __submit_alloc( ring, (__u64)(uintptr_t)&data ); \
+                sqe->flags = sflags;
         #define __submit_wait \
                 /*__cfaabi_bits_print_safe( STDERR_FILENO, "Preparing user data %p for %p\n", &data, data.thrd );*/ \
                 verify( sqe->user_data == (uint64_t)(uintptr_t)&data ); \
+                verify( sqe->user_data == (__u64)(uintptr_t)&data ); \
                 __submit( context, idx ); \
                 park( __cfaabi_dbg_ctx ); \
+                wait( data.sem ); \
                 if( data.result < 0 ) { \
                         errno = -data.result; \
 …
         extern int fsync(int fd);
+        extern int sync_file_range(int fd, int64_t offset, int64_t nbytes, unsigned int flags);
+        typedef __off64_t off_t;
+        typedef __off64_t off64_t;
+        extern int sync_file_range(int fd, off64_t offset, off64_t nbytes, unsigned int flags);
         struct msghdr;
 …
         extern int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
         extern int fallocate(int fd, int mode, uint64_t offset, uint64_t len);
         extern int posix_fadvise(int fd, uint64_t offset, uint64_t len, int advice);
+        extern int fallocate(int fd, int mode, off_t offset, off_t len);
+        extern int posix_fadvise(int fd, off_t offset, off_t len, int advice);
         extern int madvise(void *addr, size_t length, int advice);
 …
                         __submit_prelude
+                        (*sqe){ IORING_OP_READV, fd, iov, iovcnt, offset };
+                        sqe->opcode = IORING_OP_READV;
+                        sqe->ioprio = 0;
+                        sqe->fd = fd;
+                        sqe->off = offset;
+                        sqe->addr = (__u64)iov;
+                        sqe->len = iovcnt;
+                        sqe->rw_flags = 0;
+                        sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
                         __submit_wait
 …
                         __submit_prelude
+                        (*sqe){ IORING_OP_WRITEV, fd, iov, iovcnt, offset };
+                        sqe->opcode = IORING_OP_WRITEV;
+                        sqe->ioprio = 0;
+                        sqe->fd = fd;
+                        sqe->off = offset;
+                        sqe->addr = (__u64)iov;
+                        sqe->len = iovcnt;
+                        sqe->rw_flags = 0;
+                        sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
                         __submit_wait
 …
                 __submit_prelude
+                (*sqe){ IORING_OP_FSYNC, fd };
+                __submit_wait
+        #endif
+}
+int cfa_sync_file_range(int fd, int64_t offset, int64_t nbytes, unsigned int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
+                sqe->opcode = IORING_OP_FSYNC;
+                sqe->ioprio = 0;
+                sqe->fd = fd;
+                sqe->off = 0;
+                sqe->addr = 0;
+                sqe->len = 0;
+                sqe->rw_flags = 0;
+                sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
+                __submit_wait
+        #endif
+}
+int cfa_sync_file_range(int fd, off64_t offset, off64_t nbytes, unsigned int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
         #if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_SYNC_FILE_RANGE)
                 return sync_file_range(fd, offset, nbytes, flags);
 …
                 (*sqe){ IORING_OP_SEND, sockfd };
                 sqe->addr = (uint64_t)buf;
+                sqe->addr = (__u64)buf;
                 sqe->len = len;
                 sqe->msg_flags = flags;
 …
                 (*sqe){ IORING_OP_RECV, sockfd };
                 sqe->addr = (uint64_t)buf;
+                sqe->addr = (__u64)buf;
                 sqe->len = len;
                 sqe->msg_flags = flags;
 …
                 (*sqe){ IORING_OP_ACCEPT, sockfd };
                 sqe->addr = (uint64_t)(uintptr_t)addr;
                 sqe->addr2 = (uint64_t)(uintptr_t)addrlen;
+                sqe->addr  = (__u64)addr;
+                sqe->addr2 = (__u64)addrlen;
                 sqe->accept_flags = flags;
 …
                 (*sqe){ IORING_OP_CONNECT, sockfd };
                 sqe->addr = (uint64_t)(uintptr_t)addr;
                 sqe->off  = (uint64_t)(uintptr_t)addrlen;
                 __submit_wait
         #endif
+}
 int cfa_fallocate(int fd, int mode, uint64_t offset, uint64_t len, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
+                sqe->addr = (__u64)addr;
+                sqe->off  = (__u64)addrlen;
+                __submit_wait
+        #endif
+}
+int cfa_fallocate(int fd, int mode, off_t offset, off_t len, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
         #if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_FALLOCATE)
                 return fallocate( fd, mode, offset, len );
 …
+}
 int cfa_fadvise(int fd, uint64_t offset, uint64_t len, int advice, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
+int cfa_fadvise(int fd, off_t offset, off_t len, int advice, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {
         #if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_FADVISE)
                 return posix_fadvise( fd, offset, len, advice );
 …
                 (*sqe){ IORING_OP_FADVISE, fd };
                 sqe->off = (uint64_t)offset;
+                sqe->off = (__u64)offset;
                 sqe->len = len;
                 sqe->fadvise_advice = advice;
 …
                 (*sqe){ IORING_OP_MADVISE, 0 };
                 sqe->addr = (uint64_t)addr;
+                sqe->addr = (__u64)addr;
                 sqe->len = length;
                 sqe->fadvise_advice = advice;
 …
                 (*sqe){ IORING_OP_OPENAT, dirfd };
                 sqe->addr = (uint64_t)pathname;
+                sqe->addr = (__u64)pathname;
                 sqe->open_flags = flags;
                 sqe->len = mode;
 …
                 __submit_prelude
                 (*sqe){ IORING_OP_STATX, dirfd, pathname, mask, (uint64_t)statxbuf };
+                (*sqe){ IORING_OP_STATX, dirfd, pathname, mask, (__u64)statxbuf };
                 sqe->statx_flags = flags;
 …
+                }
                 else {
                         sqe->off = (uint64_t)-1;
+                        sqe->off = (__u64)-1;
+                }
                 sqe->len = len;
 …
+                }
                 else {
                         sqe->splice_off_in = (uint64_t)-1;
+                        sqe->splice_off_in = (__u64)-1;
+                }
                 sqe->splice_flags  = flags | (SPLICE_FLAGS & submit_flags);

libcfa/src/concurrency/kernel.cfa

-              ref9988b
+              r13d33a75
         // Do it here
         kernelTLS.rand_seed ^= rdtscl();
+        kernelTLS.ready_rng.fwd_seed = 25214903917_l64u * (rdtscl() ^ (uintptr_t)&runner);
+        __tls_rand_advance_bck();
         processor * this = runner.proc;
 …
                 unsigned total   = this.total;
                 processor * proc = &this.list`first;
                 // Thread fence is unnecessary, but gcc-8 and older incorrectly reorder code without it
                 __atomic_thread_fence(__ATOMIC_SEQ_CST);
+                // Compiler fence is unnecessary, but gcc-8 and older incorrectly reorder code without it
+                asm volatile("": : :"memory");
                 if(l != __atomic_load_n(&this.lock, __ATOMIC_SEQ_CST)) { Pause(); continue; }
                 return [idle, total, proc];

libcfa/src/concurrency/kernel/fwd.hfa

-              ref9988b
+              r13d33a75
                                 uint64_t rand_seed;
                         #endif
+                        struct {
+                                uint64_t fwd_seed;
+                                uint64_t bck_seed;
+                        } ready_rng;
                 } kernelTLS __attribute__ ((tls_model ( "initial-exec" )));
                 static inline uint64_t __tls_rand() {
 …
                                 return __xorshift64( kernelTLS.rand_seed );
                         #endif
+                }
+                #define M  (1_l64u << 48_l64u) // modulus of the generator: 2^48
+                #define A  (25214903917_l64u) // multiplier applied by the forward step
+                #define AI (18446708753438544741_l64u) // multiplier applied by the backward step; presumably the inverse of A modulo M -- TODO confirm
+                #define C  (11_l64u) // additive increment of the forward step
+                #define D  (16_l64u) // number of low-order seed bits dropped from every returned value
+                static inline unsigned __tls_rand_fwd() { // Advance the thread-local forward seed by one step and return the value derived from the new seed.
+                        kernelTLS.ready_rng.fwd_seed = (A * kernelTLS.ready_rng.fwd_seed + C) & (M - 1); // seed = (A * seed + C) mod M; the mask is a valid modulo because M is a power of two
+                        return kernelTLS.ready_rng.fwd_seed >> D; // discard the low D bits of the updated seed
+                }
+                static inline unsigned __tls_rand_bck() { // Return the value for the current backward seed, then rewind that seed by one step.
+                        unsigned int r = kernelTLS.ready_rng.bck_seed >> D; // computed BEFORE rewinding, so a seed copied from fwd_seed reproduces the latest forward value first
+                        kernelTLS.ready_rng.bck_seed = AI * (kernelTLS.ready_rng.bck_seed - C) & (M - 1); // undo one forward step: '*' binds tighter than '&', i.e. (AI * (seed - C)) mod M
+                        return r;
+                }
+                #undef M // the short constant names are only needed by the two functions above
+                #undef A
+                #undef AI
+                #undef C
+                #undef D
+                static inline void __tls_rand_advance_bck(void) { // Re-synchronise the backward generator with the forward generator.
+                        kernelTLS.ready_rng.bck_seed = kernelTLS.ready_rng.fwd_seed; // later __tls_rand_bck() calls start rewinding from the current forward position
+                }
+        }

libcfa/src/concurrency/kernel/startup.cfa

-              ref9988b
+              r13d33a75
 static void ?{}(processorCtx_t & this, processor * proc, current_stack_info_t * info);
+#if defined(__CFA_WITH_VERIFY__)
+        static bool verify_fwd_bck_rng(void);
+#endif
 //-----------------------------------------------------------------------------
 // Forward Declarations for other modules
 …
         __cfa_dbg_global_clusters.list{ __get };
         __cfa_dbg_global_clusters.lock{};
+        /* paranoid */ verify( verify_fwd_bck_rng() );
         // Initialize the global scheduler lock
 …
         ( this.terminated ){ 0 };
         ( this.runner ){};
+        init( this, name, _cltr );
+        disable_interrupts();
+                init( this, name, _cltr );
+        enable_interrupts( __cfaabi_dbg_ctx );
         __cfadbg_print_safe(runtime_core, "Kernel : Starting core %p\n", &this);
 …
         free( this.stack );
+        deinit( this );
+        disable_interrupts();
+                deinit( this );
+        enable_interrupts( __cfaabi_dbg_ctx );
+}
 …
         return stack;
+}
+#if defined(__CFA_WITH_VERIFY__)
+static bool verify_fwd_bck_rng(void) { // Self-check: returns true when __tls_rand_bck() replays the outputs of __tls_rand_fwd() in reverse order.
+        kernelTLS.ready_rng.fwd_seed = 25214903917_l64u * (rdtscl() ^ (uintptr_t)&verify_fwd_bck_rng); // seed from rdtscl() xor-ed with this function's address, so the start point is not a fixed constant
+        unsigned values[10]; // forward outputs, stored in generation order
+        for(i; 10) {
+                values[i] = __tls_rand_fwd();
+        }
+        __tls_rand_advance_bck(); // copy the forward seed into the backward seed before rewinding
+        for ( i; 9 -~= 0 ) { // NOTE(review): Cforall descending range, presumably i = 9 down to 0 inclusive -- confirm
+                if(values[i] != __tls_rand_bck()) { // each backward output must equal the forward output recorded at the same index
+                        return false;
+                }
+        }
+        return true; // no mismatch was found
+}
+#endif

libcfa/src/concurrency/ready_queue.cfa

-              ref9988b
+              r13d33a75
 //  queues or removing them.
 uint_fast32_t ready_mutate_lock( void ) with(*__scheduler_lock) {
+        /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
         // Step 1 : lock global lock
         // It is needed to avoid processors that register mid Critical-Section
 …
+        }
+        /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
         return s;
+}
 void ready_mutate_unlock( uint_fast32_t last_s ) with(*__scheduler_lock) {
+        /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
         // Step 1 : release local locks
         // This must be done while the global lock is held to avoid
 …
         /*paranoid*/ assert(true == lock);
         __atomic_store_n(&lock, (bool)false, __ATOMIC_RELEASE);
+        /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+}

libcfa/src/concurrency/stats.cfa

-              ref9988b
+              r13d33a75
                         stats->io.submit_q.busy   = 0;
                         stats->io.complete_q.completed_avg.val = 0;
                         stats->io.complete_q.completed_avg.slow_cnt = 0;
                         stats->io.complete_q.completed_avg.fast_cnt = 0;
+                        stats->io.complete_q.completed_avg.cnt = 0;
+                        stats->io.complete_q.blocks = 0;
                 #endif
+        }
 …
                 #if defined(CFA_HAVE_LINUX_IO_URING_H)
                         __atomic_fetch_add( &cltr->io.submit_q.submit_avg.rdy          , proc->io.submit_q.submit_avg.rdy          , __ATOMIC_SEQ_CST );
                         __atomic_fetch_add( &cltr->io.submit_q.submit_avg.csm          , proc->io.submit_q.submit_avg.csm          , __ATOMIC_SEQ_CST );
                         __atomic_fetch_add( &cltr->io.submit_q.submit_avg.avl          , proc->io.submit_q.submit_avg.avl          , __ATOMIC_SEQ_CST );
                         __atomic_fetch_add( &cltr->io.submit_q.submit_avg.cnt          , proc->io.submit_q.submit_avg.cnt          , __ATOMIC_SEQ_CST );
                         __atomic_fetch_add( &cltr->io.submit_q.look_avg.val            , proc->io.submit_q.look_avg.val            , __ATOMIC_SEQ_CST );
                         __atomic_fetch_add( &cltr->io.submit_q.look_avg.cnt            , proc->io.submit_q.look_avg.cnt            , __ATOMIC_SEQ_CST );
                         __atomic_fetch_add( &cltr->io.submit_q.look_avg.block          , proc->io.submit_q.look_avg.block          , __ATOMIC_SEQ_CST );
                         __atomic_fetch_add( &cltr->io.submit_q.alloc_avg.val           , proc->io.submit_q.alloc_avg.val           , __ATOMIC_SEQ_CST );
                         __atomic_fetch_add( &cltr->io.submit_q.alloc_avg.cnt           , proc->io.submit_q.alloc_avg.cnt           , __ATOMIC_SEQ_CST );
                         __atomic_fetch_add( &cltr->io.submit_q.alloc_avg.block         , proc->io.submit_q.alloc_avg.block         , __ATOMIC_SEQ_CST );
                         __atomic_fetch_add( &cltr->io.submit_q.helped                  , proc->io.submit_q.helped                  , __ATOMIC_SEQ_CST );
                         __atomic_fetch_add( &cltr->io.submit_q.leader                  , proc->io.submit_q.leader                  , __ATOMIC_SEQ_CST );
                         __atomic_fetch_add( &cltr->io.submit_q.busy                    , proc->io.submit_q.busy                    , __ATOMIC_SEQ_CST );
                         __atomic_fetch_add( &cltr->io.complete_q.completed_avg.val     , proc->io.complete_q.completed_avg.val     , __ATOMIC_SEQ_CST );
                         __atomic_fetch_add( &cltr->io.complete_q.completed_avg.slow_cnt, proc->io.complete_q.completed_avg.slow_cnt, __ATOMIC_SEQ_CST );
                         __atomic_fetch_add( &cltr->io.complete_q.completed_avg.fast_cnt, proc->io.complete_q.completed_avg.fast_cnt, __ATOMIC_SEQ_CST );
+                        __atomic_fetch_add( &cltr->io.submit_q.submit_avg.rdy     , proc->io.submit_q.submit_avg.rdy     , __ATOMIC_SEQ_CST );
+                        __atomic_fetch_add( &cltr->io.submit_q.submit_avg.csm     , proc->io.submit_q.submit_avg.csm     , __ATOMIC_SEQ_CST );
+                        __atomic_fetch_add( &cltr->io.submit_q.submit_avg.avl     , proc->io.submit_q.submit_avg.avl     , __ATOMIC_SEQ_CST );
+                        __atomic_fetch_add( &cltr->io.submit_q.submit_avg.cnt     , proc->io.submit_q.submit_avg.cnt     , __ATOMIC_SEQ_CST );
+                        __atomic_fetch_add( &cltr->io.submit_q.look_avg.val       , proc->io.submit_q.look_avg.val       , __ATOMIC_SEQ_CST );
+                        __atomic_fetch_add( &cltr->io.submit_q.look_avg.cnt       , proc->io.submit_q.look_avg.cnt       , __ATOMIC_SEQ_CST );
+                        __atomic_fetch_add( &cltr->io.submit_q.look_avg.block     , proc->io.submit_q.look_avg.block     , __ATOMIC_SEQ_CST );
+                        __atomic_fetch_add( &cltr->io.submit_q.alloc_avg.val      , proc->io.submit_q.alloc_avg.val      , __ATOMIC_SEQ_CST );
+                        __atomic_fetch_add( &cltr->io.submit_q.alloc_avg.cnt      , proc->io.submit_q.alloc_avg.cnt      , __ATOMIC_SEQ_CST );
+                        __atomic_fetch_add( &cltr->io.submit_q.alloc_avg.block    , proc->io.submit_q.alloc_avg.block    , __ATOMIC_SEQ_CST );
+                        __atomic_fetch_add( &cltr->io.submit_q.helped             , proc->io.submit_q.helped             , __ATOMIC_SEQ_CST );
+                        __atomic_fetch_add( &cltr->io.submit_q.leader             , proc->io.submit_q.leader             , __ATOMIC_SEQ_CST );
+                        __atomic_fetch_add( &cltr->io.submit_q.busy               , proc->io.submit_q.busy               , __ATOMIC_SEQ_CST );
+                        __atomic_fetch_add( &cltr->io.complete_q.completed_avg.val, proc->io.complete_q.completed_avg.val, __ATOMIC_SEQ_CST );
+                        __atomic_fetch_add( &cltr->io.complete_q.completed_avg.cnt, proc->io.complete_q.completed_avg.cnt, __ATOMIC_SEQ_CST );
+                        __atomic_fetch_add( &cltr->io.complete_q.blocks           , proc->io.complete_q.blocks           , __ATOMIC_SEQ_CST );
                 #endif
+        }
 …
                                         "- avg alloc search len   : %'18.2lf\n"
                                         "- avg alloc search block : %'18.2lf\n"
                                         "- total wait calls       : %'15" PRIu64 "   (%'" PRIu64 " slow, %'" PRIu64 " fast)\n"
+                                        "- total wait calls       : %'15" PRIu64 "\n"
                                         "- avg completion/wait    : %'18.2lf\n"
+                                        "- total completion blocks: %'15" PRIu64 "\n"
                                         "\n"
                                         , cluster ? "Cluster" : "Processor",  name, id
 …
                                         , io.submit_q.alloc_avg.cnt
                                         , aavgv, aavgb
                                         , io.complete_q.completed_avg.slow_cnt + io.complete_q.completed_avg.fast_cnt
                                         , io.complete_q.completed_avg.slow_cnt,  io.complete_q.completed_avg.fast_cnt
                                         , ((double)io.complete_q.completed_avg.val) / (io.complete_q.completed_avg.slow_cnt + io.complete_q.completed_avg.fast_cnt)
+                                        , io.complete_q.completed_avg.cnt
+                                        , ((double)io.complete_q.completed_avg.val) / io.complete_q.completed_avg.cnt
+                                        , io.complete_q.blocks
                                 );
+                        }

libcfa/src/concurrency/stats.hfa

-              ref9988b
+              r13d33a75
                                 struct {
                                         volatile uint64_t val;
+                                        volatile uint64_t slow_cnt;
+                                        volatile uint64_t fast_cnt;
+                                        volatile uint64_t cnt;
                                 } completed_avg;
+                                volatile uint64_t blocks;
                         } complete_q;
                 };

Note: See TracChangeset for help on using the changeset viewer.

Download in other formats: