Changeset e67a82d for libcfa/src/concurrency
- Timestamp: Aug 20, 2020, 11:48:15 PM
- Branches: ADT, arm-eh, ast-experimental, enum, forall-pointer-decay, jacob/cs343-translation, master, new-ast-unique-expr, pthread-emulation, qualifiedEnum
- Children: d685cb0
- Parents: 67ca73e, 013b028
- Location: libcfa/src/concurrency
- Files: 17 edited

Note: this is a merge changeset; the changes listed below correspond to the merge itself, not to either parent individually.
- libcfa/src/concurrency/alarm.hfa
  - The list container is now included with a quoted, project-local path:
        #include <containers/list.hfa>   becomes   #include "containers/list.hfa"
- libcfa/src/concurrency/coroutine.cfa
  - Adds a C-linkage accessor, inside the existing extern "C" block, so that plain C code (invoke.c) can reach the running coroutine:
        struct $coroutine * __cfactx_cor_active(void) {
            return active_coroutine();
        }
- libcfa/src/concurrency/invoke.c
  - Header comment updated (Last Modified On: Thu Aug 20 23:43:23 2020; Update Count: 30 -> 31).
  - Declares extern struct $coroutine * __cfactx_cor_active(void); and uses it to locate the per-stack exception state:
        struct exception_context_t * this_exception_context() {
            return &__get_stack( __cfactx_cor_active() )->exception_context;
        }
  - The __ARM_ARCH_32 #warning ("ARM needs to be upgraded to use two parameters like X86/X64") is promoted to an #error, with an explanation added: to avoid the thunk problem, the invoke routine now passes the coroutine main explicitly instead of relying on an assertion, which hoists any required thunk one level (enough to reach global scope in most cases). __cfactx_invoke_... therefore takes two parameters and the FakeStack layout must be adjusted accordingly; the author does not know how to do that for ARM, hence the #error.
- libcfa/src/concurrency/invoke.h
  - Forward-declares __cfaehm_try_resume_node and __cfaehm_base_exception_t, and defines the per-stack exception state:
        struct exception_context_t {
            struct __cfaehm_try_resume_node * top_resume;
            struct __cfaehm_base_exception_t * current_exception;
        };
  - struct __stack_t gains a struct exception_context_t exception_context; member after base, so every coroutine stack carries its own exception-handling context.
  - __get_stack is reflowed over multiple lines (no behavioural change), and struct exception_context_t * this_exception_context(); is declared for use by the EHM.
- libcfa/src/concurrency/io.cfa
  - New leader-lock primitives for the submission path:
    - try_lock( __leaderlock_t & ) tags the caller's $thread pointer with the low bit and, with interrupts disabled, loops on a CAS over the lock word: 0p means unlocked, so CAS to 1p and acquire without blocking; 1p means locked with no successor, so install the tagged pointer and park() as the next leader; any other value means a next leader already exists, so re-enable interrupts and return false. It returns true once the lock is held (immediately or via handoff), with interrupts still disabled.
    - next( __leaderlock_t & ) releases: if the word is 1p it CASes to 0p (fully unlocked, returns false); otherwise it strips the tag, leaves the word at 1p (still locked), unpark()s the next leader, and returns true.
  - All ring indexes and user data move from uint32_t/uint64_t/int32_t to the kernel's __u32/__u64 fixed-width types.
  - process() completes a request with post( data->sem ) on the request's oneshot semaphore instead of unpark( data->thrd ).
  - Completion statistics: completed_avg.fast_cnt becomes completed_avg.cnt, and a new complete_q.blocks counter is incremented when the fast poller gives up and baton-passes to the slow poller.
  - __submit_to_ready_array, when it finds no free slot, now calls __release_consumed_submission directly and yields only if nothing was released (previously it try_lock'ed the submission lock first).
  - __submit: the eager-submit path gains a LEADER_LOCK compile-time variant. A thread that finds both the lock held and a next leader already registered returns immediately, counted as helped (the leader will pick up its entry); otherwise it acquires the lock outright or parks as the next leader and is handed the lock, counted as leader. The leader collects the pending submissions, calls __io_uring_enter, releases the consumed SQEs, and hands the lock off with next() rather than unlock(). The old yield/try_lock loop, including its #warning ABA problem, is kept as the #else branch. The lock is renamed submit_q.lock -> submit_q.submit_lock throughout, an error return (ret < 0) now happens after the lock is released or handed off, and the stats update moves after the release.
  - __collect_submitions gains an optional PARTIAL_SUBMIT mode (left commented out: a cap of 32 entries per pass, resuming from the new submit_q.prev_ready cursor); it #errors if combined with LEADER_LOCK.
  - __release_consumed_submission now returns the __u32 count of released entries.
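For readers unfamiliar with the pattern, below is a minimal sketch of the tagged-pointer leader lock in plain C11 atomics. It omits the interrupt handling, and park(), unpark(), and current_thread() are hypothetical stand-ins for the CFA scheduler hooks, not the library's API:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    /* Hypothetical scheduler hooks standing in for CFA's park/unpark. */
    extern void  park(void);
    extern void  unpark(void *thread);
    extern void *current_thread(void);

    /* Lock word: 0 = unlocked; 1 = locked, no successor;
       (thread | 1) = locked, 'thread' is parked as the next leader. */
    typedef struct { _Atomic uintptr_t value; } leaderlock_t;

    /* Returns true once the caller holds the lock, either immediately or
       after being handed it; false if the successor slot was already taken. */
    static bool leader_try_lock(leaderlock_t *l) {
        uintptr_t self = (uintptr_t)current_thread() | 1;
        for (;;) {
            uintptr_t expected = atomic_load(&l->value);
            if (expected != 0 && expected != 1) return false; /* someone queued */
            uintptr_t desired = (expected == 0) ? 1 : self;   /* acquire or queue */
            if (atomic_compare_exchange_weak(&l->value, &expected, desired)) {
                if (desired == self) park();   /* sleep; leader hands the lock off */
                return true;
            }
        }
    }

    /* Release: wake the queued successor, keeping the lock held for it,
       or clear the word if no one is waiting. */
    static void leader_next(leaderlock_t *l) {
        for (;;) {
            uintptr_t expected = atomic_load(&l->value);
            uintptr_t next    = (expected == 1) ? 0 : expected & ~(uintptr_t)1;
            uintptr_t desired = (expected == 1) ? 0 : 1;  /* stay locked on handoff */
            if (atomic_compare_exchange_weak(&l->value, &expected, desired)) {
                if (next) unpark((void *)next);
                return;
            }
        }
    }

The point of the design is that at most one thread (the registered next leader) ever blocks on the lock; every other contender either acquires it outright or returns immediately, knowing that the current leader will submit its already-published entry.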
- libcfa/src/concurrency/io/setup.cfa
  - The teardown assertion now checks cltr.idles.total == 0 instead of the removed cltr.nprocessors.
  - Parameter validation added: num_entries defaults to 256 when 0, must be a power of 2 (abort with "I/O setup 'num_entries' must be a power of 2" otherwise), and poller_submits and eager_submits may not be used together (abort).
  - The mmap'ed submission/completion ring pointers (head, tail, mask, num, flags, dropped, array, overflow, cqes) are cast to __u32 * / struct io_uring_cqe * instead of uint32_t *.
  - sq.lock is renamed sq.submit_lock, and the new sq.prev_ready cursor is initialized to 0 in both the ready-array and no-ready-array branches.
  - __ioctx_register stores the context pointer as ev.data.u64 = (__u64)&ctx.
- libcfa/src/concurrency/io/types.hfa
  - Includes <linux/types.h> (inside extern "C") for the kernel's fixed-width __u32/__u64/__s32 types, used throughout the ring structures.
  - Defines LEADER_LOCK and the lock itself:
        struct __leaderlock_t {
            struct $thread * volatile value;  // ($thread) next_leader | (bool:1) is_locked
        };
        static inline void ?{}( __leaderlock_t & this ) { this.value = 0p; }
  - __submition_data gains a __u32 prev_ready cursor, and its lock becomes submit_lock: a __leaderlock_t under LEADER_LOCK, otherwise a __spinlock_t; release_lock stays a __spinlock_t.
  - __io_user_data_t now holds a __s32 result and a oneshot sem instead of a $thread * thrd.
- libcfa/src/concurrency/iocall.cfa
  - The __submit_alloc/__submit externs and the io_uring_sqe constructors switch to __u8/__u32/__u64.
  - __submit_prelude no longer records active_thread() in the user data (the struct now carries the oneshot semaphore) and computes the sanitized flags (__u8 sflags = REGULAR_FLAGS & submit_flags;) before allocating the SQE.
  - __submit_wait blocks with wait( data.sem ) instead of park().
  - The sync_file_range, fallocate, and posix_fadvise prototypes use off_t/off64_t (with local typedefs keyed on __OFF_T_MATCHES_OFF64_T) rather than int64_t/uint64_t, and the cfa_sync_file_range, cfa_fallocate, and cfa_fadvise signatures follow.
  - cfa_preadv2, cfa_pwritev2, and cfa_fsync now fill every SQE field explicitly (opcode, ioprio, fd, off, addr, len, rw_flags, __pad2[0..2]) instead of relying on the constructor.
  - The remaining pointer/offset casts in cfa_send, cfa_recv, cfa_accept4, cfa_connect, cfa_madvise, cfa_openat, cfa_statx, and cfa_splice use (__u64).
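Put together, the new request life-cycle reads as follows; this is assembled from the prelude/wait macros here and process() in io.cfa, slightly simplified:

    // One request's life cycle with the oneshot semaphore (simplified).
    struct __io_user_data_t {
        __s32 result;
        oneshot sem;        // single-use rendezvous: one wait(), one post()
    };

    // Submitting thread (body of a cfa_* wrapper):
    __io_user_data_t data = { 0 };
    sqe->user_data = (__u64)(uintptr_t)&data;   // the CQE carries this back
    __submit( context, idx );
    wait( data.sem );                           // block until the CQE is processed
    if( data.result < 0 ) { errno = -data.result; return -1; }
    return data.result;

    // Completion poller, for each harvested CQE:
    static inline void process( struct io_uring_cqe & cqe ) {
        struct __io_user_data_t * data = (struct __io_user_data_t *)(uintptr_t)cqe.user_data;
        data->result = cqe.res;
        post( data->sem );                      // wakes the submitter exactly once
    }

The usual motivation for swapping a raw $thread */unpark() pair for a one-shot semaphore is that the semaphore tolerates post() arriving before wait(): if the completion races ahead of the submitter, the subsequent wait() simply does not block.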
- libcfa/src/concurrency/kernel.cfa
  - The processor main loop is restructured around labelled MAIN_LOOP/HALT blocks. Each iteration tries __next_thread, then __next_thread_slow. If both fail and the processor is not terminating, it pushes itself onto the cluster's idle list, re-runs __next_thread_slow (cancelling the halt, with a ready.sleep.cancels tick, if work appeared in the window), and otherwise wait()s on its idle semaphore; on wake-up it removes itself from the idle list and restarts the loop rather than assuming work was left for it. The ready.sleep.halts counter and the print_halts trace move here from the deleted __halt.
  - Processor start-up seeds the new per-processor ready_rng (fwd_seed = 25214903917 * (rdtscl() ^ &runner)) and calls __tls_rand_advance_bck().
  - __run_thread absorbs the paranoid state checks and the __builtin_prefetch of the target's stack pointer from the loop, and re-asserts on exit that preemption is disabled.
  - __schedule_thread calls __wake_one unconditionally; __wake_one is now void, asserts the ready-schedule lock is held, bumps ready.sleep.wakes itself, and posts the first idle processor's semaphore only when the idle-list snapshot reports idle != 0.
  - __has_next_thread is replaced by __next_thread_slow (a pop_slow wrapper); __next_thread becomes inline; __wake_proc returns void.
  - New helpers for __cluster_idles:
    - push/remove take the idle lock, adjust the idle count (with bounds assertions), and insert_first/remove the processor in the intrusive list; both assert preemption is disabled.
    - query( __cluster_idles & ) returns [idle, total, proc] lock-free: it retries while the lock word is odd or changes across the snapshot, with a compiler fence noted as a workaround for incorrect reordering by gcc-8 and older.
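Condensed, the loop's sleep protocol looks like the sketch below (queue_pop, idle_push, and friends are illustrative names, not the changeset's). The key is the ordering that closes the lost-wakeup window: the sleeper publishes itself on the idle list before its final exhaustive queue check, while a waker pushes work before scanning the idle list, so at least one side always observes the other:

    // Sketch of the processor main loop's halt path.
    for (;;) {
        thread_t *t = queue_pop();           // fast path, may fail spuriously
        if (!t) t = queue_pop_slow();        // exhaustive scan
        if (!t) {
            if (terminating()) break;
            idle_push(self);                 // 1. publish: "I am going to sleep"
            t = queue_pop_slow();            // 2. final check, after publishing
            if (t) {
                idle_remove(self);           //    work raced in: cancel the halt
            } else {
                sem_wait(&self->idle);       // 3. actually sleep
                idle_remove(self);
                continue;                    //    woken: redo the full search
            }
        }
        run_thread(self, t);
        if (terminating()) break;
    }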
- libcfa/src/concurrency/kernel.hfa
  - Swaps containers/stackLockFree.hfa for containers/list.hfa: processor joins an intrusive doubly-linked list via DLISTED_MGD_IMPL_IN/DLISTED_MGD_IMPL_OUT instead of carrying a Link(processor) field and its ?`next accessor.
  - New idle-sleep state, replacing the lock-free stack and atomic counter:
        struct __cluster_idles {
            volatile uint64_t lock;            // spin lock protecting the queue
            unsigned total;                    // total number of processors
            unsigned idle;                     // number of idle processors
            dlist(processor, processor) list;  // list of idle processors
        };
  - cluster replaces StackLF(processor) idles; volatile unsigned int nprocessors; with a single __cluster_idles idles;.
- libcfa/src/concurrency/kernel/fwd.hfa
  - kernelTLS gains a ready_rng with separate fwd_seed and bck_seed.
  - Adds a reversible 48-bit linear congruential generator. With M = 2^48, A = 25214903917, AI = 18446708753438544741 (the multiplicative inverse of A modulo 2^48), C = 11, and D = 16:
    - __tls_rand_fwd() steps fwd_seed = (A * fwd_seed + C) & (M - 1) and returns fwd_seed >> D;
    - __tls_rand_bck() returns bck_seed >> D first, then steps bck_seed = AI * (bck_seed - C) & (M - 1);
    - __tls_rand_advance_bck() copies fwd_seed into bck_seed, after which __tls_rand_bck replays the forward outputs in reverse order.
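As a standalone illustration, here is the same generator in portable C. The constants are copied from the changeset (A and C are the familiar drand48/java.util.Random parameters); the initial seed value and the main() self-check are this sketch's additions, mirroring verify_fwd_bck_rng:

    #include <assert.h>
    #include <stdint.h>

    #define M  (1ULL << 48)
    #define A  25214903917ULL            /* LCG multiplier */
    #define AI 18446708753438544741ULL   /* A^-1 mod 2^48: makes the backward step exact */
    #define C  11ULL
    #define D  16                        /* drop the statistically weak low bits */

    static uint64_t fwd_seed = 0x853c49e6748fea9bULL & (M - 1);  /* arbitrary seed */
    static uint64_t bck_seed;

    static unsigned rand_fwd(void) {
        fwd_seed = (A * fwd_seed + C) & (M - 1);
        return fwd_seed >> D;
    }

    /* Returns the value first, then steps back, so after copying fwd_seed
       into bck_seed the backward stream starts with the most recent
       forward output. */
    static unsigned rand_bck(void) {
        unsigned r = bck_seed >> D;
        bck_seed = AI * (bck_seed - C) & (M - 1);
        return r;
    }

    int main(void) {
        unsigned v[10];
        for (int i = 0; i < 10; i++) v[i] = rand_fwd();
        bck_seed = fwd_seed;                        /* __tls_rand_advance_bck */
        for (int i = 9; i >= 0; i--) assert(v[i] == rand_bck());
        return 0;
    }

Within this changeset the generator is only seeded and self-checked (see verify_fwd_bck_rng in kernel/startup.cfa below); nothing consumes the reversible streams yet.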
- libcfa/src/concurrency/kernel/startup.cfa
  - Under __CFA_WITH_VERIFY__, a new verify_fwd_bck_rng() seeds the generator from rdtscl(), draws ten forward values, calls __tls_rand_advance_bck(), and checks that the backward stream reproduces them in reverse; the kernel boot sequence asserts it.
  - The __wake_proc extern now returns void.
  - Processor init/deinit adjust cltr->idles.total under the idle lock instead of atomically updating the removed nprocessors counter, and the unsafe_remove from the old lock-free idle stack is gone (the main loop now removes itself from the idle list). The processor constructor and destructor run init/deinit with interrupts disabled.
  - A constructor for __cluster_idles zeroes lock, idle, and total and constructs the list; the cluster constructor drops its nprocessors initialization.
- libcfa/src/concurrency/kernel_private.hfa
  - Adds the writer side of the idle-list lock: lock( __cluster_idles & ) spins until the 64-bit word is even and a CAS bumps it to odd; unlock asserts the word is odd and fetch-adds 1 to make it even again, so the word doubles as a sequence counter for lock-free readers.
  - Documents that the cluster-level pop may now return 0p spuriously, and declares the new pop_slow, which is guaranteed to find any thread added before the call.
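A minimal sketch of the construction in C11 atomics, with a reader-side snapshot like the query() added in kernel.cfa; the field and function names here are illustrative:

    #include <stdatomic.h>
    #include <stdint.h>

    /* One word serves as both writer mutex and sequence counter:
       even = unlocked, odd = a writer is mutating the idle list. */
    static _Atomic uint64_t seq;
    static unsigned idle, total;     /* the protected fields */

    static void idles_lock(void) {
        for (;;) {
            uint64_t l = atomic_load(&seq);
            if ((l % 2) == 0 &&
                atomic_compare_exchange_weak(&seq, &l, l + 1))
                return;
            /* Pause() / cpu_relax() in the real code */
        }
    }

    static void idles_unlock(void) {
        atomic_fetch_add(&seq, 1);   /* odd -> even, publishing the update */
    }

    /* Lock-free reader: retry until the counter is even and unchanged
       around the snapshot, i.e. no writer ran in between. */
    static void idles_query(unsigned *idle_out, unsigned *total_out) {
        for (;;) {
            uint64_t l = atomic_load(&seq);
            if (l % 2) continue;                        /* writer active */
            unsigned i = idle, t = total;
            atomic_signal_fence(memory_order_seq_cst);  /* compiler fence */
            if (l == atomic_load(&seq)) { *idle_out = i; *total_out = t; return; }
        }
    }

The payoff is that the hot path never takes the lock: __wake_one only reads a consistent snapshot and posts a semaphore, while the comparatively rare sleep and wake-up transitions pay for the mutual exclusion.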
- libcfa/src/concurrency/ready_queue.cfa
  - The SNZI ("is anything ready?") summary structure becomes compile-time optional behind USE_SNZI, which is left commented out, i.e. disabled: its construction, teardown, query, arrive, and depart calls are all guarded, and query(cltr) unconditionally returns true without it.
  - ready_mutate_lock and ready_mutate_unlock assert that preemption is disabled on entry and exit.
  - pop() loads lanes.count once up front; without the SNZI it probes at most 25 times, each time picking two random lanes and stealing from the one with the older head timestamp, so it may now fail spuriously even when work exists.
  - A new pop_slow() scans every lane exactly once starting from a random offset and returns 0p only if all lanes were empty; it is the exhaustive fallback a processor runs before going idle (see the snippet below).
  - The per-lane pop now returns just the thread; the dropped "emptied" flag existed only to drive SNZI departs, which simplifies both the two-choice pop and the targeted remove.
  - grow/shrink tear down and rebuild the SNZI only when USE_SNZI is defined.
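The fallback itself is small; as landed in the changeset, with comments added:

    __attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
        /* paranoid */ verify( lanes.count > 0 );
        unsigned count  = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
        unsigned offset = __tls_rand();
        for(i; count) {                       // visit every lane once, from a random start
            unsigned idx = (offset + i) % count;
            struct $thread * thrd = try_pop(cltr, idx);
            if(thrd) return thrd;
        }
        // All lanes were empty, return 0p
        return 0p;
    }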
- libcfa/src/concurrency/ready_subqueue.hfa
  - The lane pop returns a plain $thread * instead of the tuple [$thread *, bool]; the paranoid assertions around it are unchanged.
  - The tuple assignment node->link.[next, prev] = 0p; is split into two ordinary assignments (node->link.next = 0p; node->link.prev = 0p;).
- libcfa/src/concurrency/stats.cfa
  - The two completion counters completed_avg.slow_cnt and completed_avg.fast_cnt collapse into a single completed_avg.cnt, and the new complete_q.blocks counter is reset in __init_stats and accumulated in __tally_stats.
  - The block of atomic adds in __tally_stats is realigned for readability (no behavioural change).
  - The printed report now shows "total wait calls", "avg completion/wait" (val / cnt), and a new "total completion blocks" line instead of the slow/fast split.
- libcfa/src/concurrency/stats.hfa
  - Matching structure change: completed_avg keeps val plus a single volatile uint64_t cnt (replacing slow_cnt and fast_cnt), and complete_q gains volatile uint64_t blocks;.