Timestamp: Jun 12, 2023, 2:45:32 PM
Author: Fangren Yu <f37yu@…>
Branches: ast-experimental, master
Children: 62d62db
Parents: 34b4268 (diff), 251ce80 (diff)
Note: this is a merge changeset; the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.
Message: Merge branch 'master' into ast-experimental
Location: libcfa/src/concurrency
Files: 5 added, 28 edited

  • libcfa/src/concurrency/clib/cfathread.cfa

    r34b4268 r24d6572  
    1616// #define EPOLL_FOR_SOCKETS
    1717
     18#include <string.h>
     19
    1820#include "fstream.hfa"
    1921#include "locks.hfa"
     
    2325#include "time.hfa"
    2426#include "stdlib.hfa"
    25 
     27#include "iofwd.hfa"
    2628#include "cfathread.h"
    27 
    28 extern "C" {
    29                 #include <string.h>
    30                 #include <errno.h>
    31 }
    3229
    3330extern void ?{}(processor &, const char[], cluster &, thread$ *);
    3431extern "C" {
    35       extern void __cfactx_invoke_thread(void (*main)(void *), void * this);
    36         extern int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
     32        extern void __cfactx_invoke_thread(void (*main)(void *), void * this);
    3733}
    3834
     
    439435        // Mutex
    440436        struct cfathread_mutex {
    441                 linear_backoff_then_block_lock impl;
     437                exp_backoff_then_block_lock impl;
    442438        };
    443439        int cfathread_mutex_init(cfathread_mutex_t *restrict mut, const cfathread_mutexattr_t *restrict) __attribute__((nonnull (1))) { *mut = new(); return 0; }
     
    454450        // Condition
    455451        struct cfathread_condition {
    456                 condition_variable(linear_backoff_then_block_lock) impl;
     452                condition_variable(exp_backoff_then_block_lock) impl;
    457453        };
    458454        int cfathread_cond_init(cfathread_cond_t *restrict cond, const cfathread_condattr_t *restrict) __attribute__((nonnull (1))) { *cond = new(); return 0; }
     
    472468}
    473469
    474 #include <iofwd.hfa>
    475 
    476470extern "C" {
    477         #include <unistd.h>
    478         #include <sys/types.h>
    479         #include <sys/socket.h>
    480 
    481471        //--------------------
    482472        // IO operations
     
    488478                , protocol);
    489479        }
    490         int cfathread_bind(int socket, const struct sockaddr *address, socklen_t address_len) {
     480        int cfathread_bind(int socket, __CONST_SOCKADDR_ARG address, socklen_t address_len) {
    491481                return bind(socket, address, address_len);
    492482        }
     
    496486        }
    497487
    498         int cfathread_accept(int socket, struct sockaddr *restrict address, socklen_t *restrict address_len) {
     488        int cfathread_accept(int socket, __SOCKADDR_ARG address, socklen_t *restrict address_len) {
    499489                #if defined(EPOLL_FOR_SOCKETS)
    500490                        int ret;
     
    513503        }
    514504
    515         int cfathread_connect(int socket, const struct sockaddr *address, socklen_t address_len) {
     505        int cfathread_connect(int socket, __CONST_SOCKADDR_ARG address, socklen_t address_len) {
    516506                #if defined(EPOLL_FOR_SOCKETS)
    517507                        int ret;
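
The clib wrappers above also switch their mutex and condition-variable implementations from linear_backoff_then_block_lock to exp_backoff_then_block_lock. As a rough illustration of the underlying idea only (this is not the runtime's implementation; the helper name, the spin budget, and the x86 pause intrinsic are all assumptions), an exponential-backoff lock retries with a doubling pause before giving up and letting the caller block:

    // Illustrative sketch: spin on the flag, doubling the pause between
    // attempts, and report failure so the caller can fall back to blocking.
    static bool try_lock_exp_backoff( volatile int * flag, int max_spins ) {
            int delay = 1;
            for ( int spins = 0; spins < max_spins; spins += delay ) {
                    if ( __atomic_exchange_n( flag, 1, __ATOMIC_ACQUIRE ) == 0 )
                            return true;                    // acquired
                    for ( int i = 0; i < delay; i += 1 )
                            __builtin_ia32_pause();         // assumes x86; any cheap pause works
                    delay *= 2;                             // exponential backoff
            }
            return false;                                   // caller blocks on the scheduler instead
    }
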
  • libcfa/src/concurrency/clib/cfathread.h

    r34b4268 r24d6572  
    99// Author           : Thierry Delisle
    1010// Created On       : Tue Sep 22 15:31:20 2020
    11 // Last Modified By :
    12 // Last Modified On :
    13 // Update Count     :
     11// Last Modified By : Peter A. Buhr
     12// Last Modified On : Mon Mar 13 23:48:40 2023
     13// Update Count     : 7
    1414//
    1515
     16#pragma once
     17
    1618#if defined(__cforall) || defined(__cplusplus)
     19#include <unistd.h>
     20#include <errno.h>
     21#include <sys/socket.h>
     22
    1723extern "C" {
    1824#endif
    19         #include <asm/types.h>
    20         #include <errno.h>
    21         #include <unistd.h>
    22 
    23 
    2425        //--------------------
    2526        // Basic types
     
    7374        } cfathread_mutexattr_t;
    7475        typedef struct cfathread_mutex * cfathread_mutex_t;
    75         int cfathread_mutex_init(cfathread_mutex_t *restrict mut, const cfathread_mutexattr_t *restrict attr) __attribute__((nonnull (1)));
     76        int cfathread_mutex_init(cfathread_mutex_t * restrict mut, const cfathread_mutexattr_t * restrict attr) __attribute__((nonnull (1)));
    7677        int cfathread_mutex_destroy(cfathread_mutex_t *mut) __attribute__((nonnull (1)));
    7778        int cfathread_mutex_lock(cfathread_mutex_t *mut) __attribute__((nonnull (1)));
     
    9192        //--------------------
    9293        // IO operations
    93         struct sockaddr;
    94         struct msghdr;
    9594        int cfathread_socket(int domain, int type, int protocol);
    96         int cfathread_bind(int socket, const struct sockaddr *address, socklen_t address_len);
     95        int cfathread_bind(int socket, __CONST_SOCKADDR_ARG address, socklen_t address_len);
    9796        int cfathread_listen(int socket, int backlog);
    98         int cfathread_accept(int socket, struct sockaddr *restrict address, socklen_t *restrict address_len);
    99         int cfathread_connect(int socket, const struct sockaddr *address, socklen_t address_len);
     97        int cfathread_accept(int socket, __SOCKADDR_ARG address, socklen_t * restrict address_len);
     98        int cfathread_connect(int socket, __CONST_SOCKADDR_ARG address, socklen_t address_len);
    10099        int cfathread_dup(int fildes);
    101100        int cfathread_close(int fildes);
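
Both files above change cfathread_bind, cfathread_accept, and cfathread_connect to take glibc's __CONST_SOCKADDR_ARG and __SOCKADDR_ARG instead of plain struct sockaddr pointers, matching the declarations in <sys/socket.h>. For readers unfamiliar with those macros, the following paraphrase (abbreviated, and not part of this changeset) shows roughly what glibc defines: with GCC they are transparent unions over the sockaddr flavours, so callers can pass any sockaddr_* pointer without a cast; otherwise they fall back to plain pointers.

    /* Paraphrased from glibc <sys/socket.h>; the member list is abbreviated. */
    #ifdef __GNUC__
    typedef union {
            struct sockaddr     * __restrict __sockaddr__;
            struct sockaddr_in  * __restrict __sockaddr_in__;
            struct sockaddr_in6 * __restrict __sockaddr_in6__;
            struct sockaddr_un  * __restrict __sockaddr_un__;
            /* ... one member per supported address family ... */
    } __SOCKADDR_ARG __attribute__((__transparent_union__));
    #else
    #define __SOCKADDR_ARG        struct sockaddr * __restrict
    #define __CONST_SOCKADDR_ARG  const struct sockaddr *
    #endif
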
  • libcfa/src/concurrency/coroutine.cfa

    r34b4268 r24d6572  
    1010// Created On       : Mon Nov 28 12:27:26 2016
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Tue Dec 15 12:06:04 2020
    13 // Update Count     : 23
     12// Last Modified On : Thu Feb 16 15:34:46 2023
     13// Update Count     : 24
    1414//
    1515
    1616#define __cforall_thread__
    17 #define _GNU_SOURCE
    1817
    1918#include "coroutine.hfa"
  • libcfa/src/concurrency/coroutine.hfa

    r34b4268 r24d6572  
    1010// Created On       : Mon Nov 28 12:27:26 2016
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Thu Jan  6 16:33:16 2022
    13 // Update Count     : 12
     12// Last Modified On : Thu Feb  2 11:31:42 2023
     13// Update Count     : 13
    1414//
    1515
     
    3838// Anything that implements this trait can be resumed.
    3939// Anything that is resumed is a coroutine.
    40 trait is_coroutine(T & | IS_RESUMPTION_EXCEPTION(CoroutineCancelled(T))) {
     40forall( T & | IS_RESUMPTION_EXCEPTION(CoroutineCancelled(T)) )
     41trait is_coroutine {
    4142        void main(T & this);
    4243        coroutine$ * get_coroutine(T & this);
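
coroutine.hfa above changes how the polymorphic parameter of is_coroutine is declared: instead of a parameter list on the trait itself, the type variable and its assertions now come from a forall clause in front of the trait. A small hypothetical trait in the same style (the trait and function names below are illustrative and not part of the library):

    // New-style trait declaration, mirroring is_coroutine above.
    forall( T )
    trait is_printable {
            void print( T & this );
    };

    // Assertion lists that use the trait are written the same way as before.
    forall( T | is_printable( T ) )
    void print_twice( T & x ) {
            print( x );
            print( x );
    }
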
  • libcfa/src/concurrency/future.hfa

    r34b4268 r24d6572  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // io/types.hfa --
    8 //
    9 // Author           : Thierry Delisle & Peiran Hong
     7// concurrency/future.hfa --
     8//
     9// Author           : Thierry Delisle & Peiran Hong & Colby Parsons
    1010// Created On       : Wed Jan 06 17:33:18 2021
    1111// Last Modified By :
     
    1818#include "bits/locks.hfa"
    1919#include "monitor.hfa"
    20 
     20#include "select.hfa"
     21#include "locks.hfa"
     22
     23//----------------------------------------------------------------------------
     24// future
     25// I don't use future_t here since I need to use a lock for this future
     26//  since it supports multiple consumers
     27//  future_t is lockfree and uses atomics which aren't needed given we use locks here
    2128forall( T ) {
     29    // enum { FUTURE_EMPTY = 0, FUTURE_FULFILLED = 1 }; // Enums seem to be broken so feel free to add this back afterwards
     30
     31    // temporary enum replacement
     32    const int FUTURE_EMPTY = 0;
     33    const int FUTURE_FULFILLED = 1;
     34
    2235        struct future {
     36                int state;
     37                T result;
     38                dlist( select_node ) waiters;
     39        futex_mutex lock;
     40        };
     41
     42    struct future_node {
     43        inline select_node;
     44        T * my_result;
     45    };
     46
     47        static inline {
     48
     49        void ?{}( future_node(T) & this, thread$ * blocked_thread, T * my_result ) {
     50            ((select_node &)this){ blocked_thread };
     51            this.my_result = my_result;
     52        }
     53
     54        void ?{}( future(T) & this ) {
     55                        this.waiters{};
     56            this.state = FUTURE_EMPTY;
     57            this.lock{};
     58                }
     59
     60                // Reset future back to original state
     61                void reset( future(T) & this ) with(this)
     62        {
     63            lock( lock );
     64            if( ! waiters`isEmpty )
     65                abort("Attempting to reset a future with blocked waiters");
     66            state = FUTURE_EMPTY;
     67            unlock( lock );
     68        }
     69
     70                // check if the future is available
     71        // currently no mutual exclusion because I can't see when you need this call to be synchronous or protected
     72                bool available( future(T) & this ) { return __atomic_load_n( &this.state, __ATOMIC_RELAXED ); }
     73
     74
     75        // memcpy wrapper to help copy values
     76        void copy_T( T & from, T & to ) {
     77            memcpy((void *)&to, (void *)&from, sizeof(T));
     78        }
     79
     80        // internal helper to signal waiters off of the future
     81        void _internal_flush( future(T) & this ) with(this) {
     82            while( ! waiters`isEmpty ) {
     83                if ( !__handle_waituntil_OR( waiters ) ) // handle special waituntil OR case
     84                    break; // if handle_OR returns false then waiters is empty so break
     85                select_node &s = try_pop_front( waiters );
     86
     87                if ( s.clause_status == 0p ) // poke in result so that woken threads do not need to reacquire any locks
     88                    copy_T( result, *(((future_node(T) &)s).my_result) );
     89               
     90                wake_one( waiters, s );
     91            }
     92        }
     93
     94                // Fulfil the future, returns whether or not someone was unblocked
     95                bool fulfil( future(T) & this, T val ) with(this) {
     96            lock( lock );
     97            if( state != FUTURE_EMPTY )
     98                abort("Attempting to fulfil a future that has already been fulfilled");
     99
     100            copy_T( val, result );
     101
     102            bool ret_val = ! waiters`isEmpty;
     103            state = FUTURE_FULFILLED;
     104                        _internal_flush( this );
     105            unlock( lock );
     106            return ret_val;
     107                }
     108
     109                // Wait for the future to be fulfilled
     110                // Also return whether the thread had to block or not
     111                [T, bool] get( future(T) & this ) with( this ) {
     112            lock( lock );
     113            T ret_val;
     114            if( state == FUTURE_FULFILLED ) {
     115                copy_T( result, ret_val );
     116                unlock( lock );
     117                return [ret_val, false];
     118            }
     119
     120            future_node(T) node = { active_thread(), &ret_val };
     121            insert_last( waiters, ((select_node &)node) );
     122            unlock( lock );
     123            park( );
     124
     125                        return [ret_val, true];
     126                }
     127
     128                // Wait for the future to be fulfilled
     129                T get( future(T) & this ) {
     130                        [T, bool] tt;
     131                        tt = get(this);
     132                        return tt.0;
     133                }
     134
     135        // Gets value if it is available and returns [ val, true ]
     136        // otherwise returns [ default_val, false]
     137        // will not block
     138        [T, bool] try_get( future(T) & this ) with(this) {
     139            lock( lock );
     140            T ret_val;
     141            if( state == FUTURE_FULFILLED ) {
     142                copy_T( result, ret_val );
     143                unlock( lock );
     144                return [ret_val, true];
     145            }
     146            unlock( lock );
     147           
     148            return [ret_val, false];
     149        }
     150
     151        bool register_select( future(T) & this, select_node & s ) with(this) {
     152            lock( lock );
     153
     154            // check if we can complete operation. If so race to establish winner in special OR case
     155            if ( !s.park_counter && state != FUTURE_EMPTY ) {
     156                if ( !__make_select_node_available( s ) ) { // we didn't win the race so give up on registering
     157                    unlock( lock );
     158                    return false;
     159                }
     160            }
     161
     162            // future not ready -> insert select node and return
     163            if( state == FUTURE_EMPTY ) {
     164                insert_last( waiters, s );
     165                unlock( lock );
     166                return false;
     167            }
     168
     169            __make_select_node_available( s );
     170            unlock( lock );
     171            return true;
     172        }
     173
     174        bool unregister_select( future(T) & this, select_node & s ) with(this) {
     175            if ( ! s`isListed ) return false;
     176            lock( lock );
     177            if ( s`isListed ) remove( s );
     178            unlock( lock );
     179            return false;
     180        }
     181               
     182        void on_selected( future(T) & this, select_node & node ) {}
     183        }
     184}
     185
     186//--------------------------------------------------------------------------------------------------------
     187// These futures below do not support select statements so they may not have as many features as 'future'
     188//  however the 'single_future' is cheap and cheerful and is most likely more performant than 'future'
     189//  since it uses raw atomics and no locks
     190//
     191// As far as 'multi_future' goes I can't see many use cases as it will be less performant than 'future'
     192//  since it is monitor based and also is not compatible with select statements
     193//--------------------------------------------------------------------------------------------------------
     194
     195forall( T ) {
     196        struct single_future {
    23197                inline future_t;
    24198                T result;
     
    27201        static inline {
    28202                // Reset future back to original state
    29                 void reset(future(T) & this) { reset( (future_t&)this ); }
     203                void reset(single_future(T) & this) { reset( (future_t&)this ); }
    30204
    31205                // check if the future is available
    32                 bool available( future(T) & this ) { return available( (future_t&)this ); }
     206                bool available( single_future(T) & this ) { return available( (future_t&)this ); }
    33207
    34208                // Mark the future as abandoned, meaning it will be deleted by the server
    35209                // This doesn't work because of the potential need for a destructor
    36                 void abandon( future(T) & this );
     210                void abandon( single_future(T) & this );
    37211
    38212                // Fulfil the future, returns whether or not someone was unblocked
    39                 thread$ * fulfil( future(T) & this, T result ) {
     213                thread$ * fulfil( single_future(T) & this, T result ) {
    40214                        this.result = result;
    41215                        return fulfil( (future_t&)this );
     
    44218                // Wait for the future to be fulfilled
    45219                // Also return whether the thread had to block or not
    46                 [T, bool] wait( future(T) & this ) {
     220                [T, bool] wait( single_future(T) & this ) {
    47221                        bool r = wait( (future_t&)this );
    48222                        return [this.result, r];
     
    50224
    51225                // Wait for the future to be fulfilled
    52                 T wait( future(T) & this ) {
     226                T wait( single_future(T) & this ) {
    53227                        [T, bool] tt;
    54228                        tt = wait(this);
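
future.hfa above is largely rewritten: the new select-capable future(T) keeps an explicit state word, a waiter list of select_nodes, and a futex_mutex, while the old lock-free future_t wrapper survives under the name single_future(T). A minimal, hypothetical usage sketch of the new interface follows (the thread names and the printed output are illustrative; only future, fulfil, and get come from the header above):

    #include <fstream.hfa>
    #include <thread.hfa>
    #include "future.hfa"

    future( int ) answer;                       // starts in FUTURE_EMPTY

    thread Producer {};
    void main( Producer & ) {
            fulfil( answer, 42 );               // copies the value in and wakes any blocked waiters
    }

    thread Consumer {};
    void main( Consumer & ) {
            int v = get( answer );              // blocks until fulfilled, or returns immediately if already full
            sout | v;
    }

    int main() {
            Producer prod;  Consumer cons;      // threads start at declaration and join at block exit
    }
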
  • libcfa/src/concurrency/invoke.h

    r34b4268 r24d6572  
    1010// Created On       : Tue Jan 17 12:27:26 2016
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Tue Nov 29 20:42:21 2022
    13 // Update Count     : 56
    14 //
     12// Last Modified On : Tue Mar 14 13:39:31 2023
     13// Update Count     : 59
     14//
     15
     16// Do not use #pragma once as this file is included twice in some places. It has its own guard system.
    1517
    1618#include "bits/containers.hfa"
     
    215217                struct __thread_user_link cltr_link;
    216218
    217                 // used to store state between clh lock/unlock
    218                 volatile bool * clh_prev;
    219 
    220                 // used to point to this thd's current clh node
    221                 volatile bool * clh_node;
    222 
    223219                struct processor * last_proc;
     220
     221        // ptr used during handover between blocking lists to allow for stack allocation of intrusive nodes
     222        // main use case is wait-morphing to allow a different node to be used to block on condvar vs lock
     223        void * link_node;
    224224
    225225                PRNG_STATE_T random_state;                                              // fast random numbers
  • libcfa/src/concurrency/io.cfa

    r34b4268 r24d6572  
    1515
    1616#define __cforall_thread__
    17 #define _GNU_SOURCE
    1817
    1918#if defined(__CFA_DEBUG__)
     
    8584        static io_context$ * __ioarbiter_allocate( io_arbiter$ & this, __u32 idxs[], __u32 want );
    8685        static void __ioarbiter_submit( io_context$ * , __u32 idxs[], __u32 have, bool lazy );
    87         static void __ioarbiter_flush ( io_context$ & );
     86        static void __ioarbiter_flush ( io_context$ &, bool kernel );
    8887        static inline void __ioarbiter_notify( io_context$ & ctx );
    8988//=============================================================================================
     
    9493        extern void __kernel_unpark( thread$ * thrd, unpark_hint );
    9594
     95        static inline void __post(oneshot & this, bool kernel, unpark_hint hint) {
     96                thread$ * t = post( this, false );
     97                if(kernel) __kernel_unpark( t, hint );
     98                else unpark( t, hint );
     99        }
     100
     101        // actual system call of io uring
     102        // wrap so everything that needs to happen around it is always done
     103        //   i.e., stats, book keeping, sqe reclamation, etc.
    96104        static void ioring_syscsll( struct io_context$ & ctx, unsigned int min_comp, unsigned int flags ) {
    97105                __STATS__( true, io.calls.flush++; )
    98106                int ret;
    99107                for() {
     108                        // do the system call in a loop, repeat on interrupts
    100109                        ret = syscall( __NR_io_uring_enter, ctx.fd, ctx.sq.to_submit, min_comp, flags, (sigset_t *)0p, _NSIG / 8);
    101110                        if( ret < 0 ) {
     
    120129                /* paranoid */ verify( ctx.sq.to_submit >= ret );
    121130
    122                 ctx.sq.to_submit -= ret;
     131                // keep track of how many still need submitting
     132                __atomic_fetch_sub(&ctx.sq.to_submit, ret, __ATOMIC_SEQ_CST);
    123133
    124134                /* paranoid */ verify( ctx.sq.to_submit <= *ctx.sq.num );
     
    129139                /* paranoid */ verify( ! __preemption_enabled() );
    130140
     141                // mark that there is no pending io left
    131142                __atomic_store_n(&ctx.proc->io.pending, false, __ATOMIC_RELAXED);
    132143        }
    133144
     145        // try to acquire an io context for draining, helping means we never *need* to drain, we can always do it later
    134146        static bool try_acquire( io_context$ * ctx ) __attribute__((nonnull(1))) {
    135147                /* paranoid */ verify( ! __preemption_enabled() );
     
    138150
    139151                {
     152                        // if there is nothing to drain there is no point in acquiring anything
    140153                        const __u32 head = *ctx->cq.head;
    141154                        const __u32 tail = *ctx->cq.tail;
     
    144157                }
    145158
    146                 // Drain the queue
    147                 if(!__atomic_try_acquire(&ctx->cq.lock)) {
     159                // try a simple spinlock acquire, it's likely there are completions to drain
     160                if(!__atomic_try_acquire(&ctx->cq.try_lock)) {
     161                        // some other processor already has it
    148162                        __STATS__( false, io.calls.locked++; )
    149163                        return false;
    150164                }
    151165
     166                // acquired!!
    152167                return true;
    153168        }
    154169
     170        // actually drain the completion
    155171        static bool __cfa_do_drain( io_context$ * ctx, cluster * cltr ) __attribute__((nonnull(1, 2))) {
    156172                /* paranoid */ verify( ! __preemption_enabled() );
    157173                /* paranoid */ verify( ready_schedule_islocked() );
    158                 /* paranoid */ verify( ctx->cq.lock == true );
    159 
     174                /* paranoid */ verify( ctx->cq.try_lock == true );
     175
     176                // get all the invariants and initial state
    160177                const __u32 mask = *ctx->cq.mask;
    161178                const __u32 num  = *ctx->cq.num;
     
    166183                for() {
    167184                        // re-read the head and tail in case it already changed.
     185                        // count the difference between the two
    168186                        const __u32 head = *ctx->cq.head;
    169187                        const __u32 tail = *ctx->cq.tail;
     
    171189                        __STATS__( false, io.calls.drain++; io.calls.completed += count; )
    172190
     191                        // for everything between head and tail, drain it
    173192                        for(i; count) {
    174193                                unsigned idx = (head + i) & mask;
     
    177196                                /* paranoid */ verify(&cqe);
    178197
     198                                // find the future in the completion
    179199                                struct io_future_t * future = (struct io_future_t *)(uintptr_t)cqe.user_data;
    180200                                // __cfadbg_print_safe( io, "Kernel I/O : Syscall completed : cqe %p, result %d for %p\n", &cqe, cqe.res, future );
    181201
     202                                // don't directly fulfill the future, preemption is disabled so we need to use kernel_unpark
    182203                                __kernel_unpark( fulfil( *future, cqe.res, false ), UNPARK_LOCAL );
    183204                        }
    184205
     206                        // update the timestamps accordingly
     207                        // keep a local copy so we can update the relaxed copy
    185208                        ts_next = ctx->cq.ts = rdtscl();
    186209
     
    190213                        ctx->proc->idle_wctx.drain_time = ts_next;
    191214
     215                        // we finished draining the completions... unless the ring buffer was full and there are more secret completions in the kernel.
    192216                        if(likely(count < num)) break;
    193217
     218                        // the ring buffer was full, there could be more stuff in the kernel.
    194219                        ioring_syscsll( *ctx, 0, IORING_ENTER_GETEVENTS);
    195220                }
     
    199224                /* paranoid */ verify( ! __preemption_enabled() );
    200225
    201                 __atomic_unlock(&ctx->cq.lock);
    202 
     226                // everything is drained, we can release the lock
     227                __atomic_unlock(&ctx->cq.try_lock);
     228
     229                // update the relaxed timestamp
    203230                touch_tsc( cltr->sched.io.tscs, ctx->cq.id, ts_prev, ts_next, false );
    204231
     
    206233        }
    207234
     235        // call from a processor to flush
     236        // contains all the bookkeeping a proc must do, not just the barebones flushing logic
     237        void __cfa_do_flush( io_context$ & ctx, bool kernel ) {
     238                /* paranoid */ verify( ! __preemption_enabled() );
     239
     240                // flush any external requests
     241                ctx.sq.last_external = false; // clear the external bit, the arbiter will reset it if needed
     242                __ioarbiter_flush( ctx, kernel );
     243
      244                // if anything must be submitted, do the system call
     245                if(ctx.sq.to_submit != 0) {
     246                        ioring_syscsll(ctx, 0, 0);
     247                }
     248        }
     249
     250        // call from a processor to drain
     251        // contains all the bookkeeping a proc must do, not just the barebones draining logic
    208252        bool __cfa_io_drain( struct processor * proc ) {
    209253                bool local = false;
    210254                bool remote = false;
    211255
     256                // make sure no ones creates/destroys io contexts
    212257                ready_schedule_lock();
    213258
     
    217262                /* paranoid */ verify( ctx );
    218263
     264                // Help if needed
    219265                with(cltr->sched) {
    220266                        const size_t ctxs_count = io.count;
     
    230276                        const unsigned long long ctsc = rdtscl();
    231277
     278                        // only help once every other time
     279                        // pick a target when not helping
    232280                        if(proc->io.target == UINT_MAX) {
    233281                                uint64_t chaos = __tls_rand();
     282                                // choose who to help and whether to accept helping far processors
    234283                                unsigned ext = chaos & 0xff;
    235284                                unsigned other  = (chaos >> 8) % (ctxs_count);
    236285
     286                                // if the processor is on the same cache line or is lucky ( 3 out of 256 odds ) help it
    237287                                if(ext < 3 || __atomic_load_n(&caches[other / __shard_factor.io].id, __ATOMIC_RELAXED) == this_cache) {
    238288                                        proc->io.target = other;
     
    240290                        }
    241291                        else {
     292                                // a target was picked last time, help it
    242293                                const unsigned target = proc->io.target;
    243294                                /* paranoid */ verify( io.tscs[target].t.tv != ULLONG_MAX );
     295                                // make sure the target hasn't stopped existing since last time
    244296                                HELP: if(target < ctxs_count) {
      297                                        // calculate its age and how young it could be before we give up on helping
    245298                                        const __readyQ_avg_t cutoff = calc_cutoff(ctsc, ctx->cq.id, ctxs_count, io.data, io.tscs, __shard_factor.io, false);
    246299                                        const __readyQ_avg_t age = moving_average(ctsc, io.tscs[target].t.tv, io.tscs[target].t.ma, false);
    247300                                        __cfadbg_print_safe(io, "Kernel I/O: Help attempt on %u from %u, age %'llu vs cutoff %'llu, %s\n", target, ctx->cq.id, age, cutoff, age > cutoff ? "yes" : "no");
     301                                        // is the target older than the cutoff, recall 0 is oldest and bigger ints are younger
    248302                                        if(age <= cutoff) break HELP;
    249303
    250                                         if(!try_acquire(io.data[target])) break HELP;
    251 
     304                                        // attempt to help the submission side
     305                                        __cfa_do_flush( *io.data[target], true );
     306
     307                                        // attempt to help the completion side
     308                                        if(!try_acquire(io.data[target])) break HELP; // already acquire no help needed
     309
     310                                        // actually help
    252311                                        if(!__cfa_do_drain( io.data[target], cltr )) break HELP;
    253312
     313                                        // track we did help someone
    254314                                        remote = true;
    255315                                        __STATS__( true, io.calls.helped++; )
    256316                                }
     317
     318                                // reset the target
    257319                                proc->io.target = UINT_MAX;
    258320                        }
    259321                }
    260 
    261322
    262323                // Drain the local queue
     
    270331
    271332                ready_schedule_unlock();
     333
     334                // return true if some completion entry, local or remote, was drained
    272335                return local || remote;
    273336        }
    274337
     338
     339
     340        // call from a processor to flush
     341        // contains all the bookkeeping a proc must do, not just the barebones flushing logic
    275342        bool __cfa_io_flush( struct processor * proc ) {
    276343                /* paranoid */ verify( ! __preemption_enabled() );
     
    278345                /* paranoid */ verify( proc->io.ctx );
    279346
    280                 io_context$ & ctx = *proc->io.ctx;
    281 
    282                 __ioarbiter_flush( ctx );
    283 
    284                 if(ctx.sq.to_submit != 0) {
    285                         ioring_syscsll(ctx, 0, 0);
    286 
    287                 }
    288 
     347                __cfa_do_flush( *proc->io.ctx, false );
     348
     349                // also drain since some stuff will immediately complete
    289350                return __cfa_io_drain( proc );
    290351        }
     
    393454        //=============================================================================================
    394455        // submission
    395         static inline void __submit_only( struct io_context$ * ctx, __u32 idxs[], __u32 have) {
     456        // barebones logic to submit a group of sqes
     457        static inline void __submit_only( struct io_context$ * ctx, __u32 idxs[], __u32 have, bool lock) {
     458                if(!lock)
     459                        lock( ctx->ext_sq.lock __cfaabi_dbg_ctx2 );
    396460                // We can proceed to the fast path
    397461                // Get the right objects
     
    408472                // Make the sqes visible to the submitter
    409473                __atomic_store_n(sq.kring.tail, tail + have, __ATOMIC_RELEASE);
    410                 sq.to_submit += have;
    411 
     474                __atomic_fetch_add(&sq.to_submit, have, __ATOMIC_SEQ_CST);
     475
     476                // set the bit to mark things need to be flushed
    412477                __atomic_store_n(&ctx->proc->io.pending, true, __ATOMIC_RELAXED);
    413478                __atomic_store_n(&ctx->proc->io.dirty  , true, __ATOMIC_RELAXED);
    414         }
    415 
     479
     480                if(!lock)
     481                        unlock( ctx->ext_sq.lock );
     482        }
     483
     484        // submission logic + maybe flushing
    416485        static inline void __submit( struct io_context$ * ctx, __u32 idxs[], __u32 have, bool lazy) {
    417486                __sub_ring_t & sq = ctx->sq;
    418                 __submit_only(ctx, idxs, have);
     487                __submit_only(ctx, idxs, have, false);
    419488
    420489                if(sq.to_submit > 30) {
     
    428497        }
    429498
     499        // call from a processor to flush
     500        // might require arbitration if the thread was migrated after the allocation
    430501        void cfa_io_submit( struct io_context$ * inctx, __u32 idxs[], __u32 have, bool lazy ) __attribute__((nonnull (1))) libcfa_public {
    431502                // __cfadbg_print_safe(io, "Kernel I/O : attempting to submit %u (%s)\n", have, lazy ? "lazy" : "eager");
     
    441512                if( ctx == inctx )              // We have the right instance?
    442513                {
     514                        // yes! fast submit
    443515                        __submit(ctx, idxs, have, lazy);
    444516
     
    507579                __atomic_store_n(&ctx.sq.free_ring.tail, ftail + count, __ATOMIC_SEQ_CST);
    508580
     581                // notify the allocator that new allocations can be made
    509582                __ioarbiter_notify(ctx);
    510583
     
    557630        }
    558631
     632        // notify the arbiter that new allocations are available
    559633        static void __ioarbiter_notify( io_arbiter$ & this, io_context$ * ctx ) {
    560634                /* paranoid */ verify( !empty(this.pending.queue) );
    561 
     635                /* paranoid */ verify( __preemption_enabled() );
     636
     637                // mutual exclusion is needed
    562638                lock( this.pending.lock __cfaabi_dbg_ctx2 );
    563639                {
     640                        __cfadbg_print_safe(io, "Kernel I/O : notifying\n");
     641
     642                        // as long as there are pending allocations try to satisfy them
     643                        // for simplicity do it in FIFO order
    564644                        while( !empty(this.pending.queue) ) {
    565                                 __cfadbg_print_safe(io, "Kernel I/O : notifying\n");
     645                                // get first pending allocs
    566646                                __u32 have = ctx->sq.free_ring.tail - ctx->sq.free_ring.head;
    567647                                __pending_alloc & pa = (__pending_alloc&)head( this.pending.queue );
    568648
     649                                // check if we have enough to satisfy the request
    569650                                if( have > pa.want ) goto DONE;
     651
     652                                // if there are enough allocations it means we can drop the request
    570653                                drop( this.pending.queue );
    571654
    572655                                /* paranoid */__attribute__((unused)) bool ret =
    573656
     657                                // actually do the alloc
    574658                                __alloc(ctx, pa.idxs, pa.want);
    575659
    576660                                /* paranoid */ verify( ret );
    577661
      662                                // write out which context satisfied the request and post
     663                                // this
    578664                                pa.ctx = ctx;
    579 
    580665                                post( pa.waitctx );
    581666                        }
     
    585670                }
    586671                unlock( this.pending.lock );
    587         }
    588 
     672
     673                /* paranoid */ verify( __preemption_enabled() );
     674        }
     675
     676        // short hand to avoid the mutual exclusion of the pending is empty regardless
    589677        static void __ioarbiter_notify( io_context$ & ctx ) {
    590                 if(!empty( ctx.arbiter->pending )) {
    591                         __ioarbiter_notify( *ctx.arbiter, &ctx );
    592                 }
    593         }
    594 
    595         // Simply append to the pending
     678                if(empty( ctx.arbiter->pending )) return;
     679                __ioarbiter_notify( *ctx.arbiter, &ctx );
     680        }
     681
     682        // Submit from outside the local processor: append to the outstanding list
    596683        static void __ioarbiter_submit( io_context$ * ctx, __u32 idxs[], __u32 have, bool lazy ) {
    597684                __cfadbg_print_safe(io, "Kernel I/O : submitting %u from the arbiter to context %u\n", have, ctx->fd);
     
    599686                __cfadbg_print_safe(io, "Kernel I/O : waiting to submit %u\n", have);
    600687
     688                // create the intrusive object to append
    601689                __external_io ei;
    602690                ei.idxs = idxs;
     
    604692                ei.lazy = lazy;
    605693
     694                // enqueue the io
    606695                bool we = enqueue(ctx->ext_sq, (__outstanding_io&)ei);
    607696
     697                // mark pending
    608698                __atomic_store_n(&ctx->proc->io.pending, true, __ATOMIC_SEQ_CST);
    609699
     700                // if this is the first to be enqueued, signal the processor in an attempt to speed up flushing
     701                // if it's not the first enqueue, a signal is already in transit
    610702                if( we ) {
    611703                        sigval_t value = { PREEMPT_IO };
    612704                        __cfaabi_pthread_sigqueue(ctx->proc->kernel_thread, SIGUSR1, value);
    613                 }
    614 
     705                        __STATS__( false, io.flush.signal += 1; )
     706                }
     707                __STATS__( false, io.submit.extr += 1; )
     708
     709                // to avoid dynamic allocation/memory reclamation headaches, wait for it to have been submitted
    615710                wait( ei.waitctx );
    616711
     
    618713        }
    619714
    620         static void __ioarbiter_flush( io_context$ & ctx ) {
    621                 if(!empty( ctx.ext_sq )) {
    622                         __STATS__( false, io.flush.external += 1; )
    623 
    624                         __cfadbg_print_safe(io, "Kernel I/O : arbiter flushing\n");
    625 
    626                         lock( ctx.ext_sq.lock __cfaabi_dbg_ctx2 );
    627                         {
    628                                 while( !empty(ctx.ext_sq.queue) ) {
    629                                         __external_io & ei = (__external_io&)drop( ctx.ext_sq.queue );
    630 
    631                                         __submit_only(&ctx, ei.idxs, ei.have);
    632 
    633                                         post( ei.waitctx );
    634                                 }
    635 
    636                                 ctx.ext_sq.empty = true;
     715        // flush the io arbiter: move all external io operations to the submission ring
     716        static void __ioarbiter_flush( io_context$ & ctx, bool kernel ) {
     717                // if there are no external operations just return
     718                if(empty( ctx.ext_sq )) return;
     719
     720                // stats and logs
     721                __STATS__( false, io.flush.external += 1; )
     722                __cfadbg_print_safe(io, "Kernel I/O : arbiter flushing\n");
     723
     724                // this can happen from multiple processors, mutual exclusion is needed
     725                lock( ctx.ext_sq.lock __cfaabi_dbg_ctx2 );
     726                {
     727                        // pop each operation one at a time.
     728                        // There is no wait morphing because of the io sq ring
     729                        while( !empty(ctx.ext_sq.queue) ) {
     730                                // drop the element from the queue
     731                                __external_io & ei = (__external_io&)drop( ctx.ext_sq.queue );
     732
     733                                // submit it
     734                                __submit_only(&ctx, ei.idxs, ei.have, true);
     735
     736                                // wake the thread that was waiting on it
     737                                // since this can both be called from kernel and user, check the flag before posting
     738                                __post( ei.waitctx, kernel, UNPARK_LOCAL );
    637739                        }
    638                         unlock(ctx.ext_sq.lock );
     740
     741                        // mark the queue as empty
     742                        ctx.ext_sq.empty = true;
     743                        ctx.sq.last_external = true;
     744                }
     745                unlock(ctx.ext_sq.lock );
     746        }
     747
     748        extern "C" {
     749                // debug functions used for gdb
      750                // io_uring doesn't yet support gdb so the kernel-shared data structures aren't viewable in gdb
     751                // these functions read the data that gdb can't and should be removed once the support is added
     752                static __u32 __cfagdb_cq_head( io_context$ * ctx ) __attribute__((nonnull(1),used,noinline)) { return *ctx->cq.head; }
     753                static __u32 __cfagdb_cq_tail( io_context$ * ctx ) __attribute__((nonnull(1),used,noinline)) { return *ctx->cq.tail; }
     754                static __u32 __cfagdb_cq_mask( io_context$ * ctx ) __attribute__((nonnull(1),used,noinline)) { return *ctx->cq.mask; }
     755                static __u32 __cfagdb_sq_head( io_context$ * ctx ) __attribute__((nonnull(1),used,noinline)) { return *ctx->sq.kring.head; }
     756                static __u32 __cfagdb_sq_tail( io_context$ * ctx ) __attribute__((nonnull(1),used,noinline)) { return *ctx->sq.kring.tail; }
     757                static __u32 __cfagdb_sq_mask( io_context$ * ctx ) __attribute__((nonnull(1),used,noinline)) { return *ctx->sq.mask; }
     758
     759                // fancier version that reads an sqe and copies it out.
     760                static struct io_uring_sqe __cfagdb_sq_at( io_context$ * ctx, __u32 at ) __attribute__((nonnull(1),used,noinline)) {
     761                        __u32 ax = at & *ctx->sq.mask;
     762                        __u32 ix = ctx->sq.kring.array[ax];
     763                        return ctx->sq.sqes[ix];
    639764                }
    640765        }
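
Several of the io.cfa changes above hinge on the completion ring's lock being renamed to try_lock and only ever taken with __atomic_try_acquire: a processor that loses the race simply skips draining (or goes to help another ring) rather than waiting. A rough stand-alone equivalent of that handshake using plain GCC builtins (the helper names below are made up; the runtime's __atomic_try_acquire/__atomic_unlock are assumed to behave similarly):

    // Non-blocking acquire: a cheap relaxed read first, then one test-and-set.
    // Losing the race is not an error; the caller just moves on.
    static inline bool cq_try_acquire( volatile bool * try_lock ) {
            if ( __atomic_load_n( try_lock, __ATOMIC_RELAXED ) ) return false;   // someone else is draining
            return ! __atomic_test_and_set( (void *)try_lock, __ATOMIC_ACQUIRE );
    }

    static inline void cq_release( volatile bool * try_lock ) {
            __atomic_clear( (void *)try_lock, __ATOMIC_RELEASE );
    }
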
  • libcfa/src/concurrency/io/call.cfa.in

    r34b4268 r24d6572  
    3131Prelude = """#define __cforall_thread__
    3232
     33#include <unistd.h>
     34#include <errno.h>
     35#include <sys/socket.h>
     36#include <time.hfa>
     37
    3338#include "bits/defs.hfa"
    3439#include "kernel.hfa"
     
    4348        #include <assert.h>
    4449        #include <stdint.h>
    45         #include <errno.h>
    4650        #include <linux/io_uring.h>
    47 
    4851        #include "kernel/fwd.hfa"
    4952
     
    8285// I/O Forwards
    8386//=============================================================================================
    84 #include <time.hfa>
    85 
    86 // Some forward declarations
    87 #include <errno.h>
    88 #include <unistd.h>
    8987
    9088extern "C" {
    91         #include <asm/types.h>
    92         #include <sys/socket.h>
    93         #include <sys/syscall.h>
    94 
    9589#if defined(CFA_HAVE_PREADV2)
    9690        struct iovec;
    97         extern ssize_t preadv2 (int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
     91        extern ssize_t preadv2 (int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags);
    9892#endif
    9993#if defined(CFA_HAVE_PWRITEV2)
    10094        struct iovec;
    101         extern ssize_t pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
     95        extern ssize_t pwritev2(int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags);
    10296#endif
    10397
     
    114108        struct msghdr;
    115109        struct sockaddr;
    116         extern ssize_t sendmsg(int sockfd, const struct msghdr *msg, int flags);
    117         extern ssize_t recvmsg(int sockfd, struct msghdr *msg, int flags);
    118         extern ssize_t send(int sockfd, const void *buf, size_t len, int flags);
    119         extern ssize_t recv(int sockfd, void *buf, size_t len, int flags);
    120         extern int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
    121         extern int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
     110        extern ssize_t sendmsg(int sockfd, const struct msghdr * msg, int flags);
     111        extern ssize_t recvmsg(int sockfd, struct msghdr * msg, int flags);
     112        extern ssize_t send(int sockfd, const void * buf, size_t len, int flags);
     113        extern ssize_t recv(int sockfd, void * buf, size_t len, int flags);
    122114
    123115        extern int fallocate(int fd, int mode, off_t offset, off_t len);
    124116        extern int posix_fadvise(int fd, off_t offset, off_t len, int advice);
    125         extern int madvise(void *addr, size_t length, int advice);
    126 
    127         extern int openat(int dirfd, const char *pathname, int flags, mode_t mode);
     117        extern int madvise(void * addr, size_t length, int advice);
     118
     119        extern int openat(int dirfd, const char * pathname, int flags, mode_t mode);
    128120        extern int close(int fd);
    129121
    130         extern ssize_t read (int fd, void *buf, size_t count);
     122        extern ssize_t read (int fd, void * buf, size_t count);
    131123
    132124        struct epoll_event;
    133         extern int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
    134 
    135         extern ssize_t splice(int fd_in, __off64_t *off_in, int fd_out, __off64_t *off_out, size_t len, unsigned int flags);
     125        extern int epoll_ctl(int epfd, int op, int fd, struct epoll_event * event);
     126
     127        extern ssize_t splice(int fd_in, __off64_t * off_in, int fd_out, __off64_t * off_out, size_t len, unsigned int flags);
    136128        extern ssize_t tee(int fd_in, int fd_out, size_t len, unsigned int flags);
    137129}
     
    232224calls = [
    233225        # CFA_HAVE_IORING_OP_READV
    234         Call('READV', 'ssize_t preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags)', {
     226        Call('READV', 'ssize_t preadv2(int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags)', {
    235227                'fd'  : 'fd',
     228                'addr': '(typeof(sqe->addr))iov',
     229                'len' : 'iovcnt',
    236230                'off' : 'offset',
    237                 'addr': '(uintptr_t)iov',
    238                 'len' : 'iovcnt',
     231                'rw_flags' : 'flags'
    239232        }, define = 'CFA_HAVE_PREADV2'),
    240233        # CFA_HAVE_IORING_OP_WRITEV
    241         Call('WRITEV', 'ssize_t pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags)', {
     234        Call('WRITEV', 'ssize_t pwritev2(int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags)', {
    242235                'fd'  : 'fd',
     236                'addr': '(typeof(sqe->addr))iov',
     237                'len' : 'iovcnt',
    243238                'off' : 'offset',
    244                 'addr': '(uintptr_t)iov',
    245                 'len' : 'iovcnt'
     239                'rw_flags' : 'flags'
    246240        }, define = 'CFA_HAVE_PWRITEV2'),
    247241        # CFA_HAVE_IORING_OP_FSYNC
     
    250244        }),
    251245        # CFA_HAVE_IORING_OP_EPOLL_CTL
    252         Call('EPOLL_CTL', 'int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event)', {
     246        Call('EPOLL_CTL', 'int epoll_ctl(int epfd, int op, int fd, struct epoll_event * event)', {
    253247                'fd': 'epfd',
     248                'len': 'op',
    254249                'addr': 'fd',
    255                 'len': 'op',
    256                 'off': '(uintptr_t)event'
     250                'off': '(typeof(sqe->off))event'
    257251        }),
    258252        # CFA_HAVE_IORING_OP_SYNC_FILE_RANGE
     
    264258        }),
    265259        # CFA_HAVE_IORING_OP_SENDMSG
    266         Call('SENDMSG', 'ssize_t sendmsg(int sockfd, const struct msghdr *msg, int flags)', {
    267                 'fd': 'sockfd',
    268                 'addr': '(uintptr_t)(struct msghdr *)msg',
     260        Call('SENDMSG', 'ssize_t sendmsg(int sockfd, const struct msghdr * msg, int flags)', {
     261                'fd': 'sockfd',
     262                'addr': '(typeof(sqe->addr))(struct msghdr *)msg',
    269263                'len': '1',
    270264                'msg_flags': 'flags'
    271265        }),
    272266        # CFA_HAVE_IORING_OP_RECVMSG
    273         Call('RECVMSG', 'ssize_t recvmsg(int sockfd, struct msghdr *msg, int flags)', {
    274                 'fd': 'sockfd',
    275                 'addr': '(uintptr_t)(struct msghdr *)msg',
     267        Call('RECVMSG', 'ssize_t recvmsg(int sockfd, struct msghdr * msg, int flags)', {
     268                'fd': 'sockfd',
     269                'addr': '(typeof(sqe->addr))(struct msghdr *)msg',
    276270                'len': '1',
    277271                'msg_flags': 'flags'
    278272        }),
    279273        # CFA_HAVE_IORING_OP_SEND
    280         Call('SEND', 'ssize_t send(int sockfd, const void *buf, size_t len, int flags)', {
    281                 'fd': 'sockfd',
    282                 'addr': '(uintptr_t)buf',
     274        Call('SEND', 'ssize_t send(int sockfd, const void * buf, size_t len, int flags)', {
     275                'fd': 'sockfd',
     276                'addr': '(typeof(sqe->addr))buf',
    283277                'len': 'len',
    284278                'msg_flags': 'flags'
    285279        }),
    286280        # CFA_HAVE_IORING_OP_RECV
    287         Call('RECV', 'ssize_t recv(int sockfd, void *buf, size_t len, int flags)', {
    288                 'fd': 'sockfd',
    289                 'addr': '(uintptr_t)buf',
     281        Call('RECV', 'ssize_t recv(int sockfd, void * buf, size_t len, int flags)', {
     282                'fd': 'sockfd',
     283                'addr': '(typeof(sqe->addr))buf',
    290284                'len': 'len',
    291285                'msg_flags': 'flags'
    292286        }),
    293287        # CFA_HAVE_IORING_OP_ACCEPT
    294         Call('ACCEPT', 'int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags)', {
    295                 'fd': 'sockfd',
    296                 'addr': '(uintptr_t)addr',
    297                 'addr2': '(uintptr_t)addrlen',
     288        Call('ACCEPT', 'int accept4(int sockfd, __SOCKADDR_ARG addr, socklen_t * restrict addrlen, int flags)', {
     289                'fd': 'sockfd',
     290                'addr': '(typeof(sqe->addr))&addr',
     291                'addr2': '(typeof(sqe->addr2))addrlen',
    298292                'accept_flags': 'flags'
    299293        }),
    300294        # CFA_HAVE_IORING_OP_CONNECT
    301         Call('CONNECT', 'int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen)', {
    302                 'fd': 'sockfd',
    303                 'addr': '(uintptr_t)addr',
     295        Call('CONNECT', 'int connect(int sockfd, __CONST_SOCKADDR_ARG addr, socklen_t addrlen)', {
     296                'fd': 'sockfd',
     297                'addr': '(typeof(sqe->addr))&addr',
    304298                'off': 'addrlen'
    305299        }),
     
    307301        Call('FALLOCATE', 'int fallocate(int fd, int mode, off_t offset, off_t len)', {
    308302                'fd': 'fd',
    309                 'addr': '(uintptr_t)len',
    310303                'len': 'mode',
    311                 'off': 'offset'
     304                'off': 'offset',
     305                'addr': 'len'
    312306        }),
    313307        # CFA_HAVE_IORING_OP_FADVISE
     
    319313        }),
    320314        # CFA_HAVE_IORING_OP_MADVISE
    321         Call('MADVISE', 'int madvise(void *addr, size_t length, int advice)', {
    322                 'addr': '(uintptr_t)addr',
     315        Call('MADVISE', 'int madvise(void * addr, size_t length, int advice)', {
     316                'addr': '(typeof(sqe->addr))addr',
    323317                'len': 'length',
    324318                'fadvise_advice': 'advice'
    325319        }),
    326320        # CFA_HAVE_IORING_OP_OPENAT
    327         Call('OPENAT', 'int openat(int dirfd, const char *pathname, int flags, mode_t mode)', {
     321        Call('OPENAT', 'int openat(int dirfd, const char * pathname, int flags, mode_t mode)', {
    328322                'fd': 'dirfd',
    329                 'addr': '(uintptr_t)pathname',
    330                 'len': 'mode',
    331                 'open_flags': 'flags;'
     323                'addr': '(typeof(sqe->addr))pathname',
     324                'open_flags': 'flags;',
     325                'len': 'mode'
    332326        }),
    333327        # CFA_HAVE_IORING_OP_OPENAT2
    334         Call('OPENAT2', 'int openat2(int dirfd, const char *pathname, struct open_how * how, size_t size)', {
     328        Call('OPENAT2', 'int openat2(int dirfd, const char * pathname, struct open_how * how, size_t size)', {
    335329                'fd': 'dirfd',
    336                 'addr': 'pathname',
    337                 'len': 'sizeof(*how)',
    338                 'off': '(uintptr_t)how',
     330                'addr': '(typeof(sqe->addr))pathname',
     331                'off': '(typeof(sqe->off))how',
     332                'len': 'sizeof(*how)'
    339333        }, define = 'CFA_HAVE_OPENAT2'),
    340334        # CFA_HAVE_IORING_OP_CLOSE
     
    343337        }),
    344338        # CFA_HAVE_IORING_OP_STATX
    345         Call('STATX', 'int statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf)', {
     339        Call('STATX', 'int statx(int dirfd, const char * pathname, int flags, unsigned int mask, struct statx * statxbuf)', {
    346340                'fd': 'dirfd',
    347                 'off': '(uintptr_t)statxbuf',
    348                 'addr': 'pathname',
     341                'addr': '(typeof(sqe->addr))pathname',
     342                'statx_flags': 'flags',
    349343                'len': 'mask',
    350                 'statx_flags': 'flags'
     344                'off': '(typeof(sqe->off))statxbuf'
    351345        }, define = 'CFA_HAVE_STATX'),
    352346        # CFA_HAVE_IORING_OP_READ
    353347        Call('READ', 'ssize_t read(int fd, void * buf, size_t count)', {
    354348                'fd': 'fd',
    355                 'addr': '(uintptr_t)buf',
     349                'addr': '(typeof(sqe->addr))buf',
    356350                'len': 'count'
    357351        }),
     
    359353        Call('WRITE', 'ssize_t write(int fd, void * buf, size_t count)', {
    360354                'fd': 'fd',
    361                 'addr': '(uintptr_t)buf',
     355                'addr': '(typeof(sqe->addr))buf',
    362356                'len': 'count'
    363357        }),
    364358        # CFA_HAVE_IORING_OP_SPLICE
    365         Call('SPLICE', 'ssize_t splice(int fd_in, __off64_t *off_in, int fd_out, __off64_t *off_out, size_t len, unsigned int flags)', {
     359        Call('SPLICE', 'ssize_t splice(int fd_in, __off64_t * off_in, int fd_out, __off64_t * off_out, size_t len, unsigned int flags)', {
    366360                'splice_fd_in': 'fd_in',
    367                 'splice_off_in': 'off_in ? (__u64)*off_in : (__u64)-1',
     361                'splice_off_in': 'off_in ? (typeof(sqe->splice_off_in))*off_in : (typeof(sqe->splice_off_in))-1',
    368362                'fd': 'fd_out',
    369                 'off': 'off_out ? (__u64)*off_out : (__u64)-1',
     363                'off': 'off_out ? (typeof(sqe->off))*off_out : (typeof(sqe->off))-1',
    370364                'len': 'len',
    371365                'splice_flags': 'flags'
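
Each Call(...) entry above pairs a POSIX-style prototype with the io_uring sqe fields the wrapper must fill. As a rough hand-written C sketch (an illustration, not the generator's actual output), the READ entry corresponds to preparing a submission along these lines:

    #include <linux/io_uring.h>   /* struct io_uring_sqe, IORING_OP_READ */
    #include <stddef.h>
    #include <string.h>

    /* Hypothetical helper, for illustration only: fill an sqe the way the READ
       entry above describes (fd -> sqe->fd, buf -> sqe->addr, count -> sqe->len). */
    static void sketch_prep_read( struct io_uring_sqe * sqe, int fd, void * buf, size_t count ) {
            memset( sqe, 0, sizeof(*sqe) );
            sqe->opcode = IORING_OP_READ;
            sqe->fd     = fd;
            sqe->addr   = (typeof(sqe->addr))buf;
            sqe->len    = count;
    }
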
  • libcfa/src/concurrency/io/setup.cfa

    r34b4268 r24d6572  
    1515
    1616#define __cforall_thread__
    17 #define _GNU_SOURCE
    1817
    1918#if defined(__CFA_DEBUG__)
     
    216215
    217216                // completion queue
    218                 cq.lock      = false;
     217                cq.try_lock  = false;
    219218                cq.id        = MAX;
    220219                cq.ts        = rdtscl();
  • libcfa/src/concurrency/io/types.hfa

    r34b4268 r24d6572  
    3737        //-----------------------------------------------------------------------
    3838        // Ring Data structure
    39       struct __sub_ring_t {
      39        // represents the io_uring submission ring, which contains operations that will be sent to io_uring for processing
     40        struct __sub_ring_t {
     41                // lock needed because remote processors might need to flush the instance
     42                __spinlock_t lock;
     43
    4044                struct {
    4145                        // Head and tail of the ring (associated with array)
     
    5862
    5963                // number of sqes to submit on next system call.
    60                 __u32 to_submit;
     64                volatile __u32 to_submit;
    6165
    6266                // number of entries and mask to go with it
     
    7781                void * ring_ptr;
    7882                size_t ring_sz;
    79         };
    80 
     83
      84                // for debug purposes, whether or not the last flush was due to an arbiter flush
     85                bool last_external;
     86        };
     87
      88        // represents the io_uring completion ring, which contains operations that have completed
    8189        struct __cmp_ring_t {
    82                 volatile bool lock;
    83 
     90                // needed because remote processors can help drain the buffer
     91                volatile bool try_lock;
     92
     93                // id of the ring, used for the helping/topology algorithms
    8494                unsigned id;
    8595
     96                // timestamp from last time it was drained
    8697                unsigned long long ts;
    8798
     
    105116        };
    106117
     118        // struct representing an io operation that still needs processing
     119        // actual operations are expected to inherit from this
    107120        struct __outstanding_io {
     121                // intrusive link fields
    108122                inline Colable;
     123
     124                // primitive on which to block until the io is processed
    109125                oneshot waitctx;
    110126        };
    111127        static inline __outstanding_io *& Next( __outstanding_io * n ) { return (__outstanding_io *)Next( (Colable *)n ); }
    112128
     129        // queue of operations that are outstanding
    113130        struct __outstanding_io_queue {
     131                // spinlock for protection
     132                // TODO: changing to a lock that blocks, I haven't examined whether it should be a kernel or user lock
    114133                __spinlock_t lock;
     134
     135                // the actual queue
    115136                Queue(__outstanding_io) queue;
     137
     138                // volatile used to avoid the need for taking the lock if it's empty
    116139                volatile bool empty;
    117140        };
    118141
     142        // struct representing an operation that was submitted
    119143        struct __external_io {
     144                // inherits from outstanding io
    120145                inline __outstanding_io;
     146
     147                // pointer and count to an array of ids to be submitted
    121148                __u32 * idxs;
    122149                __u32 have;
     150
     151                // whether or not these can be accumulated before flushing the buffer
    123152                bool lazy;
    124153        };
    125154
    126 
     155        // complete io_context, contains all the data for io submission and completion
    127156        struct __attribute__((aligned(64))) io_context$ {
      157                // arbiter, used in cases where threads are migrated at unfortunate moments
    128158                io_arbiter$ * arbiter;
     159
      160                // which processor the context is tied to
    129161                struct processor * proc;
    130162
      163                // queue of io submissions that haven't been processed.
    131164                __outstanding_io_queue ext_sq;
    132165
     166                // io_uring ring data structures
    133167                struct __sub_ring_t sq;
    134168                struct __cmp_ring_t cq;
     169
      170                // flags the io_uring rings were created with
    135171                __u32 ring_flags;
     172
     173                // file descriptor that identifies the io_uring instance
    136174                int fd;
    137175        };
    138176
     177        // short hand to check when the io_context was last processed (io drained)
    139178        static inline unsigned long long ts(io_context$ *& this) {
    140179                const __u32 head = *this->cq.head;
    141180                const __u32 tail = *this->cq.tail;
    142181
      182                // if there are no pending completions, just pretend it's infinitely recent
    143183                if(head == tail) return ULLONG_MAX;
    144184
     
    146186        }
    147187
      188        // structure representing allocations that couldn't succeed locally
    148189        struct __pending_alloc {
     190                // inherit from outstanding io
    149191                inline __outstanding_io;
     192
     193                // array and size of the desired allocation
    150194                __u32 * idxs;
    151195                __u32 want;
     196
     197                // output param, the context the io was allocated from
    152198                io_context$ * ctx;
    153199        };
    154200
     201        // arbiter that handles cases where the context tied to the local processor is unable to satisfy the io
    155202        monitor __attribute__((aligned(64))) io_arbiter$ {
     203                // contains a queue of io for pending allocations
    156204                __outstanding_io_queue pending;
    157205        };
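
The volatile empty flag documented above exists so that callers can skip the spinlock entirely when the queue holds nothing. A self-contained sketch of that check-then-lock pattern, using hypothetical stand-in types rather than the runtime's own:

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Hypothetical stand-in for __outstanding_io_queue, illustration only:
       a counter guarded by a spinlock, plus a volatile `empty` flag so the
       common no-work case never touches the lock. */
    struct demo_queue {
            atomic_flag lock;
            int pending;
            volatile bool empty;
    };

    static int demo_drain( struct demo_queue * q ) {
            if ( q->empty ) return 0;        /* fast path: flag read without the lock */
            while ( atomic_flag_test_and_set_explicit( &q->lock, memory_order_acquire ) ) {}
            int n = q->pending;              /* drain under the lock */
            q->pending = 0;
            q->empty = true;
            atomic_flag_clear_explicit( &q->lock, memory_order_release );
            return n;
    }
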
  • libcfa/src/concurrency/iofwd.hfa

    r34b4268 r24d6572  
    99// Author           : Thierry Delisle
    1010// Created On       : Thu Apr 23 17:31:00 2020
    11 // Last Modified By :
    12 // Last Modified On :
    13 // Update Count     :
     11// Last Modified By : Peter A. Buhr
     12// Last Modified On : Mon Mar 13 23:54:57 2023
     13// Update Count     : 1
    1414//
    1515
     
    1717
    1818#include <unistd.h>
     19#include <sys/socket.h>
     20
    1921extern "C" {
    2022        #include <asm/types.h>
     
    4850typedef __off64_t off64_t;
    4951
    50 struct cluster;
    51 struct io_context$;
    52 
    53 struct iovec;
    54 struct msghdr;
    55 struct sockaddr;
    56 struct statx;
    5752struct epoll_event;
    58 
    59 struct io_uring_sqe;
    6053
    6154//-----------------------------------------------------------------------
     
    8881// synchronous calls
    8982#if defined(CFA_HAVE_PREADV2)
    90         extern ssize_t cfa_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);
     83        extern ssize_t cfa_preadv2(int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);
    9184#endif
    9285#if defined(CFA_HAVE_PWRITEV2)
    93         extern ssize_t cfa_pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);
     86        extern ssize_t cfa_pwritev2(int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);
    9487#endif
    9588extern int cfa_fsync(int fd, __u64 submit_flags);
    96 extern int cfa_epoll_ctl(int epfd, int op, int fd, struct epoll_event *event, __u64 submit_flags);
     89extern int cfa_epoll_ctl(int epfd, int op, int fd, struct epoll_event * event, __u64 submit_flags);
    9790extern int cfa_sync_file_range(int fd, off64_t offset, off64_t nbytes, unsigned int flags, __u64 submit_flags);
    98 extern  ssize_t cfa_sendmsg(int sockfd, const struct msghdr *msg, int flags, __u64 submit_flags);
    99 extern ssize_t cfa_recvmsg(int sockfd, struct msghdr *msg, int flags, __u64 submit_flags);
    100 extern ssize_t cfa_send(int sockfd, const void *buf, size_t len, int flags, __u64 submit_flags);
    101 extern ssize_t cfa_recv(int sockfd, void *buf, size_t len, int flags, __u64 submit_flags);
    102 extern int cfa_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags, __u64 submit_flags);
    103 extern int cfa_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen, __u64 submit_flags);
     91extern  ssize_t cfa_sendmsg(int sockfd, const struct msghdr * msg, int flags, __u64 submit_flags);
     92extern ssize_t cfa_recvmsg(int sockfd, struct msghdr * msg, int flags, __u64 submit_flags);
     93extern ssize_t cfa_send(int sockfd, const void * buf, size_t len, int flags, __u64 submit_flags);
     94extern ssize_t cfa_recv(int sockfd, void * buf, size_t len, int flags, __u64 submit_flags);
     95extern int cfa_accept4(int sockfd, __SOCKADDR_ARG addr, socklen_t * restrict addrlen, int flags, __u64 submit_flags);
     96extern int cfa_connect(int sockfd, __CONST_SOCKADDR_ARG addr, socklen_t addrlen, __u64 submit_flags);
    10497extern int cfa_fallocate(int fd, int mode, off_t offset, off_t len, __u64 submit_flags);
    10598extern int cfa_posix_fadvise(int fd, off_t offset, off_t len, int advice, __u64 submit_flags);
    106 extern int cfa_madvise(void *addr, size_t length, int advice, __u64 submit_flags);
    107 extern int cfa_openat(int dirfd, const char *pathname, int flags, mode_t mode, __u64 submit_flags);
     99extern int cfa_madvise(void * addr, size_t length, int advice, __u64 submit_flags);
     100extern int cfa_openat(int dirfd, const char * pathname, int flags, mode_t mode, __u64 submit_flags);
    108101#if defined(CFA_HAVE_OPENAT2)
    109         extern int cfa_openat2(int dirfd, const char *pathname, struct open_how * how, size_t size, __u64 submit_flags);
     102        extern int cfa_openat2(int dirfd, const char * pathname, struct open_how * how, size_t size, __u64 submit_flags);
    110103#endif
    111104extern int cfa_close(int fd, __u64 submit_flags);
    112105#if defined(CFA_HAVE_STATX)
    113         extern int cfa_statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf, __u64 submit_flags);
     106        extern int cfa_statx(int dirfd, const char * pathname, int flags, unsigned int mask, struct statx * statxbuf, __u64 submit_flags);
    114107#endif
    115108extern ssize_t cfa_read(int fd, void * buf, size_t count, __u64 submit_flags);
    116109extern ssize_t cfa_write(int fd, void * buf, size_t count, __u64 submit_flags);
    117 extern ssize_t cfa_splice(int fd_in, __off64_t *off_in, int fd_out, __off64_t *off_out, size_t len, unsigned int flags, __u64 submit_flags);
     110extern ssize_t cfa_splice(int fd_in, __off64_t * off_in, int fd_out, __off64_t * off_out, size_t len, unsigned int flags, __u64 submit_flags);
    118111extern ssize_t cfa_tee(int fd_in, int fd_out, size_t len, unsigned int flags, __u64 submit_flags);
    119112
     
    121114// asynchronous calls
    122115#if defined(CFA_HAVE_PREADV2)
    123         extern void async_preadv2(io_future_t & future, int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);
     116        extern void async_preadv2(io_future_t & future, int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);
    124117#endif
    125118#if defined(CFA_HAVE_PWRITEV2)
    126         extern void async_pwritev2(io_future_t & future, int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);
     119        extern void async_pwritev2(io_future_t & future, int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);
    127120#endif
    128121extern void async_fsync(io_future_t & future, int fd, __u64 submit_flags);
    129 extern void async_epoll_ctl(io_future_t & future, int epfd, int op, int fd, struct epoll_event *event, __u64 submit_flags);
     122extern void async_epoll_ctl(io_future_t & future, int epfd, int op, int fd, struct epoll_event * event, __u64 submit_flags);
    130123extern void async_sync_file_range(io_future_t & future, int fd, off64_t offset, off64_t nbytes, unsigned int flags, __u64 submit_flags);
    131 extern void async_sendmsg(io_future_t & future, int sockfd, const struct msghdr *msg, int flags, __u64 submit_flags);
    132 extern void async_recvmsg(io_future_t & future, int sockfd, struct msghdr *msg, int flags, __u64 submit_flags);
    133 extern void async_send(io_future_t & future, int sockfd, const void *buf, size_t len, int flags, __u64 submit_flags);
    134 extern void async_recv(io_future_t & future, int sockfd, void *buf, size_t len, int flags, __u64 submit_flags);
    135 extern void async_accept4(io_future_t & future, int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags, __u64 submit_flags);
    136 extern void async_connect(io_future_t & future, int sockfd, const struct sockaddr *addr, socklen_t addrlen, __u64 submit_flags);
     124extern void async_sendmsg(io_future_t & future, int sockfd, const struct msghdr * msg, int flags, __u64 submit_flags);
     125extern void async_recvmsg(io_future_t & future, int sockfd, struct msghdr * msg, int flags, __u64 submit_flags);
     126extern void async_send(io_future_t & future, int sockfd, const void * buf, size_t len, int flags, __u64 submit_flags);
     127extern void async_recv(io_future_t & future, int sockfd, void * buf, size_t len, int flags, __u64 submit_flags);
     128extern void async_accept4(io_future_t & future, int sockfd, __SOCKADDR_ARG addr, socklen_t * restrict addrlen, int flags, __u64 submit_flags);
     129extern void async_connect(io_future_t & future, int sockfd, __CONST_SOCKADDR_ARG addr, socklen_t addrlen, __u64 submit_flags);
    137130extern void async_fallocate(io_future_t & future, int fd, int mode, off_t offset, off_t len, __u64 submit_flags);
    138131extern void async_posix_fadvise(io_future_t & future, int fd, off_t offset, off_t len, int advice, __u64 submit_flags);
    139 extern void async_madvise(io_future_t & future, void *addr, size_t length, int advice, __u64 submit_flags);
    140 extern void async_openat(io_future_t & future, int dirfd, const char *pathname, int flags, mode_t mode, __u64 submit_flags);
     132extern void async_madvise(io_future_t & future, void * addr, size_t length, int advice, __u64 submit_flags);
     133extern void async_openat(io_future_t & future, int dirfd, const char * pathname, int flags, mode_t mode, __u64 submit_flags);
    141134#if defined(CFA_HAVE_OPENAT2)
    142         extern void async_openat2(io_future_t & future, int dirfd, const char *pathname, struct open_how * how, size_t size, __u64 submit_flags);
     135        extern void async_openat2(io_future_t & future, int dirfd, const char * pathname, struct open_how * how, size_t size, __u64 submit_flags);
    143136#endif
    144137extern void async_close(io_future_t & future, int fd, __u64 submit_flags);
    145138#if defined(CFA_HAVE_STATX)
    146         extern void async_statx(io_future_t & future, int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf, __u64 submit_flags);
     139        extern void async_statx(io_future_t & future, int dirfd, const char * pathname, int flags, unsigned int mask, struct statx * statxbuf, __u64 submit_flags);
    147140#endif
    148141void async_read(io_future_t & future, int fd, void * buf, size_t count, __u64 submit_flags);
    149142extern void async_write(io_future_t & future, int fd, void * buf, size_t count, __u64 submit_flags);
    150 extern void async_splice(io_future_t & future, int fd_in, __off64_t *off_in, int fd_out, __off64_t *off_out, size_t len, unsigned int flags, __u64 submit_flags);
     143extern void async_splice(io_future_t & future, int fd_in, __off64_t * off_in, int fd_out, __off64_t * off_out, size_t len, unsigned int flags, __u64 submit_flags);
    151144extern void async_tee(io_future_t & future, int fd_in, int fd_out, size_t len, unsigned int flags, __u64 submit_flags);
    152145
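
The cfa_* entry points above mirror their POSIX counterparts but are serviced through io_uring and take an extra submit_flags argument. A minimal usage sketch, using a hypothetical copy_once helper and assuming two already-open file descriptors:

    /* Hypothetical helper, illustration only: copy one buffer's worth of data
       using the synchronous wrappers declared above (0 = no special submit flags). */
    static ssize_t copy_once( int fd_in, int fd_out ) {
            char buf[4096];
            ssize_t n = cfa_read( fd_in, buf, sizeof(buf), 0 );    // like read(2)
            if ( n <= 0 ) return n;
            return cfa_write( fd_out, buf, (size_t)n, 0 );         // like write(2)
    }
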
  • libcfa/src/concurrency/kernel.cfa

    r34b4268 r24d6572  
    1010// Created On       : Tue Jan 17 12:27:26 2017
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Wed Nov 30 18:14:08 2022
    13 // Update Count     : 76
     12// Last Modified On : Mon Jan  9 08:42:05 2023
     13// Update Count     : 77
    1414//
    1515
    1616#define __cforall_thread__
    17 #define _GNU_SOURCE
    1817
    1918// #define __CFA_DEBUG_PRINT_RUNTIME_CORE__
     
    258257                __cfadbg_print_safe(runtime_core, "Kernel : core %p stopping\n", this);
    259258        }
     259
     260        __cfa_io_flush( this );
     261        __cfa_io_drain( this );
    260262
    261263        post( this->terminated );
  • libcfa/src/concurrency/kernel/cluster.cfa

    r34b4268 r24d6572  
    1515
    1616#define __cforall_thread__
    17 #define _GNU_SOURCE
    1817
    1918#include "bits/defs.hfa"
     
    6968        return max_cores_l;
    7069}
    71 
    72 #if   defined(CFA_HAVE_LINUX_LIBRSEQ)
    73         // No forward declaration needed
    74         #define __kernel_rseq_register rseq_register_current_thread
    75         #define __kernel_rseq_unregister rseq_unregister_current_thread
    76 #elif defined(CFA_HAVE_LINUX_RSEQ_H)
    77         static void __kernel_raw_rseq_register  (void);
    78         static void __kernel_raw_rseq_unregister(void);
    79 
    80         #define __kernel_rseq_register __kernel_raw_rseq_register
    81         #define __kernel_rseq_unregister __kernel_raw_rseq_unregister
    82 #else
    83         // No forward declaration needed
    84         // No initialization needed
    85         static inline void noop(void) {}
    86 
    87         #define __kernel_rseq_register noop
    88         #define __kernel_rseq_unregister noop
    89 #endif
    9070
    9171//=======================================================================
     
    11191// Lock-Free registering/unregistering of threads
    11292unsigned register_proc_id( void ) with(__scheduler_lock.lock) {
    113         __kernel_rseq_register();
    114 
    11593        bool * handle = (bool *)&kernelTLS().sched_lock;
    11694
     
    162140
    163141        __atomic_store_n(cell, 0p, __ATOMIC_RELEASE);
    164 
    165         __kernel_rseq_unregister();
    166142}
    167143
     
    505481        /* paranoid */ verify( mock_head(this)    == this.l.prev );
    506482}
    507 
    508 #if   defined(CFA_HAVE_LINUX_LIBRSEQ)
    509         // No definition needed
    510 #elif defined(CFA_HAVE_LINUX_RSEQ_H)
    511 
    512         #if defined( __x86_64 ) || defined( __i386 )
    513                 #define RSEQ_SIG        0x53053053
    514         #elif defined( __ARM_ARCH )
    515                 #ifdef __ARMEB__
    516                 #define RSEQ_SIG    0xf3def5e7      /* udf    #24035    ; 0x5de3 (ARMv6+) */
    517                 #else
    518                 #define RSEQ_SIG    0xe7f5def3      /* udf    #24035    ; 0x5de3 */
    519                 #endif
    520         #endif
    521 
    522         extern void __disable_interrupts_hard();
    523         extern void __enable_interrupts_hard();
    524 
    525         static void __kernel_raw_rseq_register  (void) {
    526                 /* paranoid */ verify( __cfaabi_rseq.cpu_id == RSEQ_CPU_ID_UNINITIALIZED );
    527 
    528                 // int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), 0, (sigset_t *)0p, _NSIG / 8);
    529                 int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), 0, RSEQ_SIG);
    530                 if(ret != 0) {
    531                         int e = errno;
    532                         switch(e) {
    533                         case EINVAL: abort("KERNEL ERROR: rseq register invalid argument");
    534                         case ENOSYS: abort("KERNEL ERROR: rseq register no supported");
    535                         case EFAULT: abort("KERNEL ERROR: rseq register with invalid argument");
    536                         case EBUSY : abort("KERNEL ERROR: rseq register already registered");
    537                         case EPERM : abort("KERNEL ERROR: rseq register sig  argument  on unregistration does not match the signature received on registration");
    538                         default: abort("KERNEL ERROR: rseq register unexpected return %d", e);
    539                         }
    540                 }
    541         }
    542 
    543         static void __kernel_raw_rseq_unregister(void) {
    544                 /* paranoid */ verify( __cfaabi_rseq.cpu_id >= 0 );
    545 
    546                 // int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, (sigset_t *)0p, _NSIG / 8);
    547                 int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
    548                 if(ret != 0) {
    549                         int e = errno;
    550                         switch(e) {
    551                         case EINVAL: abort("KERNEL ERROR: rseq unregister invalid argument");
    552                         case ENOSYS: abort("KERNEL ERROR: rseq unregister no supported");
    553                         case EFAULT: abort("KERNEL ERROR: rseq unregister with invalid argument");
    554                         case EBUSY : abort("KERNEL ERROR: rseq unregister already registered");
    555                         case EPERM : abort("KERNEL ERROR: rseq unregister sig  argument  on unregistration does not match the signature received on registration");
    556                         default: abort("KERNEL ERROR: rseq unregisteunexpected return %d", e);
    557                         }
    558                 }
    559         }
    560 #else
    561         // No definition needed
    562 #endif
  • libcfa/src/concurrency/kernel/cluster.hfa

    r34b4268 r24d6572  
    4040
    4141// convert to log2 scale but using double
    42 static inline __readyQ_avg_t __to_readyQ_avg(unsigned long long intsc) { if(unlikely(0 == intsc)) return 0.0; else return log2(intsc); }
     42static inline __readyQ_avg_t __to_readyQ_avg(unsigned long long intsc) { if(unlikely(0 == intsc)) return 0.0; else return log2((__readyQ_avg_t)intsc); }
    4343
    4444#define warn_large_before warnf( !strict || old_avg < 35.0, "Suspiciously large previous average: %'lf, %'" PRId64 "ms \n", old_avg, program()`ms )
     
    146146}
    147147
    148 static struct {
    149         const unsigned readyq;
    150         const unsigned io;
     148const static struct {
     149        unsigned readyq;
     150        unsigned io;
    151151} __shard_factor = { 2, 1 };
    152152
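
Since __to_readyQ_avg above stores intervals on a log2 scale (with 0 pinned to 0.0 to avoid log2(0)), a 1024-cycle interval is recorded as 10.0. A tiny standalone restatement of that conversion, for illustration only:

    #include <math.h>
    #include <stdio.h>

    /* Standalone restatement of __to_readyQ_avg above, illustration only. */
    static double to_readyQ_avg( unsigned long long intsc ) {
            return intsc == 0 ? 0.0 : log2( (double)intsc );
    }

    int main() {
            printf( "%g %g\n", to_readyQ_avg( 0 ), to_readyQ_avg( 1024 ) );   /* prints: 0 10 */
            return 0;
    }
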
  • libcfa/src/concurrency/kernel/private.hfa

    r34b4268 r24d6572  
    1010// Created On       : Mon Feb 13 12:27:26 2017
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Wed Aug 12 08:21:33 2020
    13 // Update Count     : 9
     12// Last Modified On : Thu Mar  2 16:04:46 2023
     13// Update Count     : 11
    1414//
    1515
     
    2929
    3030extern "C" {
    31 #if   defined(CFA_HAVE_LINUX_LIBRSEQ)
    32         #include <rseq/rseq.h>
    33 #elif defined(CFA_HAVE_LINUX_RSEQ_H)
    34         #include <linux/rseq.h>
    35 #else
    36         #ifndef _GNU_SOURCE
    37         #error kernel/private requires gnu_source
    38         #endif
    3931        #include <sched.h>
    40 #endif
    4132}
    4233
     
    110101// Hardware
    111102
    112 #if   defined(CFA_HAVE_LINUX_LIBRSEQ)
    113         // No data needed
    114 #elif defined(CFA_HAVE_LINUX_RSEQ_H)
    115         extern "Cforall" {
    116                 extern __attribute__((aligned(64))) __thread volatile struct rseq __cfaabi_rseq;
    117         }
    118 #else
    119         // No data needed
    120 #endif
    121 
    122103static inline int __kernel_getcpu() {
    123104        /* paranoid */ verify( ! __preemption_enabled() );
    124 #if   defined(CFA_HAVE_LINUX_LIBRSEQ)
    125         return rseq_current_cpu();
    126 #elif defined(CFA_HAVE_LINUX_RSEQ_H)
    127         int r = __cfaabi_rseq.cpu_id;
    128         /* paranoid */ verify( r >= 0 );
    129         return r;
    130 #else
    131105        return sched_getcpu();
    132 #endif
    133106}
    134107
  • libcfa/src/concurrency/kernel/startup.cfa

    r34b4268 r24d6572  
    1515
    1616#define __cforall_thread__
    17 #define _GNU_SOURCE
    1817
    1918// #define __CFA_DEBUG_PRINT_RUNTIME_CORE__
    2019
    2120// C Includes
    22 #include <errno.h>                                      // errno
     21#include <errno.h>                                                                              // errno
    2322#include <signal.h>
    24 #include <string.h>                                     // strerror
    25 #include <unistd.h>                                     // sysconf
    26 
     23#include <string.h>                                                                             // strerror
     24#include <unistd.h>
     25#include <limits.h>                                                                             // PTHREAD_STACK_MIN
    2726extern "C" {
    28         #include <limits.h>                             // PTHREAD_STACK_MIN
    29         #include <unistd.h>                             // syscall
    30         #include <sys/eventfd.h>                        // eventfd
    31         #include <sys/mman.h>                           // mprotect
    32         #include <sys/resource.h>                       // getrlimit
     27        #include <sys/eventfd.h>                                                        // eventfd
     28        #include <sys/mman.h>                                                           // mprotect
     29        #include <sys/resource.h>                                                       // getrlimit
    3330}
    3431
     
    3633#include "kernel/private.hfa"
    3734#include "iofwd.hfa"
    38 #include "startup.hfa"                                  // STARTUP_PRIORITY_XXX
     35#include "startup.hfa"                                                                  // STARTUP_PRIORITY_XXX
    3936#include "limits.hfa"
    4037#include "math.hfa"
     
    150147__scheduler_RWLock_t __scheduler_lock @= { 0 };
    151148
    152 #if   defined(CFA_HAVE_LINUX_LIBRSEQ)
    153         // No data needed
    154 #elif defined(CFA_HAVE_LINUX_RSEQ_H)
    155         extern "Cforall" {
    156                 __attribute__((aligned(64))) __thread volatile struct rseq __cfaabi_rseq @= {
    157                         .cpu_id : RSEQ_CPU_ID_UNINITIALIZED,
    158                 };
    159         }
    160 #else
    161         // No data needed
    162 #endif
    163 
    164149//-----------------------------------------------------------------------------
    165150// Struct to steal stack
  • libcfa/src/concurrency/locks.cfa

    r34b4268 r24d6572  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // locks.hfa -- LIBCFATHREAD
     7// locks.cfa -- LIBCFATHREAD
    88// Runtime locks that used with the runtime thread system.
    99//
     
    1616
    1717#define __cforall_thread__
    18 #define _GNU_SOURCE
    1918
    2019#include "locks.hfa"
     
    8079        // lock is held by some other thread
    8180        if ( owner != 0p && owner != thrd ) {
    82                 insert_last( blocked_threads, *thrd );
     81        select_node node;
     82                insert_last( blocked_threads, node );
    8383                wait_count++;
    8484                unlock( lock );
    8585                park( );
    86         }
    87         // multi acquisition lock is held by current thread
    88         else if ( owner == thrd && multi_acquisition ) {
     86        return;
     87        } else if ( owner == thrd && multi_acquisition ) { // multi acquisition lock is held by current thread
    8988                recursion_count++;
    90                 unlock( lock );
    91         }
    92         // lock isn't held
    93         else {
     89        } else {  // lock isn't held
    9490                owner = thrd;
    9591                recursion_count = 1;
    96                 unlock( lock );
    97         }
     92        }
     93    unlock( lock );
    9894}
    9995
     
    118114}
    119115
    120 static void pop_and_set_new_owner( blocking_lock & this ) with( this ) {
    121         thread$ * t = &try_pop_front( blocked_threads );
    122         owner = t;
    123         recursion_count = ( t ? 1 : 0 );
    124         if ( t ) wait_count--;
    125         unpark( t );
     116static inline void pop_node( blocking_lock & this ) with( this ) {
     117    __handle_waituntil_OR( blocked_threads );
     118    select_node * node = &try_pop_front( blocked_threads );
     119    if ( node ) {
     120        wait_count--;
     121        owner = node->blocked_thread;
     122        recursion_count = 1;
     123        // if ( !node->clause_status || __make_select_node_available( *node ) ) unpark( node->blocked_thread );
     124        wake_one( blocked_threads, *node );
     125    } else {
     126        owner = 0p;
     127        recursion_count = 0;
     128    }
    126129}
    127130
     
    135138        recursion_count--;
    136139        if ( recursion_count == 0 ) {
    137                 pop_and_set_new_owner( this );
     140                pop_node( this );
    138141        }
    139142        unlock( lock );
     
    148151        // lock held
    149152        if ( owner != 0p ) {
    150                 insert_last( blocked_threads, *t );
     153                insert_last( blocked_threads, *(select_node *)t->link_node );
    151154                wait_count++;
    152                 unlock( lock );
    153155        }
    154156        // lock not held
     
    157159                recursion_count = 1;
    158160                unpark( t );
    159                 unlock( lock );
    160         }
    161 }
    162 
    163 size_t on_wait( blocking_lock & this ) with( this ) {
     161        }
     162    unlock( lock );
     163}
     164
     165size_t on_wait( blocking_lock & this, __cfa_pre_park pp_fn, void * pp_datum ) with( this ) {
    164166        lock( lock __cfaabi_dbg_ctx2 );
    165167        /* paranoid */ verifyf( owner != 0p, "Attempt to release lock %p that isn't held", &this );
     
    168170        size_t ret = recursion_count;
    169171
    170         pop_and_set_new_owner( this );
     172        pop_node( this );
     173
     174    select_node node;
     175    active_thread()->link_node = (void *)&node;
    171176        unlock( lock );
     177
     178    pre_park_then_park( pp_fn, pp_datum );
     179
    172180        return ret;
    173181}
     
    176184        recursion_count = recursion;
    177185}
     186
     187// waituntil() support
     188bool register_select( blocking_lock & this, select_node & node ) with(this) {
     189    lock( lock __cfaabi_dbg_ctx2 );
     190        thread$ * thrd = active_thread();
     191
     192        // single acquisition lock is held by current thread
     193        /* paranoid */ verifyf( owner != thrd || multi_acquisition, "Single acquisition lock holder (%p) attempted to reacquire the lock %p resulting in a deadlock.", owner, &this );
     194
     195    if ( !node.park_counter && ( (owner == thrd && multi_acquisition) || owner == 0p ) ) { // OR special case
     196        if ( !__make_select_node_available( node ) ) { // we didn't win the race so give up on registering
     197           unlock( lock );
     198           return false;
     199        }
     200    }
     201
     202        // lock is held by some other thread
     203        if ( owner != 0p && owner != thrd ) {
     204                insert_last( blocked_threads, node );
     205                wait_count++;
     206                unlock( lock );
     207        return false;
     208        } else if ( owner == thrd && multi_acquisition ) { // multi acquisition lock is held by current thread
     209                recursion_count++;
     210        } else {  // lock isn't held
     211                owner = thrd;
     212                recursion_count = 1;
     213        }
     214
     215    if ( node.park_counter ) __make_select_node_available( node );
     216    unlock( lock );
     217    return true;
     218}
     219
     220bool unregister_select( blocking_lock & this, select_node & node ) with(this) {
     221    lock( lock __cfaabi_dbg_ctx2 );
     222    if ( node`isListed ) {
     223        remove( node );
     224        wait_count--;
     225        unlock( lock );
     226        return false;
     227    }
     228   
     229    if ( owner == active_thread() ) {
     230        /* paranoid */ verifyf( recursion_count == 1 || multi_acquisition, "Thread %p attempted to unlock owner lock %p in waituntil unregister, which is not recursive but has a recursive count of %zu", active_thread(), &this, recursion_count );
     231        // if recursion count is zero release lock and set new owner if one is waiting
     232        recursion_count--;
     233        if ( recursion_count == 0 ) {
     234            pop_node( this );
     235        }
     236    }
     237        unlock( lock );
     238    return false;
     239}
     240
     241void on_selected( blocking_lock & this, select_node & node ) {}
    178242
    179243//-----------------------------------------------------------------------------
     
    312376        int counter( condition_variable(L) & this ) with(this) { return count; }
    313377
    314         static size_t queue_and_get_recursion( condition_variable(L) & this, info_thread(L) * i ) with(this) {
     378        static void enqueue_thread( condition_variable(L) & this, info_thread(L) * i ) with(this) {
    315379                // add info_thread to waiting queue
    316380                insert_last( blocked_threads, *i );
    317381                count++;
    318                 size_t recursion_count = 0;
    319                 if (i->lock) {
    320                         // if lock was passed get recursion count to reset to after waking thread
    321                         recursion_count = on_wait( *i->lock );
    322                 }
    323                 return recursion_count;
    324         }
     382        }
     383
     384    static size_t block_and_get_recursion( info_thread(L) & i, __cfa_pre_park pp_fn, void * pp_datum ) {
     385        size_t recursion_count = 0;
     386                if ( i.lock ) // if lock was passed get recursion count to reset to after waking thread
     387                        recursion_count = on_wait( *i.lock, pp_fn, pp_datum ); // this call blocks
     388                else
     389            pre_park_then_park( pp_fn, pp_datum );
     390        return recursion_count;
     391    }
     392    static size_t block_and_get_recursion( info_thread(L) & i ) { return block_and_get_recursion( i, pre_park_noop, 0p ); }
    325393
    326394        // helper for wait()'s' with no timeout
    327395        static void queue_info_thread( condition_variable(L) & this, info_thread(L) & i ) with(this) {
    328396                lock( lock __cfaabi_dbg_ctx2 );
    329                 size_t recursion_count = queue_and_get_recursion(this, &i);
     397        enqueue_thread( this, &i );
    330398                unlock( lock );
    331399
    332400                // blocks here
    333                 park( );
     401        size_t recursion_count = block_and_get_recursion( i );
    334402
    335403                // resets recursion count here after waking
    336                 if (i.lock) on_wakeup(*i.lock, recursion_count);
     404                if ( i.lock ) on_wakeup( *i.lock, recursion_count );
    337405        }
    338406
     
    341409                queue_info_thread( this, i );
    342410
     411    static void cond_alarm_register( void * node_ptr ) { register_self( (alarm_node_t *)node_ptr ); }
     412
    343413        // helper for wait()'s' with a timeout
    344414        static void queue_info_thread_timeout( condition_variable(L) & this, info_thread(L) & info, Duration t, Alarm_Callback callback ) with(this) {
    345415                lock( lock __cfaabi_dbg_ctx2 );
    346                 size_t recursion_count = queue_and_get_recursion(this, &info);
     416        enqueue_thread( this, &info );
    347417                alarm_node_wrap(L) node_wrap = { t, 0`s, callback, &this, &info };
    348418                unlock( lock );
    349419
    350                 // registers alarm outside cond lock to avoid deadlock
    351                 register_self( &node_wrap.alarm_node );
    352 
    353                 // blocks here
    354                 park();
      420                // registers the alarm node via the pre-park hook after the locks are released (avoids deadlock), then blocks here
     421        size_t recursion_count = block_and_get_recursion( info, cond_alarm_register, (void *)(&node_wrap.alarm_node) );
     422                // park();
    355423
    356424                // unregisters alarm so it doesn't go off if this happens first
     
    358426
    359427                // resets recursion count here after waking
    360                 if (info.lock) on_wakeup(*info.lock, recursion_count);
     428                if ( info.lock ) on_wakeup( *info.lock, recursion_count );
    361429        }
    362430
     
    418486                info_thread( L ) i = { active_thread(), info, &l };
    419487                insert_last( blocked_threads, i );
    420                 size_t recursion_count = on_wait( *i.lock );
    421                 park( );
     488                size_t recursion_count = on_wait( *i.lock, pre_park_noop, 0p ); // blocks here
     489                // park( );
    422490                on_wakeup(*i.lock, recursion_count);
    423491        }
     
    460528        bool empty ( pthread_cond_var(L) & this ) with(this) { return blocked_threads`isEmpty; }
    461529
    462         static size_t queue_and_get_recursion( pthread_cond_var(L) & this, info_thread(L) * i ) with(this) {
    463                 // add info_thread to waiting queue
    464                 insert_last( blocked_threads, *i );
    465                 size_t recursion_count = 0;
    466                 recursion_count = on_wait( *i->lock );
    467                 return recursion_count;
    468         }
    469        
    470530        static void queue_info_thread_timeout( pthread_cond_var(L) & this, info_thread(L) & info, Duration t, Alarm_Callback callback ) with(this) {
    471531                lock( lock __cfaabi_dbg_ctx2 );
    472                 size_t recursion_count = queue_and_get_recursion(this, &info);
     532        insert_last( blocked_threads, info );
    473533                pthread_alarm_node_wrap(L) node_wrap = { t, 0`s, callback, &this, &info };
    474534                unlock( lock );
    475535
    476                 // registers alarm outside cond lock to avoid deadlock
    477                 register_self( &node_wrap.alarm_node );
    478 
    479                 // blocks here
    480                 park();
    481 
    482                 // unregisters alarm so it doesn't go off if this happens first
     536                // blocks here and registers alarm node before blocking after releasing locks to avoid deadlock
     537        size_t recursion_count = block_and_get_recursion( info, cond_alarm_register, (void *)(&node_wrap.alarm_node) );
     538
     539                // unregisters alarm so it doesn't go off if signal happens first
    483540                unregister_self( &node_wrap.alarm_node );
    484541
    485542                // resets recursion count here after waking
    486                 if (info.lock) on_wakeup(*info.lock, recursion_count);
     543                if ( info.lock ) on_wakeup( *info.lock, recursion_count );
    487544        }
    488545
     
    494551                lock( lock __cfaabi_dbg_ctx2 );
    495552                info_thread( L ) i = { active_thread(), info, &l };
    496                 size_t recursion_count = queue_and_get_recursion(this, &i);
    497                 unlock( lock );
    498                 park( );
    499                 on_wakeup(*i.lock, recursion_count);
     553        insert_last( blocked_threads, i );
     554                unlock( lock );
     555
     556        // blocks here
     557                size_t recursion_count = block_and_get_recursion( i );
     558
     559                on_wakeup( *i.lock, recursion_count );
    500560        }
    501561
     
    585645        return thrd != 0p;
    586646}
     647
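
The on_wait/on_wakeup changes above preserve the contract the condition variables rely on: wait releases the passed lock (saving its recursion count) before parking, and the recursion count is restored after waking. A hedged usage sketch, assuming the wait()/notify_one() interface declared in locks.hfa:

    owner_lock m;
    condition_variable( owner_lock ) c;

    void waiter() {
            lock( m );
            wait( c, m );     // on_wait releases m and saves its recursion count; blocks here
            // m is held again here: ownership is handed off on notify and
            // on_wakeup restores the saved recursion count
            unlock( m );
    }

    void waker() {
            notify_one( c );  // wakes one waiter, which then holds m as above
    }
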
  • libcfa/src/concurrency/locks.hfa

    r34b4268 r24d6572  
    3030#include "time.hfa"
    3131
     32#include "select.hfa"
     33
    3234#include <fstream.hfa>
    33 
    3435
    3536// futex headers
     
    3839#include <unistd.h>
    3940
    40 // undef to make a number of the locks not reacquire upon waking from a condlock
    41 #define REACQ 1
     41typedef void (*__cfa_pre_park)( void * );
     42
     43static inline void pre_park_noop( void * ) {}
     44
     45//-----------------------------------------------------------------------------
     46// is_blocking_lock
     47forall( L & | sized(L) )
     48trait is_blocking_lock {
     49        // For synchronization locks to use when acquiring
     50        void on_notify( L &, struct thread$ * );
     51
     52        // For synchronization locks to use when releasing
     53        size_t on_wait( L &, __cfa_pre_park pp_fn, void * pp_datum );
     54
     55        // to set recursion count after getting signalled;
     56        void on_wakeup( L &, size_t recursion );
     57};
     58
     59static inline void pre_park_then_park( __cfa_pre_park pp_fn, void * pp_datum ) {
     60    pp_fn( pp_datum );
     61    park();
     62}
     63
     64// macros for default routine impls for is_blocking_lock trait that do not wait-morph
     65
     66#define DEFAULT_ON_NOTIFY( lock_type ) \
     67    static inline void on_notify( lock_type & this, thread$ * t ){ unpark(t); }
     68
     69#define DEFAULT_ON_WAIT( lock_type ) \
     70    static inline size_t on_wait( lock_type & this, __cfa_pre_park pp_fn, void * pp_datum ) { \
     71        unlock( this ); \
     72        pre_park_then_park( pp_fn, pp_datum ); \
     73        return 0; \
     74    }
     75
     76// on_wakeup impl if lock should be reacquired after waking up
     77#define DEFAULT_ON_WAKEUP_REACQ( lock_type ) \
     78    static inline void on_wakeup( lock_type & this, size_t recursion ) { lock( this ); }
     79
     80// on_wakeup impl if lock will not be reacquired after waking up
     81#define DEFAULT_ON_WAKEUP_NO_REACQ( lock_type ) \
     82    static inline void on_wakeup( lock_type & this, size_t recursion ) {}
     83
     84
    4285
    4386//-----------------------------------------------------------------------------
     
    66109static inline bool   try_lock ( single_acquisition_lock & this ) { return try_lock( (blocking_lock &)this ); }
    67110static inline void   unlock   ( single_acquisition_lock & this ) { unlock  ( (blocking_lock &)this ); }
    68 static inline size_t on_wait  ( single_acquisition_lock & this ) { return on_wait ( (blocking_lock &)this ); }
     111static inline size_t on_wait  ( single_acquisition_lock & this, __cfa_pre_park pp_fn, void * pp_datum ) { return on_wait ( (blocking_lock &)this, pp_fn, pp_datum ); }
    69112static inline void   on_wakeup( single_acquisition_lock & this, size_t v ) { on_wakeup ( (blocking_lock &)this, v ); }
    70113static inline void   on_notify( single_acquisition_lock & this, struct thread$ * t ) { on_notify( (blocking_lock &)this, t ); }
     114static inline bool   register_select( single_acquisition_lock & this, select_node & node ) { return register_select( (blocking_lock &)this, node ); }
     115static inline bool   unregister_select( single_acquisition_lock & this, select_node & node ) { return unregister_select( (blocking_lock &)this, node ); }
     116static inline void   on_selected( single_acquisition_lock & this, select_node & node ) { on_selected( (blocking_lock &)this, node ); }
    71117
    72118//----------
     
    80126static inline bool   try_lock ( owner_lock & this ) { return try_lock( (blocking_lock &)this ); }
    81127static inline void   unlock   ( owner_lock & this ) { unlock  ( (blocking_lock &)this ); }
    82 static inline size_t on_wait  ( owner_lock & this ) { return on_wait ( (blocking_lock &)this ); }
     128static inline size_t on_wait  ( owner_lock & this, __cfa_pre_park pp_fn, void * pp_datum ) { return on_wait ( (blocking_lock &)this, pp_fn, pp_datum ); }
    83129static inline void   on_wakeup( owner_lock & this, size_t v ) { on_wakeup ( (blocking_lock &)this, v ); }
    84130static inline void   on_notify( owner_lock & this, struct thread$ * t ) { on_notify( (blocking_lock &)this, t ); }
     131static inline bool   register_select( owner_lock & this, select_node & node ) { return register_select( (blocking_lock &)this, node ); }
     132static inline bool   unregister_select( owner_lock & this, select_node & node ) { return unregister_select( (blocking_lock &)this, node ); }
     133static inline void   on_selected( owner_lock & this, select_node & node ) { on_selected( (blocking_lock &)this, node ); }
    85134
    86135//-----------------------------------------------------------------------------
     
    127176static inline void ?{}(mcs_spin_node & this) { this.next = 0p; this.locked = true; }
    128177
    129 static inline mcs_spin_node * volatile & ?`next ( mcs_spin_node * node ) {
    130         return node->next;
    131 }
    132 
    133178struct mcs_spin_lock {
    134179        mcs_spin_queue queue;
     
    136181
    137182static inline void lock(mcs_spin_lock & l, mcs_spin_node & n) {
     183    n.locked = true;
    138184        mcs_spin_node * prev = __atomic_exchange_n(&l.queue.tail, &n, __ATOMIC_SEQ_CST);
    139         n.locked = true;
    140         if(prev == 0p) return;
     185        if( prev == 0p ) return;
    141186        prev->next = &n;
    142         while(__atomic_load_n(&n.locked, __ATOMIC_RELAXED)) Pause();
     187        while( __atomic_load_n(&n.locked, __ATOMIC_RELAXED) ) Pause();
    143188}
    144189
     
    146191        mcs_spin_node * n_ptr = &n;
    147192        if (__atomic_compare_exchange_n(&l.queue.tail, &n_ptr, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) return;
    148         while (__atomic_load_n(&n.next, __ATOMIC_RELAXED) == 0p) {}
     193        while (__atomic_load_n(&n.next, __ATOMIC_RELAXED) == 0p) Pause();
    149194        n.next->locked = false;
    150195}
     
    153198// futex_mutex
    154199
    155 // - No cond var support
    156200// - Kernel thd blocking alternative to the spinlock
    157201// - No ownership (will deadlock on reacq)
     202// - no reacq on wakeup
    158203struct futex_mutex {
    159204        // lock state any state other than UNLOCKED is locked
     
    169214}
    170215
    171 static inline void  ?{}( futex_mutex & this ) with(this) { val = 0; }
    172 
    173 static inline bool internal_try_lock(futex_mutex & this, int & compare_val) with(this) {
     216static inline void ?{}( futex_mutex & this ) with(this) { val = 0; }
     217
     218static inline bool internal_try_lock( futex_mutex & this, int & compare_val) with(this) {
    174219        return __atomic_compare_exchange_n((int*)&val, (int*)&compare_val, 1, false, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE);
    175220}
    176221
    177 static inline int internal_exchange(futex_mutex & this) with(this) {
     222static inline int internal_exchange( futex_mutex & this ) with(this) {
    178223        return __atomic_exchange_n((int*)&val, 2, __ATOMIC_ACQUIRE);
    179224}
    180225
    181226// if this is called recursively IT WILL DEADLOCK!!!!!
    182 static inline void lock(futex_mutex & this) with(this) {
     227static inline void lock( futex_mutex & this ) with(this) {
    183228        int state;
    184229
    185        
    186         // // linear backoff omitted for now
    187         // for( int spin = 4; spin < 1024; spin += spin) {
    188         //      state = 0;
    189         //      // if unlocked, lock and return
    190         //      if (internal_try_lock(this, state)) return;
    191         //      if (2 == state) break;
    192         //      for (int i = 0; i < spin; i++) Pause();
    193         // }
    194 
    195         // no contention try to acquire
    196         if (internal_try_lock(this, state)) return;
     230        for( int spin = 4; spin < 1024; spin += spin) {
     231                state = 0;
     232                // if unlocked, lock and return
     233                if (internal_try_lock(this, state)) return;
     234                if (2 == state) break;
     235                for (int i = 0; i < spin; i++) Pause();
     236        }
    197237       
    198238        // if not in contended state, set to be in contended state
     
    207247
    208248static inline void unlock(futex_mutex & this) with(this) {
    209         // if uncontended do atomice unlock and then return
    210         if (__atomic_fetch_sub(&val, 1, __ATOMIC_RELEASE) == 1) return; // TODO: try acq/rel
     249        // if uncontended do atomic unlock and then return
     250    if (__atomic_exchange_n(&val, 0, __ATOMIC_RELEASE) == 1) return;
    211251       
    212252        // otherwise threads are blocked so we must wake one
    213         __atomic_store_n((int *)&val, 0, __ATOMIC_RELEASE);
    214253        futex((int *)&val, FUTEX_WAKE, 1);
    215254}
    216255
    217 static inline void on_notify( futex_mutex & f, thread$ * t){ unpark(t); }
    218 static inline size_t on_wait( futex_mutex & f ) {unlock(f); return 0;}
    219 
    220 // to set recursion count after getting signalled;
    221 static inline void on_wakeup( futex_mutex & f, size_t recursion ) {}
    222 
    223 //-----------------------------------------------------------------------------
    224 // CLH Spinlock
    225 // - No recursive acquisition
    226 // - Needs to be released by owner
    227 
    228 struct clh_lock {
    229         volatile bool * volatile tail;
    230 };
    231 
    232 static inline void  ?{}( clh_lock & this ) { this.tail = malloc(); *this.tail = true; }
    233 static inline void ^?{}( clh_lock & this ) { free(this.tail); }
    234 
    235 static inline void lock(clh_lock & l) {
    236         thread$ * curr_thd = active_thread();
    237         *(curr_thd->clh_node) = false;
    238         volatile bool * prev = __atomic_exchange_n((bool **)(&l.tail), (bool *)(curr_thd->clh_node), __ATOMIC_SEQ_CST);
    239         while(!__atomic_load_n(prev, __ATOMIC_ACQUIRE)) Pause();
    240         curr_thd->clh_prev = prev;
    241 }
    242 
    243 static inline void unlock(clh_lock & l) {
    244         thread$ * curr_thd = active_thread();
    245         __atomic_store_n(curr_thd->clh_node, true, __ATOMIC_RELEASE);
    246         curr_thd->clh_node = curr_thd->clh_prev;
    247 }
    248 
    249 static inline void on_notify(clh_lock & this, struct thread$ * t ) { unpark(t); }
    250 static inline size_t on_wait(clh_lock & this) { unlock(this); return 0; }
    251 static inline void on_wakeup(clh_lock & this, size_t recursion ) {
    252         #ifdef REACQ
    253         lock(this);
    254         #endif
    255 }
    256 
    257 
    258 //-----------------------------------------------------------------------------
    259 // Linear backoff Spinlock
    260 struct linear_backoff_then_block_lock {
     256DEFAULT_ON_NOTIFY( futex_mutex )
     257DEFAULT_ON_WAIT( futex_mutex )
     258DEFAULT_ON_WAKEUP_NO_REACQ( futex_mutex )
     259
     260//-----------------------------------------------------------------------------
     261// go_mutex
     262
     263// - Kernel thd blocking alternative to the spinlock
     264// - No ownership (will deadlock on reacq)
     265// - Golang's flavour of mutex
     266// - Impl taken from Golang: src/runtime/lock_futex.go
     267struct go_mutex {
     268        // lock state any state other than UNLOCKED is locked
     269        // enum LockState { UNLOCKED = 0, LOCKED = 1, SLEEPING = 2 };
     270       
     271        // stores a lock state
     272        int val;
     273};
     274static inline void  ?{}( go_mutex & this ) with(this) { val = 0; }
     275// static inline void ?{}( go_mutex & this, go_mutex this2 ) = void; // these don't compile correctly at the moment so they should be omitted
     276// static inline void ?=?( go_mutex & this, go_mutex this2 ) = void;
     277
     278static inline bool internal_try_lock(go_mutex & this, int & compare_val, int new_val ) with(this) {
     279        return __atomic_compare_exchange_n((int*)&val, (int*)&compare_val, new_val, false, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE);
     280}
     281
     282static inline int internal_exchange(go_mutex & this, int swap ) with(this) {
     283        return __atomic_exchange_n((int*)&val, swap, __ATOMIC_ACQUIRE);
     284}
     285
     286// if this is called recursively IT WILL DEADLOCK!!!!!
     287static inline void lock( go_mutex & this ) with( this ) {
     288        int state, init_state;
     289
     290    // speculative grab
     291    state = internal_exchange(this, 1);
     292    if ( !state ) return; // state == 0
     293    init_state = state;
     294    for (;;) {
     295        for( int i = 0; i < 4; i++ ) {
     296            while( !val ) { // lock unlocked
     297                state = 0;
     298                if ( internal_try_lock( this, state, init_state ) ) return;
     299            }
     300            for (int i = 0; i < 30; i++) Pause();
     301        }
     302
     303        while( !val ) { // lock unlocked
     304            state = 0;
     305            if ( internal_try_lock( this, state, init_state ) ) return;
     306        }
     307        sched_yield();
     308       
     309        // if not in contended state, set to be in contended state
     310        state = internal_exchange( this, 2 );
     311        if ( !state ) return; // state == 0
     312        init_state = 2;
     313        futex( (int*)&val, FUTEX_WAIT, 2 ); // if val is not 2 this returns with EWOULDBLOCK
     314    }
     315}
     316
     317static inline void unlock( go_mutex & this ) with(this) {
     318        // if uncontended do atomic unlock and then return
     319    if ( __atomic_exchange_n(&val, 0, __ATOMIC_RELEASE) == 1 ) return;
     320       
     321        // otherwise threads are blocked so we must wake one
     322        futex( (int *)&val, FUTEX_WAKE, 1 );
     323}
     324
     325DEFAULT_ON_NOTIFY( go_mutex )
     326DEFAULT_ON_WAIT( go_mutex )
     327DEFAULT_ON_WAKEUP_NO_REACQ( go_mutex )
     328
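Since go_mutex tracks no owner, a second lock() by the holding thread deadlocks, exactly as the comment block above warns. Below is a short usage sketch of the lock()/unlock() pair defined in this hunk; the function name is hypothetical and the sketch is not part of the changeset.

// Usage sketch only: go_mutex must be released with unlock() before the same
// thread may call lock() again; there is no recursion or ownership support.
go_mutex m;

void with_go_mutex() {
	lock( m );              // spins briefly, then futex-waits if the lock stays contended
	// ... critical section ...
	unlock( m );    // wakes one futex waiter if the lock was contended
}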
     329//-----------------------------------------------------------------------------
     330// Exponential backoff then block lock
     331struct exp_backoff_then_block_lock {
    261332        // Spin lock used for mutual exclusion
    262333        __spinlock_t spinlock;
     
    269340};
    270341
    271 static inline void  ?{}( linear_backoff_then_block_lock & this ) {
     342static inline void  ?{}( exp_backoff_then_block_lock & this ) {
    272343        this.spinlock{};
    273344        this.blocked_threads{};
    274345        this.lock_value = 0;
    275346}
    276 static inline void ^?{}( linear_backoff_then_block_lock & this ) {}
    277 // static inline void ?{}( linear_backoff_then_block_lock & this, linear_backoff_then_block_lock this2 ) = void;
    278 // static inline void ?=?( linear_backoff_then_block_lock & this, linear_backoff_then_block_lock this2 ) = void;
    279 
    280 static inline bool internal_try_lock(linear_backoff_then_block_lock & this, size_t & compare_val) with(this) {
    281         if (__atomic_compare_exchange_n(&lock_value, &compare_val, 1, false, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
    282                 return true;
    283         }
    284         return false;
    285 }
    286 
    287 static inline bool try_lock(linear_backoff_then_block_lock & this) { size_t compare_val = 0; return internal_try_lock(this, compare_val); }
    288 
    289 static inline bool try_lock_contention(linear_backoff_then_block_lock & this) with(this) {
    290         if (__atomic_exchange_n(&lock_value, 2, __ATOMIC_ACQUIRE) == 0) {
    291                 return true;
    292         }
    293         return false;
    294 }
    295 
    296 static inline bool block(linear_backoff_then_block_lock & this) with(this) {
    297         lock( spinlock __cfaabi_dbg_ctx2 ); // TODO change to lockfree queue (MPSC)
    298         if (lock_value != 2) {
    299                 unlock( spinlock );
    300                 return true;
    301         }
    302         insert_last( blocked_threads, *active_thread() );
    303         unlock( spinlock );
     347static inline void ?{}( exp_backoff_then_block_lock & this, exp_backoff_then_block_lock this2 ) = void;
     348static inline void ?=?( exp_backoff_then_block_lock & this, exp_backoff_then_block_lock this2 ) = void;
     349
     350static inline void  ^?{}( exp_backoff_then_block_lock & this ){}
     351
     352static inline bool internal_try_lock( exp_backoff_then_block_lock & this, size_t & compare_val ) with(this) {
     353        return __atomic_compare_exchange_n(&lock_value, &compare_val, 1, false, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
     354}
     355
     356static inline bool try_lock( exp_backoff_then_block_lock & this ) { size_t compare_val = 0; return internal_try_lock( this, compare_val ); }
     357
     358static inline bool try_lock_contention( exp_backoff_then_block_lock & this ) with(this) {
     359        return !__atomic_exchange_n( &lock_value, 2, __ATOMIC_ACQUIRE );
     360}
     361
     362static inline bool block( exp_backoff_then_block_lock & this ) with(this) {
     363    lock( spinlock __cfaabi_dbg_ctx2 );
     364    if (__atomic_load_n( &lock_value, __ATOMIC_SEQ_CST) != 2) {
     365        unlock( spinlock );
     366        return true;
     367    }
     368    insert_last( blocked_threads, *active_thread() );
     369    unlock( spinlock );
    304370        park( );
    305371        return true;
    306372}
    307373
    308 static inline void lock(linear_backoff_then_block_lock & this) with(this) {
     374static inline void lock( exp_backoff_then_block_lock & this ) with(this) {
    309375        size_t compare_val = 0;
    310376        int spin = 4;
     377
    311378        // linear backoff
    312379        for( ;; ) {
     
    324391}
    325392
    326 static inline void unlock(linear_backoff_then_block_lock & this) with(this) {
     393static inline void unlock( exp_backoff_then_block_lock & this ) with(this) {
    327394    if (__atomic_exchange_n(&lock_value, 0, __ATOMIC_RELEASE) == 1) return;
    328         lock( spinlock __cfaabi_dbg_ctx2 );
    329         thread$ * t = &try_pop_front( blocked_threads );
    330         unlock( spinlock );
    331         unpark( t );
    332 }
    333 
    334 static inline void on_notify(linear_backoff_then_block_lock & this, struct thread$ * t ) { unpark(t); }
    335 static inline size_t on_wait(linear_backoff_then_block_lock & this) { unlock(this); return 0; }
    336 static inline void on_wakeup(linear_backoff_then_block_lock & this, size_t recursion ) {
    337         #ifdef REACQ
    338         lock(this);
    339         #endif
    340 }
     395    lock( spinlock __cfaabi_dbg_ctx2 );
     396    thread$ * t = &try_pop_front( blocked_threads );
     397    unlock( spinlock );
     398    unpark( t );
     399}
     400
     401DEFAULT_ON_NOTIFY( exp_backoff_then_block_lock )
     402DEFAULT_ON_WAIT( exp_backoff_then_block_lock )
     403DEFAULT_ON_WAKEUP_REACQ( exp_backoff_then_block_lock )
    341404
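The renamed exp_backoff_then_block_lock exposes three building blocks: internal_try_lock (CAS 0 to 1), try_lock_contention (exchange to 2), and block (enqueue on the spinlock-protected list and park). The body of the real acquire loop is elided in this hunk, so the following is only a rough sketch of how such primitives are commonly composed; the function name, spin budget, and growth cap are assumptions, not the library's values.

// Hypothetical sketch only, not the changeset's lock(): spin with a doubling
// budget, then mark the lock contended and park until a release wakes us.
static inline void lock_sketch( exp_backoff_then_block_lock & this ) {
	size_t compare_val = 0;
	int spin = 4;                                              // assumed initial spin budget
	for ( ;; ) {
		compare_val = 0;
		if ( internal_try_lock( this, compare_val ) ) return;  // fast path: 0 -> 1
		if ( spin < 1024 ) {                                   // assumed cap on spinning
			for ( int i = 0; i < spin; i += 1 ) Pause();
			spin += spin;                                      // exponential growth of the budget
		} else {
			if ( try_lock_contention( this ) ) return;         // acquired while marking contended: 0 -> 2
			block( this );                                     // enqueue and park; retry after wakeup
		}
	}
}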
    342405//-----------------------------------------------------------------------------
     
    368431
    369432// if this is called recursively IT WILL DEADLOCK!!!!!
    370 static inline void lock(fast_block_lock & this) with(this) {
     433static inline void lock( fast_block_lock & this ) with(this) {
    371434        lock( lock __cfaabi_dbg_ctx2 );
    372435        if ( held ) {
     
    380443}
    381444
    382 static inline void unlock(fast_block_lock & this) with(this) {
     445static inline void unlock( fast_block_lock & this ) with(this) {
    383446        lock( lock __cfaabi_dbg_ctx2 );
    384447        /* paranoid */ verifyf( held != false, "Attempt to release lock %p that isn't held", &this );
     
    389452}
    390453
    391 static inline void on_notify(fast_block_lock & this, struct thread$ * t ) with(this) {
    392         #ifdef REACQ
    393                 lock( lock __cfaabi_dbg_ctx2 );
    394                 insert_last( blocked_threads, *t );
    395                 unlock( lock );
    396         #else
    397                 unpark(t);
    398         #endif
    399 }
    400 static inline size_t on_wait(fast_block_lock & this) { unlock(this); return 0; }
    401 static inline void on_wakeup(fast_block_lock & this, size_t recursion ) { }
     454static inline void on_notify( fast_block_lock & this, struct thread$ * t ) with(this) {
     455    lock( lock __cfaabi_dbg_ctx2 );
     456    insert_last( blocked_threads, *t );
     457    unlock( lock );
     458}
     459DEFAULT_ON_WAIT( fast_block_lock )
     460DEFAULT_ON_WAKEUP_NO_REACQ( fast_block_lock )
    402461
    403462//-----------------------------------------------------------------------------
     
    410469struct simple_owner_lock {
    411470        // List of blocked threads
    412         dlist( thread$ ) blocked_threads;
     471        dlist( select_node ) blocked_threads;
    413472
    414473        // Spin lock used for mutual exclusion
     
    431490static inline void ?=?( simple_owner_lock & this, simple_owner_lock this2 ) = void;
    432491
    433 static inline void lock(simple_owner_lock & this) with(this) {
    434         if (owner == active_thread()) {
     492static inline void lock( simple_owner_lock & this ) with(this) {
     493        if ( owner == active_thread() ) {
    435494                recursion_count++;
    436495                return;
     
    438497        lock( lock __cfaabi_dbg_ctx2 );
    439498
    440         if (owner != 0p) {
    441                 insert_last( blocked_threads, *active_thread() );
     499        if ( owner != 0p ) {
     500        select_node node;
     501                insert_last( blocked_threads, node );
    442502                unlock( lock );
    443503                park( );
     
    449509}
    450510
    451 // TODO: fix duplicate def issue and bring this back
    452 // void pop_and_set_new_owner( simple_owner_lock & this ) with( this ) {
    453         // thread$ * t = &try_pop_front( blocked_threads );
    454         // owner = t;
    455         // recursion_count = ( t ? 1 : 0 );
    456         // unpark( t );
    457 // }
    458 
    459 static inline void unlock(simple_owner_lock & this) with(this) {
     511static inline void pop_node( simple_owner_lock & this ) with(this) {
     512    __handle_waituntil_OR( blocked_threads );
     513    select_node * node = &try_pop_front( blocked_threads );
     514    if ( node ) {
     515        owner = node->blocked_thread;
     516        recursion_count = 1;
     517        // if ( !node->clause_status || __make_select_node_available( *node ) ) unpark( node->blocked_thread );
     518        wake_one( blocked_threads, *node );
     519    } else {
     520        owner = 0p;
     521        recursion_count = 0;
     522    }
     523}
     524
     525static inline void unlock( simple_owner_lock & this ) with(this) {
    460526        lock( lock __cfaabi_dbg_ctx2 );
    461527        /* paranoid */ verifyf( owner != 0p, "Attempt to release lock %p that isn't held", &this );
     
    464530        recursion_count--;
    465531        if ( recursion_count == 0 ) {
    466                 // pop_and_set_new_owner( this );
    467                 thread$ * t = &try_pop_front( blocked_threads );
    468                 owner = t;
    469                 recursion_count = ( t ? 1 : 0 );
    470                 unpark( t );
     532                pop_node( this );
    471533        }
    472534        unlock( lock );
    473535}
    474536
    475 static inline void on_notify(simple_owner_lock & this, struct thread$ * t ) with(this) {
     537static inline void on_notify( simple_owner_lock & this, thread$ * t ) with(this) {
    476538        lock( lock __cfaabi_dbg_ctx2 );
    477539        // lock held
    478540        if ( owner != 0p ) {
    479                 insert_last( blocked_threads, *t );
     541                insert_last( blocked_threads, *(select_node *)t->link_node );
    480542        }
    481543        // lock not held
     
    488550}
    489551
    490 static inline size_t on_wait(simple_owner_lock & this) with(this) {
     552static inline size_t on_wait( simple_owner_lock & this, __cfa_pre_park pp_fn, void * pp_datum ) with(this) {
    491553        lock( lock __cfaabi_dbg_ctx2 );
    492554        /* paranoid */ verifyf( owner != 0p, "Attempt to release lock %p that isn't held", &this );
     
    495557        size_t ret = recursion_count;
    496558
    497         // pop_and_set_new_owner( this );
    498 
    499         thread$ * t = &try_pop_front( blocked_threads );
    500         owner = t;
    501         recursion_count = ( t ? 1 : 0 );
    502         unpark( t );
    503 
     559        pop_node( this );
     560
     561    select_node node;
     562    active_thread()->link_node = (void *)&node;
    504563        unlock( lock );
     564
     565    pre_park_then_park( pp_fn, pp_datum );
     566
    505567        return ret;
    506568}
    507569
    508 static inline void on_wakeup(simple_owner_lock & this, size_t recursion ) with(this) { recursion_count = recursion; }
     570static inline void on_wakeup( simple_owner_lock & this, size_t recursion ) with(this) { recursion_count = recursion; }
     571
     572// waituntil() support
     573static inline bool register_select( simple_owner_lock & this, select_node & node ) with(this) {
     574    lock( lock __cfaabi_dbg_ctx2 );
     575
      576    // check if we can complete the operation; if so, race to establish the winner in the special OR case
     577    if ( !node.park_counter && ( owner == active_thread() || owner == 0p ) ) {
     578        if ( !__make_select_node_available( node ) ) { // we didn't win the race so give up on registering
     579           unlock( lock );
     580           return false;
     581        }
     582    }
     583
     584    if ( owner == active_thread() ) {
     585                recursion_count++;
     586        if ( node.park_counter ) __make_select_node_available( node );
     587        unlock( lock );
     588                return true;
     589        }
     590
     591    if ( owner != 0p ) {
     592                insert_last( blocked_threads, node );
     593                unlock( lock );
     594                return false;
     595        }
     596   
     597        owner = active_thread();
     598        recursion_count = 1;
     599
     600    if ( node.park_counter ) __make_select_node_available( node );
     601    unlock( lock );
     602    return true;
     603}
     604
     605static inline bool unregister_select( simple_owner_lock & this, select_node & node ) with(this) {
     606    lock( lock __cfaabi_dbg_ctx2 );
     607    if ( node`isListed ) {
     608        remove( node );
     609        unlock( lock );
     610        return false;
     611    }
     612
     613    if ( owner == active_thread() ) {
     614        recursion_count--;
     615        if ( recursion_count == 0 ) {
     616            pop_node( this );
     617        }
     618    }
     619    unlock( lock );
     620    return false;
     621}
     622
     623static inline void on_selected( simple_owner_lock & this, select_node & node ) {}
     624
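Unlike the locks above, simple_owner_lock records its owner and a recursion count, so the owning thread may nest acquisitions. A brief usage sketch follows; the function name is hypothetical and the sketch is not part of the changeset.

// Usage sketch only: nested lock()/unlock() calls must balance; ownership is
// handed to the next waiter only when recursion_count returns to zero.
simple_owner_lock l;

void nested_use() {
	lock( l );      // owner = active_thread(), recursion_count = 1
	lock( l );      // owner re-acquires: recursion_count = 2, no blocking
	unlock( l );    // recursion_count = 1, lock still held
	unlock( l );    // recursion_count = 0, ownership passes to a waiter if any
}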
    509625
    510626//-----------------------------------------------------------------------------
     
    521637        // flag showing if lock is held
    522638        volatile bool held;
    523 
    524         #ifdef __CFA_DEBUG__
    525         // for deadlock detection
    526         struct thread$ * owner;
    527         #endif
    528639};
    529640
     
    536647static inline void ?=?( spin_queue_lock & this, spin_queue_lock this2 ) = void;
    537648
    538 // if this is called recursively IT WILL DEADLOCK!!!!!
    539 static inline void lock(spin_queue_lock & this) with(this) {
     649// if this is called recursively IT WILL DEADLOCK!
     650static inline void lock( spin_queue_lock & this ) with(this) {
    540651        mcs_spin_node node;
    541652        lock( lock, node );
     
    545656}
    546657
    547 static inline void unlock(spin_queue_lock & this) with(this) {
     658static inline void unlock( spin_queue_lock & this ) with(this) {
    548659        __atomic_store_n(&held, false, __ATOMIC_RELEASE);
    549660}
    550661
    551 static inline void on_notify(spin_queue_lock & this, struct thread$ * t ) {
    552         unpark(t);
    553 }
    554 static inline size_t on_wait(spin_queue_lock & this) { unlock(this); return 0; }
    555 static inline void on_wakeup(spin_queue_lock & this, size_t recursion ) {
    556         #ifdef REACQ
    557         lock(this);
    558         #endif
    559 }
    560 
     662DEFAULT_ON_NOTIFY( spin_queue_lock )
     663DEFAULT_ON_WAIT( spin_queue_lock )
     664DEFAULT_ON_WAKEUP_REACQ( spin_queue_lock )
    561665
    562666//-----------------------------------------------------------------------------
     
    584688
    585689// if this is called recursively IT WILL DEADLOCK!!!!!
    586 static inline void lock(mcs_block_spin_lock & this) with(this) {
     690static inline void lock( mcs_block_spin_lock & this ) with(this) {
    587691        mcs_node node;
    588692        lock( lock, node );
     
    596700}
    597701
    598 static inline void on_notify(mcs_block_spin_lock & this, struct thread$ * t ) { unpark(t); }
    599 static inline size_t on_wait(mcs_block_spin_lock & this) { unlock(this); return 0; }
    600 static inline void on_wakeup(mcs_block_spin_lock & this, size_t recursion ) {
    601         #ifdef REACQ
    602         lock(this);
    603         #endif
    604 }
     702DEFAULT_ON_NOTIFY( mcs_block_spin_lock )
     703DEFAULT_ON_WAIT( mcs_block_spin_lock )
     704DEFAULT_ON_WAKEUP_REACQ( mcs_block_spin_lock )
    605705
    606706//-----------------------------------------------------------------------------
     
    628728
    629729// if this is called recursively IT WILL DEADLOCK!!!!!
    630 static inline void lock(block_spin_lock & this) with(this) {
     730static inline void lock( block_spin_lock & this ) with(this) {
    631731        lock( lock );
    632732        while(__atomic_load_n(&held, __ATOMIC_SEQ_CST)) Pause();
     
    635735}
    636736
    637 static inline void unlock(block_spin_lock & this) with(this) {
     737static inline void unlock( block_spin_lock & this ) with(this) {
    638738        __atomic_store_n(&held, false, __ATOMIC_RELEASE);
    639739}
    640740
    641 static inline void on_notify(block_spin_lock & this, struct thread$ * t ) with(this.lock) {
    642   #ifdef REACQ
     741static inline void on_notify( block_spin_lock & this, struct thread$ * t ) with(this.lock) {
    643742        // first we acquire internal fast_block_lock
    644743        lock( lock __cfaabi_dbg_ctx2 );
     
    652751        unlock( lock );
    653752
    654   #endif
    655753        unpark(t);
    656        
    657 }
    658 static inline size_t on_wait(block_spin_lock & this) { unlock(this); return 0; }
    659 static inline void on_wakeup(block_spin_lock & this, size_t recursion ) with(this) {
    660   #ifdef REACQ
     754}
     755DEFAULT_ON_WAIT( block_spin_lock )
     756static inline void on_wakeup( block_spin_lock & this, size_t recursion ) with(this) {
    661757        // now we acquire the entire block_spin_lock upon waking up
    662758        while(__atomic_load_n(&held, __ATOMIC_SEQ_CST)) Pause();
    663759        __atomic_store_n(&held, true, __ATOMIC_RELEASE);
     664760        unlock( lock ); // now we release the internal fast_block_lock
    665   #endif
    666 }
    667 
    668 //-----------------------------------------------------------------------------
    669 // is_blocking_lock
    670 trait is_blocking_lock(L & | sized(L)) {
    671         // For synchronization locks to use when acquiring
    672         void on_notify( L &, struct thread$ * );
    673 
    674         // For synchronization locks to use when releasing
    675         size_t on_wait( L & );
    676 
    677         // to set recursion count after getting signalled;
    678         void on_wakeup( L &, size_t recursion );
    679 };
     761}
    680762
    681763//-----------------------------------------------------------------------------
     
    685767forall(L & | is_blocking_lock(L)) {
    686768        struct info_thread;
    687 
    688         // // for use by sequence
    689         // info_thread(L) *& Back( info_thread(L) * this );
    690         // info_thread(L) *& Next( info_thread(L) * this );
    691769}
    692770
  • libcfa/src/concurrency/monitor.cfa

    r34b4268 r24d6572  
    1010// Created On       : Thd Feb 23 12:27:26 2017
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Wed Dec  4 07:55:14 2019
    13 // Update Count     : 10
     12// Last Modified On : Sun Feb 19 17:00:59 2023
     13// Update Count     : 12
    1414//
    1515
    1616#define __cforall_thread__
    17 #define _GNU_SOURCE
    1817
    1918#include "monitor.hfa"
  • libcfa/src/concurrency/monitor.hfa

    r34b4268 r24d6572  
    1010// Created On       : Thd Feb 23 12:27:26 2017
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Wed Dec  4 07:55:32 2019
    13 // Update Count     : 11
     12// Last Modified On : Thu Feb  2 11:29:21 2023
     13// Update Count     : 12
    1414//
    1515
     
    2222#include "stdlib.hfa"
    2323
    24 trait is_monitor(T &) {
     24forall( T & )
     25trait is_monitor {
    2526        monitor$ * get_monitor( T & );
    2627        void ^?{}( T & mutex );
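Several headers in this changeset (monitor.hfa, mutex.hfa, mutex_stmt.hfa, thread.hfa) migrate traits from the postfix parameter form, trait name(T &) { ... }, to a leading forall clause. A minimal sketch of the new declaration style, with hypothetical trait and function names:

// Hypothetical trait, shown only to illustrate the updated declaration style.
forall( T & )
trait is_printable {
	void print( T & );
};

// Assertion syntax at use sites is unchanged.
forall( T & | is_printable(T) )
void print_twice( T & x ) { print( x ); print( x ); }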
  • libcfa/src/concurrency/mutex.cfa

    r34b4268 r24d6572  
    1212// Created On       : Fri May 25 01:37:11 2018
    1313// Last Modified By : Peter A. Buhr
    14 // Last Modified On : Wed Dec  4 09:16:39 2019
    15 // Update Count     : 1
     14// Last Modified On : Sun Feb 19 17:01:36 2023
     15// Update Count     : 3
    1616//
    1717
    1818#define __cforall_thread__
    19 #define _GNU_SOURCE
    2019
    2120#include "mutex.hfa"
  • libcfa/src/concurrency/mutex.hfa

    r34b4268 r24d6572  
    1212// Created On       : Fri May 25 01:24:09 2018
    1313// Last Modified By : Peter A. Buhr
    14 // Last Modified On : Wed Dec  4 09:16:53 2019
    15 // Update Count     : 1
     14// Last Modified On : Thu Feb  2 11:46:08 2023
     15// Update Count     : 2
    1616//
    1717
     
    7070void unlock(recursive_mutex_lock & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
    7171
    72 trait is_lock(L & | sized(L)) {
     72forall( L & | sized(L) )
     73trait is_lock {
    7374        void lock  (L &);
    7475        void unlock(L &);
  • libcfa/src/concurrency/mutex_stmt.hfa

    r34b4268 r24d6572  
     1#pragma once
     2
    13#include "bits/algorithm.hfa"
    24#include "bits/defs.hfa"
     
    46//-----------------------------------------------------------------------------
    57// is_lock
    6 trait is_lock(L & | sized(L)) {
     8forall(L & | sized(L))
     9trait is_lock {
    710        // For acquiring a lock
    811        void lock( L & );
     
    1114        void unlock( L & );
    1215};
    13 
    1416
    1517struct __mutex_stmt_lock_guard {
     
    2426    // Sort locks based on address
    2527    __libcfa_small_sort(this.lockarr, count);
    26 
    27     // acquire locks in order
    28     // for ( size_t i = 0; i < count; i++ ) {
    29     //     lock(*this.lockarr[i]);
    30     // }
    31 }
    32 
    33 static inline void ^?{}( __mutex_stmt_lock_guard & this ) with(this) {
    34     // for ( size_t i = count; i > 0; i-- ) {
    35     //     unlock(*lockarr[i - 1]);
    36     // }
    3728}
    3829
    3930forall(L & | is_lock(L)) {
    40 
    41     struct scoped_lock {
    42         L * internal_lock;
    43     };
    44 
    45     static inline void ?{}( scoped_lock(L) & this, L & internal_lock ) {
    46         this.internal_lock = &internal_lock;
    47         lock(internal_lock);
    48     }
    49    
    50     static inline void ^?{}( scoped_lock(L) & this ) with(this) {
    51         unlock(*internal_lock);
    52     }
    53 
    54     static inline void * __get_mutexstmt_lock_ptr( L & this ) {
    55         return &this;
    56     }
    57 
    58     static inline L __get_mutexstmt_lock_type( L & this );
    59 
    60     static inline L __get_mutexstmt_lock_type( L * this );
     31    static inline void * __get_mutexstmt_lock_ptr( L & this ) { return &this; }
     32    static inline L __get_mutexstmt_lock_type( L & this ) {}
     33    static inline L __get_mutexstmt_lock_type( L * this ) {}
    6134}
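The guard above sorts the requested locks by address before acquiring them, giving every mutex statement the same canonical acquisition order. A hedged usage sketch, assuming the mutex( ... ) { ... } statement form and reusing a lock type from this changeset; the function names are hypothetical.

// Usage sketch only: because the guard sorts by address, these two statements
// acquire a and b in the same order and cannot deadlock against each other.
simple_owner_lock a, b;

void first()  { mutex( a, b ) { /* both locks held */ } }
void second() { mutex( b, a ) { /* same acquisition order as first() */ } }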
  • libcfa/src/concurrency/preemption.cfa

    r34b4268 r24d6572  
    1010// Created On       : Mon Jun 5 14:20:42 2017
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Thu Feb 17 11:18:57 2022
    13 // Update Count     : 59
     12// Last Modified On : Mon Jan  9 08:42:59 2023
     13// Update Count     : 60
    1414//
    1515
    1616#define __cforall_thread__
    17 #define _GNU_SOURCE
    1817
    1918// #define __CFA_DEBUG_PRINT_PREEMPTION__
     
    118117                __cfadbg_print_buffer_decl( preemption, " KERNEL: preemption tick %lu\n", currtime.tn);
    119118                Duration period = node->period;
    120                 if( period == 0) {
     119                if( period == 0 ) {
    121120                        node->set = false;                  // Node is one-shot, just mark it as not pending
    122121                }
  • libcfa/src/concurrency/pthread.cfa

    r34b4268 r24d6572  
    1515
    1616#define __cforall_thread__
    17 #define _GNU_SOURCE
    1817
    1918#include <signal.h>
     
    3534struct pthread_values{
    3635        inline Seqable;
    37         void* value;
     36        void * value;
    3837        bool in_use;
    3938};
     
    5150struct pthread_keys {
    5251        bool in_use;
    53         void (*destructor)( void * );
     52        void (* destructor)( void * );
    5453        Sequence(pthread_values) threads;
    5554};
    5655
    57 static void ?{}(pthread_keys& k){
     56static void ?{}(pthread_keys& k) {
    5857        k.threads{};
    5958}
     
    6261static pthread_keys cfa_pthread_keys_storage[PTHREAD_KEYS_MAX] __attribute__((aligned (16)));
    6362
    64 static void init_pthread_storage(){
    65         for (int i = 0; i < PTHREAD_KEYS_MAX; i++){
     63static void init_pthread_storage() {
     64        for ( int i = 0; i < PTHREAD_KEYS_MAX; i++ ) {
    6665                cfa_pthread_keys_storage[i]{};
    6766        }
     
    9695
    9796/* condvar helper routines */
    98 static void init(pthread_cond_t* pcond){
     97static void init(pthread_cond_t * pcond) {
     9998        static_assert(sizeof(pthread_cond_t) >= sizeof(cfa2pthr_cond_var_t),"sizeof(pthread_cond_t) < sizeof(cfa2pthr_cond_var_t)");
    100         cfa2pthr_cond_var_t* _cond = (cfa2pthr_cond_var_t*)pcond;
     99        cfa2pthr_cond_var_t * _cond = (cfa2pthr_cond_var_t *)pcond;
    101100        ?{}(*_cond);
    102101}
    103102
    104 static cfa2pthr_cond_var_t* get(pthread_cond_t* pcond){
     103static cfa2pthr_cond_var_t * get(pthread_cond_t * pcond) {
     105104        static_assert(sizeof(pthread_cond_t) >= sizeof(cfa2pthr_cond_var_t),"sizeof(pthread_cond_t) < sizeof(cfa2pthr_cond_var_t)");
    106         return (cfa2pthr_cond_var_t*)pcond;
    107 }
    108 
    109 static void destroy(pthread_cond_t* cond){
     105        return (cfa2pthr_cond_var_t *)pcond;
     106}
     107
     108static void destroy(pthread_cond_t * cond) {
     110109        static_assert(sizeof(pthread_cond_t) >= sizeof(cfa2pthr_cond_var_t),"sizeof(pthread_cond_t) < sizeof(cfa2pthr_cond_var_t)");
    111110        ^?{}(*get(cond));
     
    116115
    117116/* mutex helper routines */
    118 static void mutex_check(pthread_mutex_t* t){
     117static void mutex_check(pthread_mutex_t * t) {
    119118        // Use double check to improve performance.
    120119        // Check is safe on x86; volatile prevents compiler reordering
    121         volatile pthread_mutex_t *const mutex_ = t;
     120        volatile pthread_mutex_t * const mutex_ = t;
    122121
    123122        // SKULLDUGGERY: not a portable way to access the kind field, /usr/include/x86_64-linux-gnu/bits/pthreadtypes.h
     
    136135
    137136
    138 static void init(pthread_mutex_t* plock){
     137static void init(pthread_mutex_t * plock) {
    139138        static_assert(sizeof(pthread_mutex_t) >= sizeof(simple_owner_lock),"sizeof(pthread_mutex_t) < sizeof(simple_owner_lock)");
    140         simple_owner_lock* _lock = (simple_owner_lock*)plock;
     139        simple_owner_lock * _lock = (simple_owner_lock *)plock;
    141140        ?{}(*_lock);
    142141}
    143142
    144 static simple_owner_lock* get(pthread_mutex_t* plock){
     143static simple_owner_lock * get(pthread_mutex_t * plock) {
    145144        static_assert(sizeof(pthread_mutex_t) >= sizeof(simple_owner_lock),"sizeof(pthread_mutex_t) < sizeof(simple_owner_lock)");
    146         return (simple_owner_lock*)plock;
    147 }
    148 
    149 static void destroy(pthread_mutex_t* plock){
     145        return (simple_owner_lock *)plock;
     146}
     147
     148static void destroy(pthread_mutex_t * plock) {
    150149        static_assert(sizeof(pthread_mutex_t) >= sizeof(simple_owner_lock),"sizeof(pthread_mutex_t) < sizeof(simple_owner_lock)");
    151150        ^?{}(*get(plock));
     
    153152
    154153//######################### Attr helpers #########################
    155 struct cfaPthread_attr_t {                                                              // thread attributes
     154typedef struct cfaPthread_attr_t {                                              // thread attributes
    156155                int contentionscope;
    157156                int detachstate;
    158157                size_t stacksize;
    159                 void *stackaddr;
     158                void * stackaddr;
    160159                int policy;
    161160                int inheritsched;
    162161                struct sched_param param;
    163 } typedef cfaPthread_attr_t;
    164 
    165 static const cfaPthread_attr_t default_attrs{
     162} cfaPthread_attr_t;
     163
     164static const cfaPthread_attr_t default_attrs {
    166165        0,
    167166        0,
    168         (size_t)65000,
    169         (void *)NULL,
     167        65_000,
     168        NULL,
    170169        0,
    171170        0,
     
    173172};
    174173
    175 static cfaPthread_attr_t* get(const pthread_attr_t* attr){
    176         static_assert(sizeof(pthread_attr_t) >= sizeof(cfaPthread_attr_t),"sizeof(pthread_attr_t) < sizeof(cfaPthread_attr_t)");
    177         return (cfaPthread_attr_t*)attr;
     174static cfaPthread_attr_t * get(const pthread_attr_t * attr) {
     175        static_assert(sizeof(pthread_attr_t) >= sizeof(cfaPthread_attr_t), "sizeof(pthread_attr_t) < sizeof(cfaPthread_attr_t)");
     176        return (cfaPthread_attr_t *)attr;
    178177}
    179178
     
    190189
    191190        // pthreads return value
    192         void *joinval;
     191        void * joinval;
    193192
    194193        // pthread attributes
    195194        pthread_attr_t pthread_attr;
    196195
    197         void *(*start_routine)(void *);
    198         void *start_arg;
     196        void *(* start_routine)(void *);
     197        void * start_arg;
    199198
    200199        // thread local data
    201         pthread_values* pthreadData;
     200        pthread_values * pthreadData;
    202201
    203202        // flag used for tryjoin
     
    207206/* thread part routines */
    208207//  cfaPthread entry point
    209 void main(cfaPthread& _thread) with(_thread){
    210         joinval =  start_routine(start_arg);
     208void main(cfaPthread & _thread) with(_thread) {
     209        joinval = start_routine(start_arg);
    211210        isTerminated = true;
    212211}
    213212
    214 static cfaPthread *lookup( pthread_t p ){
    215         static_assert(sizeof(pthread_t) >= sizeof(cfaPthread*),"sizeof(pthread_t) < sizeof(cfaPthread*)");
    216         return (cfaPthread*)p;
    217 }
    218 
    219 static void pthread_deletespecific_( pthread_values* values )  { // see uMachContext::invokeTask
    220         pthread_values* value;
    221         pthread_keys* key;
     213static cfaPthread * lookup( pthread_t p ) {
     214        static_assert(sizeof(pthread_t) >= sizeof(cfaPthread *),"sizeof(pthread_t) < sizeof(cfaPthread *)");
     215        return (cfaPthread *)p;
     216}
     217
     218static void pthread_deletespecific_( pthread_values * values )  { // see uMachContext::invokeTask
     219        pthread_values * value;
     220        pthread_keys * key;
    222221        bool destcalled = true;
    223         if (values != NULL){
     222        if (values != NULL) {
    224223                for ( int attempts = 0; attempts < PTHREAD_DESTRUCTOR_ITERATIONS && destcalled ; attempts += 1 ) {
    225224                        destcalled = false;
    226225                        lock(key_lock);
    227                         for (int i = 0; i < PTHREAD_KEYS_MAX; i++){
     226                        for ( int i = 0; i < PTHREAD_KEYS_MAX; i++ ) {
    228227                                // for each valid key
    229                                 if ( values[i].in_use){
     228                                if ( values[i].in_use) {
    230229                                        value = &values[i];
    231230                                        key = &cfa_pthread_keys[i];
     
    234233                                        // if  a  key  value  has  a  non-NULL  destructor pointer,  and  the  thread  has  a  non-NULL  value associated with that key,
    235234                                        // the value of the key is set to NULL, and then the function pointed to is called with the previously associated value as its sole argument.
    236                                         if (value->value != NULL && key->destructor != NULL){
     235                                        if (value->value != NULL && key->destructor != NULL) {
    237236                                                unlock(key_lock);
    238237                                                key->destructor(value->value); // run destructor
     
    249248}
    250249
    251 static void ^?{}(cfaPthread & mutex t){
     250static void ^?{}(cfaPthread & mutex t) {
    252251        // delete pthread local storage
    253252        pthread_values * values = t.pthreadData;
     
    255254}
    256255
    257 static void ?{}(cfaPthread &t, pthread_t* _thread, const pthread_attr_t * _attr,void *(*start_routine)(void *), void * arg) {
    258         static_assert(sizeof(pthread_t) >= sizeof(cfaPthread*), "pthread_t too small to hold a pointer: sizeof(pthread_t) < sizeof(cfaPthread*)");
     256static void ?{}(cfaPthread & t, pthread_t * _thread, const pthread_attr_t * _attr,void *(* start_routine)(void *), void * arg) {
     257        static_assert(sizeof(pthread_t) >= sizeof(cfaPthread *), "pthread_t too small to hold a pointer: sizeof(pthread_t) < sizeof(cfaPthread *)");
    259258
    260259        // set up user thread stackSize
     
    278277        //######################### Pthread Attrs #########################
    279278
    280         int pthread_attr_init(pthread_attr_t *attr) libcfa_public __THROW {
    281                 cfaPthread_attr_t* _attr = get(attr);
     279        int pthread_attr_init(pthread_attr_t * attr) libcfa_public __THROW {
     280                cfaPthread_attr_t * _attr = get(attr);
    282281                ?{}(*_attr, default_attrs);
    283282                return 0;
    284283        }
    285         int pthread_attr_destroy(pthread_attr_t *attr) libcfa_public __THROW {
     284        int pthread_attr_destroy(pthread_attr_t * attr) libcfa_public __THROW {
    286285                ^?{}(*get(attr));
    287286                return 0;
    288287        }
    289288
    290         int pthread_attr_setscope( pthread_attr_t *attr, int contentionscope ) libcfa_public __THROW {
     289        int pthread_attr_setscope( pthread_attr_t * attr, int contentionscope ) libcfa_public __THROW {
    291290                get( attr )->contentionscope = contentionscope;
    292291                return 0;
    293292        } // pthread_attr_setscope
    294293
    295         int pthread_attr_getscope( const pthread_attr_t *attr, int *contentionscope ) libcfa_public __THROW {
     294        int pthread_attr_getscope( const pthread_attr_t * attr, int * contentionscope ) libcfa_public __THROW {
    296295                *contentionscope = get( attr )->contentionscope;
    297296                return 0;
    298297        } // pthread_attr_getscope
    299298
    300         int pthread_attr_setdetachstate( pthread_attr_t *attr, int detachstate ) libcfa_public __THROW {
     299        int pthread_attr_setdetachstate( pthread_attr_t * attr, int detachstate ) libcfa_public __THROW {
    301300                get( attr )->detachstate = detachstate;
    302301                return 0;
    303302        } // pthread_attr_setdetachstate
    304303
    305         int pthread_attr_getdetachstate( const pthread_attr_t *attr, int *detachstate ) libcfa_public __THROW {
     304        int pthread_attr_getdetachstate( const pthread_attr_t * attr, int * detachstate ) libcfa_public __THROW {
    306305                *detachstate = get( attr )->detachstate;
    307306                return 0;
    308307        } // pthread_attr_getdetachstate
    309308
    310         int pthread_attr_setstacksize( pthread_attr_t *attr, size_t stacksize ) libcfa_public __THROW {
     309        int pthread_attr_setstacksize( pthread_attr_t * attr, size_t stacksize ) libcfa_public __THROW {
    311310                get( attr )->stacksize = stacksize;
    312311                return 0;
    313312        } // pthread_attr_setstacksize
    314313
    315         int pthread_attr_getstacksize( const pthread_attr_t *attr, size_t *stacksize ) libcfa_public __THROW {
     314        int pthread_attr_getstacksize( const pthread_attr_t * attr, size_t * stacksize ) libcfa_public __THROW {
    316315                *stacksize = get( attr )->stacksize;
    317316                return 0;
     
    326325        } // pthread_attr_setguardsize
    327326
    328         int pthread_attr_setstackaddr( pthread_attr_t *attr, void *stackaddr ) libcfa_public __THROW {
     327        int pthread_attr_setstackaddr( pthread_attr_t * attr, void * stackaddr ) libcfa_public __THROW {
    329328                get( attr )->stackaddr = stackaddr;
    330329                return 0;
    331330        } // pthread_attr_setstackaddr
    332331
    333         int pthread_attr_getstackaddr( const pthread_attr_t *attr, void **stackaddr ) libcfa_public __THROW {
     332        int pthread_attr_getstackaddr( const pthread_attr_t * attr, void ** stackaddr ) libcfa_public __THROW {
    334333                *stackaddr = get( attr )->stackaddr;
    335334                return 0;
    336335        } // pthread_attr_getstackaddr
    337336
    338         int pthread_attr_setstack( pthread_attr_t *attr, void *stackaddr, size_t stacksize ) libcfa_public __THROW {
     337        int pthread_attr_setstack( pthread_attr_t * attr, void * stackaddr, size_t stacksize ) libcfa_public __THROW {
    339338                get( attr )->stackaddr = stackaddr;
    340339                get( attr )->stacksize = stacksize;
     
    342341        } // pthread_attr_setstack
    343342
    344         int pthread_attr_getstack( const pthread_attr_t *attr, void **stackaddr, size_t *stacksize ) libcfa_public __THROW {
     343        int pthread_attr_getstack( const pthread_attr_t * attr, void ** stackaddr, size_t * stacksize ) libcfa_public __THROW {
    345344                *stackaddr = get( attr )->stackaddr;
    346345                *stacksize = get( attr )->stacksize;
     
     351350        // already running thread threadID. It shall be called on an uninitialized attr
    352351        // and destroyed with pthread_attr_destroy when no longer needed.
    353         int pthread_getattr_np( pthread_t threadID, pthread_attr_t *attr ) libcfa_public __THROW { // GNU extension
     352        int pthread_getattr_np( pthread_t threadID, pthread_attr_t * attr ) libcfa_public __THROW { // GNU extension
    354353                check_nonnull(attr);
    355354
     
    363362        //######################### Threads #########################
    364363
    365         int pthread_create(pthread_t * _thread, const pthread_attr_t * attr, void *(*start_routine)(void *), void * arg) libcfa_public __THROW {
    366                 cfaPthread *t = alloc();
     364        int pthread_create(pthread_t * _thread, const pthread_attr_t * attr, void *(* start_routine)(void *), void * arg) libcfa_public __THROW {
     365                cfaPthread * t = alloc();
    367366                (*t){_thread, attr, start_routine, arg};
    368367                return 0;
    369368        }
    370369
    371 
    372         int pthread_join(pthread_t _thread, void **value_ptr) libcfa_public __THROW {
     370        int pthread_join(pthread_t _thread, void ** value_ptr) libcfa_public __THROW {
    373371                // if thread is invalid
    374372                if (_thread == NULL) return EINVAL;
     
    376374
    377375                // get user thr pointer
    378                 cfaPthread* p = lookup(_thread);
     376                cfaPthread * p = lookup(_thread);
    379377                try {
    380378                        join(*p);
     
    389387        }
    390388
    391         int pthread_tryjoin_np(pthread_t _thread, void **value_ptr) libcfa_public __THROW {
     389        int pthread_tryjoin_np(pthread_t _thread, void ** value_ptr) libcfa_public __THROW {
    392390                // if thread is invalid
    393391                if (_thread == NULL) return EINVAL;
    394392                if (_thread == pthread_self()) return EDEADLK;
    395393
    396                 cfaPthread* p = lookup(_thread);
     394                cfaPthread * p = lookup(_thread);
    397395
    398396                // thread not finished ?
     
    412410        void pthread_exit(void * status) libcfa_public __THROW {
    413411                pthread_t pid = pthread_self();
    414                 cfaPthread* _thread = (cfaPthread*)pid;
     412                cfaPthread * _thread = (cfaPthread *)pid;
    415413                _thread->joinval = status;  // set return value
    416414                _thread->isTerminated = 1;  // set terminated flag
     
    426424        //######################### Mutex #########################
    427425
    428         int pthread_mutex_init(pthread_mutex_t *_mutex, const pthread_mutexattr_t *attr) libcfa_public __THROW {
     426        int pthread_mutex_init(pthread_mutex_t *_mutex, const pthread_mutexattr_t * attr) libcfa_public __THROW {
    429427                check_nonnull(_mutex);
    430428                init(_mutex);
     
    435433        int pthread_mutex_destroy(pthread_mutex_t *_mutex) libcfa_public __THROW {
    436434                check_nonnull(_mutex);
    437                 simple_owner_lock* _lock = get(_mutex);
    438                 if (_lock->owner != NULL){
     435                simple_owner_lock * _lock = get(_mutex);
     436                if (_lock->owner != NULL) {
    439437                        return EBUSY;
    440438                }
     
    446444                check_nonnull(_mutex);
    447445                mutex_check(_mutex);
    448                 simple_owner_lock* _lock = get(_mutex);
     446                simple_owner_lock * _lock = get(_mutex);
    449447                lock(*_lock);
    450448                return 0;
     
    453451        int pthread_mutex_unlock(pthread_mutex_t *_mutex) libcfa_public __THROW {
    454452                check_nonnull(_mutex);
    455                 simple_owner_lock* _lock = get(_mutex);
    456                 if (_lock->owner != active_thread()){
     453                simple_owner_lock * _lock = get(_mutex);
     454                if (_lock->owner != active_thread()) {
    457455                        return EPERM;
    458456                } // current thread does not hold the mutex
     
    463461        int pthread_mutex_trylock(pthread_mutex_t *_mutex) libcfa_public __THROW {
    464462                check_nonnull(_mutex);
    465                 simple_owner_lock* _lock = get(_mutex);
    466                 if (_lock->owner != active_thread() && _lock->owner != NULL){
     463                simple_owner_lock * _lock = get(_mutex);
     464                if (_lock->owner != active_thread() && _lock->owner != NULL) {
    467465                        return EBUSY;
    468466                }   // if mutex is owned
     
    474472
    475473        /* conditional variable routines */
    476         int pthread_cond_init(pthread_cond_t *cond, const pthread_condattr_t *attr) libcfa_public __THROW {
     474        int pthread_cond_init(pthread_cond_t * cond, const pthread_condattr_t * attr) libcfa_public __THROW {
    477475                check_nonnull(cond);
    478476                init(cond);
     
    480478        }  //pthread_cond_init
    481479
    482         int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *_mutex) libcfa_public __THROW {
     480        int pthread_cond_wait(pthread_cond_t * cond, pthread_mutex_t *_mutex) libcfa_public __THROW {
    483481                check_nonnull(_mutex);
    484482                check_nonnull(cond);
     
    494492        } // pthread_cond_timedwait
    495493
    496         int pthread_cond_signal(pthread_cond_t *cond) libcfa_public __THROW {
     494        int pthread_cond_signal(pthread_cond_t * cond) libcfa_public __THROW {
    497495                check_nonnull(cond);
    498496                return notify_one(*get(cond));
    499497        } // pthread_cond_signal
    500498
    501         int pthread_cond_broadcast(pthread_cond_t *cond) libcfa_public __THROW {
     499        int pthread_cond_broadcast(pthread_cond_t * cond) libcfa_public __THROW {
    502500                check_nonnull(cond);
    503501                return notify_all(*get(cond));
    504502        } // pthread_cond_broadcast
    505503
    506         int pthread_cond_destroy(pthread_cond_t *cond) libcfa_public __THROW {
     504        int pthread_cond_destroy(pthread_cond_t * cond) libcfa_public __THROW {
    507505                check_nonnull(cond);
    508506                destroy(cond);
     
    514512        //######################### Local storage #########################
    515513
    516         int pthread_once(pthread_once_t *once_control, void (*init_routine)(void)) libcfa_public __THROW {
     514        int pthread_once(pthread_once_t * once_control, void (* init_routine)(void)) libcfa_public __THROW {
    517515                static_assert(sizeof(pthread_once_t) >= sizeof(int),"sizeof(pthread_once_t) < sizeof(int)");
    518516                check_nonnull(once_control);
     
    527525        } // pthread_once
    528526
    529         int pthread_key_create( pthread_key_t *key, void (*destructor)( void * ) ) libcfa_public __THROW {
     527        int pthread_key_create( pthread_key_t * key, void (* destructor)( void * ) ) libcfa_public __THROW {
    530528                lock(key_lock);
    531529                for ( int i = 0; i < PTHREAD_KEYS_MAX; i += 1 ) {
     
    562560        }   // pthread_key_delete
    563561
    564         int pthread_setspecific( pthread_key_t key, const void *value ) libcfa_public __THROW {
     562        int pthread_setspecific( pthread_key_t key, const void * value ) libcfa_public __THROW {
    565563                // get current thread
    566                 cfaPthread* t = lookup(pthread_self());
     564                cfaPthread * t = lookup(pthread_self());
    567565                // if current thread's pthreadData is NULL; initialize it
    568                 pthread_values* values;
    569                 if (t->pthreadData == NULL){
     566                pthread_values * values;
     567                if (t->pthreadData == NULL) {
    570568                        values = anew( PTHREAD_KEYS_MAX);
    571569                        t->pthreadData = values;
    572                         for (int i = 0;i < PTHREAD_KEYS_MAX; i++){
     570                        for ( int i = 0;i < PTHREAD_KEYS_MAX; i++ ) {
    573571                                t->pthreadData[i].in_use = false;
    574572                        }   // for
     
    593591        } //pthread_setspecific
    594592
    595         void* pthread_getspecific(pthread_key_t key) libcfa_public __THROW {
     593        void * pthread_getspecific(pthread_key_t key) libcfa_public __THROW {
    596594                if (key >= PTHREAD_KEYS_MAX || ! cfa_pthread_keys[key].in_use) return NULL;
    597595
    598596                // get current thread
    599                 cfaPthread* t = lookup(pthread_self());
     597                cfaPthread * t = lookup(pthread_self());
    600598                if (t->pthreadData == NULL) return NULL;
    601599                lock(key_lock);
    602                 pthread_values &entry = ((pthread_values *)t->pthreadData)[key];
     600                pthread_values & entry = ((pthread_values *)t->pthreadData)[key];
    603601                if ( ! entry.in_use ) {
    604602                        unlock( key_lock );
    605603                        return NULL;
    606604                } // if
    607                 void *value = entry.value;
     605                void * value = entry.value;
    608606                unlock(key_lock);
    609607
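The key routines above emulate POSIX thread-specific storage on top of per-thread pthread_values arrays. For context, a standard usage sketch of that API; the key name, stored value, and helper functions are hypothetical, and the behaviour shown is plain POSIX rather than anything CFA-specific.

// Usage sketch only: create a key with a destructor, set a per-thread value,
// and read it back later on the same thread.
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

static pthread_key_t request_id_key;

static void setup( void ) {
	pthread_key_create( &request_id_key, free );               // free() runs on the stored value at thread exit
	pthread_setspecific( request_id_key, strdup( "req-42" ) );
}

static void use( void ) {
	char * id = pthread_getspecific( request_id_key );         // NULL if never set on this thread
	(void)id;
}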
     
    875873        //######################### Parallelism #########################
    876874
    877         int pthread_setaffinity_np( pthread_t /* __th */, size_t /* __cpusetsize */, __const cpu_set_t * /* __cpuset */ ) libcfa_public __THROW {
    878                 abort( "pthread_setaffinity_np" );
    879         } // pthread_setaffinity_np
    880 
    881         int pthread_getaffinity_np( pthread_t /* __th */, size_t /* __cpusetsize */, cpu_set_t * /* __cpuset */ ) libcfa_public __THROW {
    882                 abort( "pthread_getaffinity_np" );
    883         } // pthread_getaffinity_np
    884 
    885         int pthread_attr_setaffinity_np( pthread_attr_t * /* __attr */, size_t /* __cpusetsize */, __const cpu_set_t * /* __cpuset */ ) libcfa_public __THROW {
    886                 abort( "pthread_attr_setaffinity_np" );
    887         } // pthread_attr_setaffinity_np
    888 
    889         int pthread_attr_getaffinity_np( __const pthread_attr_t * /* __attr */, size_t /* __cpusetsize */, cpu_set_t * /* __cpuset */ ) libcfa_public __THROW {
    890                 abort( "pthread_attr_getaffinity_np" );
    891         } // pthread_attr_getaffinity_np
     875        // int pthread_setaffinity_np( pthread_t /* __th */, size_t /* __cpusetsize */, __const cpu_set_t * /* __cpuset */ ) libcfa_public __THROW {
     876        //      abort( "pthread_setaffinity_np" );
     877        // } // pthread_setaffinity_np
     878
     879        // int pthread_getaffinity_np( pthread_t /* __th */, size_t /* __cpusetsize */, cpu_set_t * /* __cpuset */ ) libcfa_public __THROW {
     880        //      abort( "pthread_getaffinity_np" );
     881        // } // pthread_getaffinity_np
     882
     883        // int pthread_attr_setaffinity_np( pthread_attr_t * /* __attr */, size_t /* __cpusetsize */, __const cpu_set_t * /* __cpuset */ ) libcfa_public __THROW {
     884        //      abort( "pthread_attr_setaffinity_np" );
     885        // } // pthread_attr_setaffinity_np
     886
     887        // int pthread_attr_getaffinity_np( __const pthread_attr_t * /* __attr */, size_t /* __cpusetsize */, cpu_set_t * /* __cpuset */ ) libcfa_public __THROW {
     888        //      abort( "pthread_attr_getaffinity_np" );
     889        // } // pthread_attr_getaffinity_np
    892890
    893891        //######################### Cancellation #########################
     
    906904        } // pthread_cancel
    907905
    908         int pthread_setcancelstate( int state, int *oldstate ) libcfa_public __THROW {
     906        int pthread_setcancelstate( int state, int * oldstate ) libcfa_public __THROW {
    909907                abort("pthread_setcancelstate not implemented");
    910908                return 0;
    911909        } // pthread_setcancelstate
    912910
    913         int pthread_setcanceltype( int type, int *oldtype ) libcfa_public __THROW {
     911        int pthread_setcanceltype( int type, int * oldtype ) libcfa_public __THROW {
    914912                abort("pthread_setcanceltype not implemented");
    915913                return 0;
     
    918916
    919917#pragma GCC diagnostic pop
    920 
  • libcfa/src/concurrency/ready_queue.cfa

    r34b4268 r24d6572  
    1515
    1616#define __cforall_thread__
    17 #define _GNU_SOURCE
    1817
    1918// #define __CFA_DEBUG_PRINT_READY_QUEUE__
  • libcfa/src/concurrency/thread.cfa

    r34b4268 r24d6572  
    1010// Created On       : Tue Jan 17 12:27:26 2017
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Sun Dec 11 20:56:54 2022
    13 // Update Count     : 102
     12// Last Modified On : Mon Jan  9 08:42:33 2023
     13// Update Count     : 103
    1414//
    1515
    1616#define __cforall_thread__
    17 #define _GNU_SOURCE
    1817
    1918#include "thread.hfa"
     
    5453        preferred = ready_queue_new_preferred();
    5554        last_proc = 0p;
     55    link_node = 0p;
    5656        PRNG_SET_SEED( random_state, __global_random_mask ? __global_random_prime : __global_random_prime ^ rdtscl() );
    5757        #if defined( __CFA_WITH_VERIFY__ )
     
    6060        #endif
    6161
    62         clh_node = malloc( );
    63         *clh_node = false;
    64 
    6562        doregister(curr_cluster, this);
    6663        monitors{ &self_mon_p, 1, (fptr_t)0 };
     
    7168                canary = 0xDEADDEADDEADDEADp;
    7269        #endif
    73         free(clh_node);
    7470        unregister(curr_cluster, this);
    7571        ^self_cor{};
  • libcfa/src/concurrency/thread.hfa

    r34b4268 r24d6572  
    1010// Created On       : Tue Jan 17 12:27:26 2017
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Tue Nov 22 22:18:34 2022
    13 // Update Count     : 35
     12// Last Modified On : Thu Feb  2 11:27:59 2023
     13// Update Count     : 37
    1414//
    1515
     
    2727//-----------------------------------------------------------------------------
    2828// thread trait
    29 trait is_thread(T &) {
     29forall( T & )
     30trait is_thread {
    3031        void ^?{}(T& mutex this);
    3132        void main(T& this);