Changeset b110bcc for libcfa


Ignore:
Timestamp:
Apr 21, 2023, 5:36:12 PM (3 years ago)
Author:
JiadaL <j82liang@…>
Branches:
ADT, master, stuck-waitfor-destruct
Children:
28f8f15, 6e4c44d
Parents:
2ed94a9 (diff), 699a97d (diff)
Note: this is a merge changeset; the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all of the changes relative to each parent.
Message:

Merge branch 'master' of plg.uwaterloo.ca:software/cfa/cfa-cc

Location:
libcfa/src
Files:
1 added
39 edited

Legend:

Unmodified
Added
Removed
  • libcfa/src/Makefile.am

    r2ed94a9 rb110bcc  
    4848        math.hfa \
    4949        time_t.hfa \
     50        virtual_dtor.hfa \
    5051        bits/algorithm.hfa \
    5152        bits/align.hfa \
  • libcfa/src/algorithms/range_iterator.hfa

    r2ed94a9 rb110bcc  
    99// Author           : Thierry Delisle
    1010// Created On       : Tue Nov 30 13:06:22 2021
    11 // Last Modified By :
    12 // Last Modified On :
    13 // Update Count     :
     11// Last Modified By : Peter A. Buhr
     12// Last Modified On : Mon Mar 13 23:10:35 2023
     13// Update Count     : 1
    1414//
     15
     16#pragma once
    1517
    1618generator RangeIter {
  • libcfa/src/bitmanip.hfa

    r2ed94a9 rb110bcc  
    1111// Created On       : Sat Mar 14 18:12:27 2020
    1212// Last Modified By : Peter A. Buhr
    13 // Last Modified On : Sat Oct  8 08:28:15 2022
    14 // Update Count     : 142
     13// Last Modified On : Mon Jan  9 09:02:43 2023
     14// Update Count     : 144
    1515//
    1616
    1717#pragma once
     18
     19#include "bits/debug.hfa"                                                               // verify
    1820
    1921// Reference: Bit Twiddling Hacks: http://graphics.stanford.edu/%7Eseander/bithacks.html#CountBitsSetNaive
  • libcfa/src/bits/random.hfa

    r2ed94a9 rb110bcc  
    1010// Created On       : Fri Jan 14 07:18:11 2022
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Thu Dec 22 20:54:22 2022
    13 // Update Count     : 178
     12// Last Modified On : Mon Mar 20 21:45:24 2023
     13// Update Count     : 186
    1414//
    1515
     
    2828        #define XOSHIRO256PP
    2929        //#define KISS_64
     30    // #define SPLITMIX_64
    3031
    3132        // 32-bit generators
    3233        //#define XORSHIFT_6_21_7
    3334        #define XOSHIRO128PP
     35    // #define SPLITMIX_32
    3436#else                                                                                                   // 32-bit architecture
    3537        // 64-bit generators
    3638        //#define XORSHIFT_13_7_17
    3739        #define XOSHIRO256PP
     40    // #define SPLITMIX_64
    3841
    3942        // 32-bit generators
    4043        //#define XORSHIFT_6_21_7
    4144        #define XOSHIRO128PP
     45    // #define SPLITMIX_32
    4246#endif // __x86_64__
    4347
    4448// Define C/CFA PRNG name and random-state.
    45 
    46 // SKULLDUGGERY: typedefs name struct and typedef with the same name to deal with CFA typedef numbering problem.
    4749
    4850#ifdef XOSHIRO256PP
    4951#define PRNG_NAME_64 xoshiro256pp
    5052#define PRNG_STATE_64_T GLUE(PRNG_NAME_64,_t)
    51 typedef struct PRNG_STATE_64_T { uint64_t s0, s1, s2, s3; } PRNG_STATE_64_T;
     53typedef struct { uint64_t s0, s1, s2, s3; } PRNG_STATE_64_T;
    5254#endif // XOSHIRO256PP
    5355
     
    5557#define PRNG_NAME_32 xoshiro128pp
    5658#define PRNG_STATE_32_T GLUE(PRNG_NAME_32,_t)
    57 typedef struct PRNG_STATE_32_T { uint32_t s0, s1, s2, s3; } PRNG_STATE_32_T;
     59typedef struct { uint32_t s0, s1, s2, s3; } PRNG_STATE_32_T;
    5860#endif // XOSHIRO128PP
    5961
     
    8385#endif // XORSHIFT_12_25_27
    8486
     87#ifdef SPLITMIX_64
     88#define PRNG_NAME_64 splitmix64
     89#define PRNG_STATE_64_T uint64_t
     90#endif // SPLITMIX_64
     91
     92#ifdef SPLITMIX_32
     93#define PRNG_NAME_32 splitmix32
     94#define PRNG_STATE_32_T uint32_t
     95#endif // SPLITMIX32
     96
    8597#ifdef KISS_64
    8698#define PRNG_NAME_64 kiss_64
    8799#define PRNG_STATE_64_T GLUE(PRNG_NAME_64,_t)
    88 typedef struct PRNG_STATE_64_T { uint64_t z, w, jsr, jcong; } PRNG_STATE_64_T;
     100typedef struct { uint64_t z, w, jsr, jcong; } PRNG_STATE_64_T;
    89101#endif // KISS_^64
    90102
     
    92104#define PRNG_NAME_32 xorwow
    93105#define PRNG_STATE_32_T GLUE(PRNG_NAME_32,_t)
    94 typedef struct PRNG_STATE_32_T { uint32_t a, b, c, d, counter; } PRNG_STATE_32_T;
     106typedef struct { uint32_t a, b, c, d, counter; } PRNG_STATE_32_T;
    95107#endif // XOSHIRO128PP
    96108
     
    119131#ifdef __cforall                                                                                // don't include in C code (invoke.h)
    120132
     133// https://rosettacode.org/wiki/Pseudo-random_numbers/Splitmix64
     134//
     135// Splitmix64 is not recommended for demanding random number requirements, but is often used to calculate initial states
     136// for other more complex pseudo-random number generators (see https://prng.di.unimi.it).
     137// Also https://rosettacode.org/wiki/Pseudo-random_numbers/Splitmix64.
     138static inline uint64_t splitmix64( uint64_t & state ) {
     139    state += 0x9e3779b97f4a7c15;
     140    uint64_t z = state;
     141    z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9;
     142    z = (z ^ (z >> 27)) * 0x94d049bb133111eb;
     143    return z ^ (z >> 31);
     144} // splitmix64
     145
     146static inline void splitmix64_set_seed( uint64_t & state , uint64_t seed ) {
     147    state = seed;
     148    splitmix64( state );                                                                // prime
     149} // splitmix64_set_seed
     150
     151// https://github.com/bryc/code/blob/master/jshash/PRNGs.md#splitmix32
     152//
     153// Splitmix32 is not recommended for demanding random number requirements, but is often used to calculate initial states
     154// for other more complex pseudo-random number generators (see https://prng.di.unimi.it).
     155
     156static inline uint32_t splitmix32( uint32_t & state ) {
     157    state += 0x9e3779b9;
     158    uint64_t z = state;
     159    z = (z ^ (z >> 15)) * 0x85ebca6b;
     160    z = (z ^ (z >> 13)) * 0xc2b2ae35;
     161    return z ^ (z >> 16);
     162} // splitmix32
     163
     164static inline void splitmix32_set_seed( uint32_t & state, uint64_t seed ) {
     165    state = seed;
     166    splitmix32( state );                                                                // prime
     167} // splitmix32_set_seed
     168
     169#ifdef __SIZEOF_INT128__
     170//--------------------------------------------------
     171static inline uint64_t lehmer64( __uint128_t & state ) {
     172        __uint128_t ret = state;
     173        state *= 0x_da94_2042_e4dd_58b5;
     174        return ret >> 64;
     175} // lehmer64
     176
     177static inline void lehmer64_set_seed( __uint128_t & state, uint64_t seed ) {
     178        // The seed needs to be coprime with the 2^64 modulus to get the largest period, so no factors of 2 in the seed.
     179        state = splitmix64( seed );                                                     // prime
     180} // lehmer64_set_seed
     181
     182//--------------------------------------------------
     183static inline uint64_t wyhash64( uint64_t & state ) {
     184        uint64_t ret = state;
     185        state += 0x_60be_e2be_e120_fc15;
     186        __uint128_t tmp;
     187        tmp = (__uint128_t) ret * 0x_a3b1_9535_4a39_b70d;
     188        uint64_t m1 = (tmp >> 64) ^ tmp;
     189        tmp = (__uint128_t)m1 * 0x_1b03_7387_12fa_d5c9;
     190        uint64_t m2 = (tmp >> 64) ^ tmp;
     191        return m2;
     192} // wyhash64
     193
     194static inline void wyhash64_set_seed( uint64_t & state, uint64_t seed ) {
     195        state = splitmix64( seed );                                                     // prime
     196} // wyhash64_set_seed
     197#endif // __SIZEOF_INT128__
     198
    121199// https://prng.di.unimi.it/xoshiro256starstar.c
    122200//
     
    130208
    131209#ifndef XOSHIRO256PP
    132 typedef struct xoshiro256pp_t { uint64_t s0, s1, s2, s3; } xoshiro256pp_t;
     210typedef struct { uint64_t s0, s1, s2, s3; } xoshiro256pp_t;
    133211#endif // ! XOSHIRO256PP
    134212
     
    151229
    152230static inline void xoshiro256pp_set_seed( xoshiro256pp_t & state, uint64_t seed ) {
    153         state = (xoshiro256pp_t){ seed, seed, seed, seed };
    154         xoshiro256pp( state );
     231    // To attain repeatable seeding, compute seeds separately because the order of argument evaluation is undefined.
     232    uint64_t seed1 = splitmix64( seed );                                // prime
     233    uint64_t seed2 = splitmix64( seed );
     234    uint64_t seed3 = splitmix64( seed );
     235    uint64_t seed4 = splitmix64( seed );
     236        state = (xoshiro256pp_t){ seed1, seed2, seed3, seed4 };
    155237} // xoshiro256pp_set_seed
    156238
     
    165247
    166248#ifndef XOSHIRO128PP
    167 typedef struct xoshiro128pp_t { uint32_t s0, s1, s2, s3; } xoshiro128pp_t;
     249typedef struct { uint32_t s0, s1, s2, s3; } xoshiro128pp_t;
    168250#endif // ! XOSHIRO128PP
    169251
     
    186268
    187269static inline void xoshiro128pp_set_seed( xoshiro128pp_t & state, uint32_t seed ) {
    188         state = (xoshiro128pp_t){ seed, seed, seed, seed };
    189         xoshiro128pp( state );                                                          // prime
     270    // To attain repeatable seeding, compute seeds separately because the order of argument evaluation is undefined.
     271    uint32_t seed1 = splitmix32( seed );                                // prime
     272    uint32_t seed2 = splitmix32( seed );
     273    uint32_t seed3 = splitmix32( seed );
     274    uint32_t seed4 = splitmix32( seed );
     275        state = (xoshiro128pp_t){ seed1, seed2, seed3, seed4 };
    190276} // xoshiro128pp_set_seed
    191 
    192 #ifdef __SIZEOF_INT128__
    193         //--------------------------------------------------
    194         static inline uint64_t lehmer64( __uint128_t & state ) {
    195                 __uint128_t ret = state;
    196                 state *= 0x_da94_2042_e4dd_58b5;
    197                 return ret >> 64;
    198         } // lehmer64
    199 
    200         static inline void lehmer64_set_seed( __uint128_t & state, uint64_t seed ) {
    201                 // The seed needs to be coprime with the 2^64 modulus to get the largest period, so no factors of 2 in the seed.
    202                 state = seed;
    203                 lehmer64( state );                                                              // prime
    204         } // lehmer64_set_seed
    205 
    206         //--------------------------------------------------
    207         static inline uint64_t wyhash64( uint64_t & state ) {
    208                 uint64_t ret = state;
    209                 state += 0x_60be_e2be_e120_fc15;
    210                 __uint128_t tmp;
    211                 tmp = (__uint128_t) ret * 0x_a3b1_9535_4a39_b70d;
    212                 uint64_t m1 = (tmp >> 64) ^ tmp;
    213                 tmp = (__uint128_t)m1 * 0x_1b03_7387_12fa_d5c9;
    214                 uint64_t m2 = (tmp >> 64) ^ tmp;
    215                 return m2;
    216         } // wyhash64
    217 
    218         static inline void wyhash64_set_seed( uint64_t & state, uint64_t seed ) {
    219                 state = seed;
    220                 wyhash64( state );                                                              // prime
    221         } // wyhash64_set_seed
    222 #endif // __SIZEOF_INT128__
    223277
    224278//--------------------------------------------------
     
    232286
    233287static inline void xorshift_13_7_17_set_seed( uint64_t & state, uint64_t seed ) {
    234         state = seed;
    235         xorshift_13_7_17( state );                                                      // prime
     288        state = splitmix64( seed );                                                     // prime
    236289} // xorshift_13_7_17_set_seed
    237290
     
    250303
    251304static inline void xorshift_6_21_7_set_seed( uint32_t & state, uint32_t seed ) {
    252         state = seed;
    253         xorshift_6_21_7( state );                                                       // prime
     305    state = splitmix32( seed );                                                 // prime
    254306} // xorshift_6_21_7_set_seed
    255307
     
    265317
    266318static inline void xorshift_12_25_27_set_seed( uint64_t & state, uint64_t seed ) {
    267         state = seed;
    268         xorshift_12_25_27( state );                                                     // prime
     319        state = splitmix64( seed );                                                     // prime
    269320} // xorshift_12_25_27_set_seed
    270321
     
    272323// The state must be seeded with a nonzero value.
    273324#ifndef KISS_64
    274 typedef struct kiss_64_t { uint64_t z, w, jsr, jcong; } kiss_64_t;
     325typedef struct { uint64_t z, w, jsr, jcong; } kiss_64_t;
    275326#endif // ! KISS_64
    276327
     
    287338
    288339static inline void kiss_64_set_seed( kiss_64_t & rs, uint64_t seed ) with(rs) {
    289         z = 1; w = 1; jsr = 4; jcong = seed;
    290         kiss_64( rs );                                                                          // prime
     340        z = 1; w = 1; jsr = 4; jcong = splitmix64( seed );      // prime
    291341} // kiss_64_set_seed
    292342
     
    294344// The state array must be initialized to non-zero in the first four words.
    295345#ifndef XORWOW
    296 typedef struct xorwow_t { uint32_t a, b, c, d, counter; } xorwow_t;
     346typedef struct { uint32_t a, b, c, d, counter; } xorwow_t;
    297347#endif // ! XORWOW
    298348
     
    316366
    317367static inline void xorwow_set_seed( xorwow_t & rs, uint32_t seed ) {
    318         rs = (xorwow_t){ seed, seed, seed, seed, 0 };
    319         xorwow( rs );                                                                           // prime
     368    // To attain repeatable seeding, compute seeds separately because the order of argument evaluation is undefined.
     369    uint32_t seed1 = splitmix32( seed );                                // prime
     370    uint32_t seed2 = splitmix32( seed );
     371    uint32_t seed3 = splitmix32( seed );
     372    uint32_t seed4 = splitmix32( seed );
     373        rs = (xorwow_t){ seed1, seed2, seed3, seed4, 0 };
    320374} // xorwow_set_seed
    321375
     
    323377// Used in __tls_rand_fwd
    324378#define M  (1_l64u << 48_l64u)
    325 #define A  (25214903917_l64u)
    326 #define AI (18446708753438544741_l64u)
     379#define A  (25_214_903_917_l64u)
     380#define AI (18_446_708_753_438_544_741_l64u)
    327381#define C  (11_l64u)
    328382#define D  (16_l64u)
  • libcfa/src/concurrency/actor.hfa

    r2ed94a9 rb110bcc  
    33#include <locks.hfa>
    44#include <limits.hfa>
    5 #include <list.hfa>
    65#include <kernel.hfa>
     6#include <iofwd.hfa>
     7#include <virtual_dtor.hfa>
    78
    89#ifdef __CFA_DEBUG__
     
    2021// Define the default number of executor request-queues (mailboxes) written to by actors and serviced by the
    2122// actor-executor threads. Must be greater than 0.
    22 #define __DEFAULT_EXECUTOR_RQUEUES__ 2
     23#define __DEFAULT_EXECUTOR_RQUEUES__ 4
    2324
    2425// Define if executor is created in a separate cluster
    2526#define __DEFAULT_EXECUTOR_SEPCLUS__ false
    2627
    27 // when you flip this make sure to recompile compiler and flip the appropriate flag there too in Actors.cpp
    28 #define __ALLOC 0
     28#define __DEFAULT_EXECUTOR_BUFSIZE__ 10
     29
     30#define __STEAL 0 // workstealing toggle. Disjoint from toggles above
     31
     32// workstealing heuristic selection (only set one to be 1)
     33// #define RAND 0
     34#define SEARCH 1
     35
     36// show stats
     37// #define ACTOR_STATS
    2938
    3039// forward decls
    3140struct actor;
    3241struct message;
     42struct executor;
    3343
    3444enum Allocation { Nodelete, Delete, Destroy, Finished }; // allocation status
     
    4050    __receive_fn fn;
    4151    bool stop;
    42     inline dlink(request);
    4352};
    44 P9_EMBEDDED( request, dlink(request) )
    4553
    4654static inline void ?{}( request & this ) { this.stop = true; } // default ctor makes a sentinel
     
    5866}
    5967
    60 // hybrid data structure. Copies until buffer is full and then allocates for intrusive list
     68// Vector-like data structure that supports O(1) queue operations with no bound on size
     69// assumes gulping behaviour (once a remove occurs, removes happen until empty before next insert)
    6170struct copy_queue {
    62     dlist( request ) list;
    63     #if ! __ALLOC
    6471    request * buffer;
    65     size_t count, buffer_size, index;
    66     #endif
     72    size_t count, buffer_size, index, utilized, last_size;
    6773};
    6874static inline void ?{}( copy_queue & this ) {}
    6975static inline void ?{}( copy_queue & this, size_t buf_size ) with(this) {
    70     list{};
    71     #if ! __ALLOC
    7276    buffer_size = buf_size;
    7377    buffer = aalloc( buffer_size );
    7478    count = 0;
     79    utilized = 0;
    7580    index = 0;
    76     #endif
    77 }
    78 static inline void ^?{}( copy_queue & this ) with(this) {
    79     #if ! __ALLOC
    80     adelete(buffer);
    81     #endif
    82 }
     81    last_size = 0;
     82}
     83static inline void ^?{}( copy_queue & this ) with(this) { adelete(buffer); }
    8384
    8485static inline void insert( copy_queue & this, request & elem ) with(this) {
    85     #if ! __ALLOC
    86     if ( count < buffer_size ) { // fast path ( no alloc )
    87         buffer[count]{ elem };
    88         count++;
    89         return;
    90     }
    91     request * new_elem = alloc();
    92     (*new_elem){ elem };
    93     insert_last( list, *new_elem );
    94     #else
    95     insert_last( list, elem );
    96     #endif
     86    if ( count >= buffer_size ) { // increase arr size
     87        last_size = buffer_size;
     88        buffer_size = 2 * buffer_size;
     89        buffer = realloc( buffer, sizeof( request ) * buffer_size );
     90        /* paranoid */ verify( buffer );
     91    }
     92    memcpy( &buffer[count], &elem, sizeof(request) );
     93    count++;
    9794}
    9895
    9996// once you start removing you need to remove all elements
    100 // it is not supported to call insert() before the list is fully empty
    101 // should_delete is an output param
    102 static inline request & remove( copy_queue & this, bool & should_delete ) with(this) {
    103     #if ! __ALLOC
     97// it is not supported to call insert() before the array is fully empty
     98static inline request & remove( copy_queue & this ) with(this) {
    10499    if ( count > 0 ) {
    105100        count--;
    106         should_delete = false;
    107101        size_t old_idx = index;
    108102        index = count == 0 ? 0 : index + 1;
    109103        return buffer[old_idx];
    110104    }
    111     #endif
    112     should_delete = true;
    113     return try_pop_front( list );
    114 }
    115 
    116 static inline bool isEmpty( copy_queue & this ) with(this) {
    117     #if ! __ALLOC
    118     return count == 0 && list`isEmpty;
    119     #else
    120     return list`isEmpty;
    121     #endif
    122 }
    123 
    124 static size_t __buffer_size = 10; // C_TODO: rework this to be passed from executor through ctors (no need for global)
     105    request * ret = 0p;
     106    return *0p;
     107}
     108
     109// try to reclaim some memory if less than half of buffer is utilized
     110static inline void reclaim( copy_queue & this ) with(this) {
     111    if ( utilized >= last_size || buffer_size <= 4 ) { utilized = 0; return; }
     112    utilized = 0;
     113    buffer_size--;
     114    buffer = realloc( buffer, sizeof( request ) * buffer_size ); // try to reclaim some memory
     115}
     116
     117static inline bool isEmpty( copy_queue & this ) with(this) { return count == 0; }
     118
    125119struct work_queue {
    126120    __spinlock_t mutex_lock;
    127     copy_queue owned_queue;
    128     copy_queue * c_queue; // C_TODO: try putting this on the stack with ptr juggling
    129 
     121    copy_queue * owned_queue;       // copy queue allocated and cleaned up by this work_queue
     122    copy_queue * c_queue;           // current queue
     123    volatile bool being_processed;  // flag to prevent concurrent processing
     124    #ifdef ACTOR_STATS
     125    unsigned int id;
     126    size_t missed;                  // transfers skipped due to being_processed flag being up
     127    #endif
    130128}; // work_queue
    131 static inline void ?{}( work_queue & this ) with(this) {
    132     // c_queue = alloc();
    133     // (*c_queue){ __buffer_size };
    134     owned_queue{ __buffer_size };
    135     c_queue = &owned_queue;
    136 }
    137 // static inline void ^?{}( work_queue & this ) with(this) { delete( c_queue ); }
     129static inline void ?{}( work_queue & this, size_t buf_size, unsigned int i ) with(this) {
     130    owned_queue = alloc();      // allocated separately to avoid false sharing
     131    (*owned_queue){ buf_size };
     132    c_queue = owned_queue;
     133    being_processed = false;
     134    #ifdef ACTOR_STATS
     135    id = i;
     136    missed = 0;
     137    #endif
     138}
     139
     140// clean up copy_queue owned by this work_queue
     141static inline void ^?{}( work_queue & this ) with(this) { delete( owned_queue ); }
    138142
    139143static inline void insert( work_queue & this, request & elem ) with(this) {
     
    145149static inline void transfer( work_queue & this, copy_queue ** transfer_to ) with(this) {
    146150    lock( mutex_lock __cfaabi_dbg_ctx2 );
     151    #ifdef __STEAL
     152
     153    // check if queue is being processed elsewhere
     154    if ( unlikely( being_processed ) ) {
     155        #ifdef ACTOR_STATS
     156        missed++;
     157        #endif
     158        unlock( mutex_lock );
     159        return;
     160    }
     161
     162    being_processed = c_queue->count != 0;
     163    #endif // __STEAL
     164
     165    c_queue->utilized = c_queue->count;
     166
    147167    // swap copy queue ptrs
    148168    copy_queue * temp = *transfer_to;
     
    152172} // transfer
    153173
     174// needed since some info needs to persist past worker lifetimes
     175struct worker_info {
     176    volatile unsigned long long stamp;
     177    #ifdef ACTOR_STATS
     178    size_t stolen_from, try_steal, stolen, failed_swaps, msgs_stolen;
     179    unsigned long long processed;
     180    size_t gulps;
     181    #endif
     182};
     183static inline void ?{}( worker_info & this ) {
     184    #ifdef ACTOR_STATS
     185    this.stolen_from = 0;
     186    this.try_steal = 0;                             // attempts to steal
     187    this.stolen = 0;                                // successful steals
     188    this.processed = 0;                             // requests processed
     189    this.gulps = 0;                                 // number of gulps
     190    this.failed_swaps = 0;                          // steal swap failures
     191    this.msgs_stolen = 0;                           // number of messages stolen
     192    #endif
     193    this.stamp = rdtscl();
     194}
     195
     196// #ifdef ACTOR_STATS
     197// unsigned int * stolen_arr;
     198// unsigned int * replaced_queue;
     199// #endif
    154200thread worker {
    155     copy_queue owned_queue;
    156     work_queue * request_queues;
     201    work_queue ** request_queues;
    157202    copy_queue * current_queue;
    158         request & req;
     203    executor * executor_;
    159204    unsigned int start, range;
     205    int id;
    160206};
    161207
    162 static inline void ?{}( worker & this, cluster & clu, work_queue * request_queues, unsigned int start, unsigned int range ) {
     208#ifdef ACTOR_STATS
     209// aggregate counters for statistics
     210size_t __total_tries = 0, __total_stolen = 0, __total_workers, __all_gulps = 0,
     211    __total_failed_swaps = 0, __all_processed = 0, __num_actors_stats = 0, __all_msgs_stolen = 0;
     212#endif
     213static inline void ?{}( worker & this, cluster & clu, work_queue ** request_queues, copy_queue * current_queue, executor * executor_,
     214    unsigned int start, unsigned int range, int id ) {
    163215    ((thread &)this){ clu };
    164     this.request_queues = request_queues;
    165     // this.current_queue = alloc();
    166     // (*this.current_queue){ __buffer_size };
    167     this.owned_queue{ __buffer_size };
    168     this.current_queue = &this.owned_queue;
    169     this.start = start;
    170     this.range = range;
    171 }
    172 // static inline void ^?{}( worker & mutex this ) with(this) { delete( current_queue ); }
    173 
     216    this.request_queues = request_queues;           // array of all queues
     217    this.current_queue = current_queue;             // currently gulped queue (start with empty queue to use in swap later)
     218    this.executor_ = executor_;                     // pointer to current executor
     219    this.start = start;                             // start of worker's subrange of request_queues
     220    this.range = range;                             // size of worker's subrange of request_queues
     221    this.id = id;                                   // worker's id and index in array of workers
     222}
     223
     224static bool no_steal = false;
    174225struct executor {
    175226    cluster * cluster;                                                      // if workers execute on separate cluster
    176227        processor ** processors;                                            // array of virtual processors adding parallelism for workers
    177         work_queue * request_queues;                                // master list of work request queues
    178         worker ** workers;                                                              // array of workers executing work requests
     228        work_queue * request_queues;                                // master array of work request queues
     229    copy_queue * local_queues;                      // array of all worker local queues to avoid deletion race
     230        work_queue ** worker_req_queues;                // secondary array of work queues to allow for swapping
     231    worker ** workers;                                                          // array of workers executing work requests
     232    worker_info * w_infos;                          // array of info about each worker
    179233        unsigned int nprocessors, nworkers, nrqueues;   // number of processors/threads/request queues
    180234        bool seperate_clus;                                                             // use same or separate cluster for executor
    181235}; // executor
    182236
     237// #ifdef ACTOR_STATS
     238// __spinlock_t out_lock;
     239// #endif
     240static inline void ^?{}( worker & mutex this ) with(this) {
     241    #ifdef ACTOR_STATS
     242    __atomic_add_fetch(&__all_gulps, executor_->w_infos[id].gulps,__ATOMIC_SEQ_CST);
     243    __atomic_add_fetch(&__all_processed, executor_->w_infos[id].processed,__ATOMIC_SEQ_CST);
     244    __atomic_add_fetch(&__all_msgs_stolen, executor_->w_infos[id].msgs_stolen,__ATOMIC_SEQ_CST);
     245    __atomic_add_fetch(&__total_tries, executor_->w_infos[id].try_steal, __ATOMIC_SEQ_CST);
     246    __atomic_add_fetch(&__total_stolen, executor_->w_infos[id].stolen, __ATOMIC_SEQ_CST);
     247    __atomic_add_fetch(&__total_failed_swaps, executor_->w_infos[id].failed_swaps, __ATOMIC_SEQ_CST);
     248
     249    // per worker steal stats (uncomment alongside the lock above this routine to print)
     250    // lock( out_lock __cfaabi_dbg_ctx2 );
     251    // printf("Worker id: %d, processed: %llu messages, attempted %lu, stole: %lu, stolen from: %lu\n", id, processed, try_steal, stolen, __atomic_add_fetch(&executor_->w_infos[id].stolen_from, 0, __ATOMIC_SEQ_CST) );
     252    // int count = 0;
     253    // int count2 = 0;
     254    // for ( i; range ) {
     255    //     if ( replaced_queue[start + i] > 0 ){
     256    //         count++;
     257    //         // printf("%d: %u, ",i, replaced_queue[i]);
     258    //     }
     259    //     if (__atomic_add_fetch(&stolen_arr[start + i],0,__ATOMIC_SEQ_CST) > 0)
     260    //         count2++;
     261    // }
     262    // printf("swapped with: %d of %u indices\n", count, executor_->nrqueues / executor_->nworkers );
     263    // printf("%d of %u indices were stolen\n", count2, executor_->nrqueues / executor_->nworkers );
     264    // unlock( out_lock );
     265    #endif
     266}
     267
    183268static inline void ?{}( executor & this, unsigned int nprocessors, unsigned int nworkers, unsigned int nrqueues, bool seperate_clus, size_t buf_size ) with(this) {
    184269    if ( nrqueues < nworkers ) abort( "nrqueues needs to be >= nworkers\n" );
    185     __buffer_size = buf_size;
    186270    this.nprocessors = nprocessors;
    187271    this.nworkers = nworkers;
     
    189273    this.seperate_clus = seperate_clus;
    190274
     275    if ( nworkers == nrqueues )
     276        no_steal = true;
     277   
     278    #ifdef ACTOR_STATS
     279    // stolen_arr = aalloc( nrqueues );
     280    // replaced_queue = aalloc( nrqueues );
     281    __total_workers = nworkers;
     282    #endif
     283
    191284    if ( seperate_clus ) {
    192285        cluster = alloc();
     
    195288
    196289    request_queues = aalloc( nrqueues );
    197     for ( i; nrqueues )
    198         request_queues[i]{};
     290    worker_req_queues = aalloc( nrqueues );
     291    for ( i; nrqueues ) {
     292        request_queues[i]{ buf_size, i };
     293        worker_req_queues[i] = &request_queues[i];
     294    }
    199295   
    200296    processors = aalloc( nprocessors );
     
    202298        (*(processors[i] = alloc())){ *cluster };
    203299
    204     workers = alloc( nworkers );
     300    local_queues = aalloc( nworkers );
     301    workers = aalloc( nworkers );
     302    w_infos = aalloc( nworkers );
    205303    unsigned int reqPerWorker = nrqueues / nworkers, extras = nrqueues % nworkers;
     304
     305    for ( i; nworkers ) {
     306        w_infos[i]{};
     307        local_queues[i]{ buf_size };
     308    }
     309
    206310    for ( unsigned int i = 0, start = 0, range; i < nworkers; i += 1, start += range ) {
    207311        range = reqPerWorker + ( i < extras ? 1 : 0 );
    208         (*(workers[i] = alloc())){ *cluster, request_queues, start, range };
     312        (*(workers[i] = alloc())){ *cluster, worker_req_queues, &local_queues[i], &this, start, range, i };
    209313    } // for
    210314}
// Convenience constructors: each delegates to the next-more-specific overload,
// filling in the remaining parameters with the module defaults.
static inline void ?{}( executor & this, unsigned int nprocessors, unsigned int nworkers, unsigned int nrqueues, bool seperate_clus ) { this{ nprocessors, nworkers, nrqueues, seperate_clus, __DEFAULT_EXECUTOR_BUFSIZE__ }; }
static inline void ?{}( executor & this, unsigned int nprocessors, unsigned int nworkers, unsigned int nrqueues ) { this{ nprocessors, nworkers, nrqueues, __DEFAULT_EXECUTOR_SEPCLUS__ }; }
static inline void ?{}( executor & this, unsigned int nprocessors, unsigned int nworkers ) { this{ nprocessors, nworkers, __DEFAULT_EXECUTOR_RQUEUES__ }; }
     
    216320
    217321static inline void ^?{}( executor & this ) with(this) {
     322    #ifdef __STEAL
     323    request sentinels[nrqueues];
     324    for ( unsigned int i = 0; i < nrqueues; i++ ) {
     325        insert( request_queues[i], sentinels[i] );              // force eventually termination
     326    } // for
     327    #else
    218328    request sentinels[nworkers];
    219     unsigned int reqPerWorker = nrqueues / nworkers;
    220     for ( unsigned int i = 0, step = 0; i < nworkers; i += 1, step += reqPerWorker ) {
     329    unsigned int reqPerWorker = nrqueues / nworkers, extras = nrqueues % nworkers;
     330    for ( unsigned int i = 0, step = 0, range; i < nworkers; i += 1, step += range ) {
     331        range = reqPerWorker + ( i < extras ? 1 : 0 );
    221332        insert( request_queues[step], sentinels[i] );           // force eventually termination
    222333    } // for
     334    #endif
    223335
    224336    for ( i; nworkers )
     
    229341    } // for
    230342
     343    #ifdef ACTOR_STATS
     344    size_t misses = 0;
     345    for ( i; nrqueues ) {
     346        misses += worker_req_queues[i]->missed;
     347    }
     348    // adelete( stolen_arr );
     349    // adelete( replaced_queue );
     350    #endif
     351
    231352    adelete( workers );
     353    adelete( w_infos );
     354    adelete( local_queues );
    232355    adelete( request_queues );
     356    adelete( worker_req_queues );
    233357    adelete( processors );
    234358    if ( seperate_clus ) delete( cluster );
     359
     360    #ifdef ACTOR_STATS // print formatted stats
     361    printf("    Actor System Stats:\n");
     362    printf("\tActors Created:\t\t\t\t%lu\n\tMessages Sent:\t\t\t\t%lu\n", __num_actors_stats, __all_processed);
     363    size_t avg_gulps = __all_gulps == 0 ? 0 : __all_processed / __all_gulps;
     364    printf("\tGulps:\t\t\t\t\t%lu\n\tAverage Gulp Size:\t\t\t%lu\n\tMissed gulps:\t\t\t\t%lu\n", __all_gulps, avg_gulps, misses);
     365    printf("\tSteal attempts:\t\t\t\t%lu\n\tSteals:\t\t\t\t\t%lu\n\tSteal failures (no candidates):\t\t%lu\n\tSteal failures (failed swaps):\t\t%lu\n",
     366        __total_tries, __total_stolen, __total_tries - __total_stolen - __total_failed_swaps, __total_failed_swaps);
     367    size_t avg_steal = __total_stolen == 0 ? 0 : __all_msgs_stolen / __total_stolen;
     368    printf("\tMessages stolen:\t\t\t%lu\n\tAverage steal size:\t\t\t%lu\n", __all_msgs_stolen, avg_steal);
     369    #endif
     370       
    235371}
    236372
// this is a static field of executor but have to forward decl for get_next_ticket
static size_t __next_ticket = 0;

// Returns the next executor-queue index for a newly created actor, distributing
// actors round-robin over the nrqueues request queues via an atomic counter.
static inline size_t __get_next_ticket( executor & this ) with(this) {
    #ifdef __CFA_DEBUG__
    size_t temp = __atomic_fetch_add( &__next_ticket, 1, __ATOMIC_SEQ_CST) % nrqueues;

    // reserve MAX for dead actors
    if ( unlikely( temp == MAX ) ) temp = __atomic_fetch_add( &__next_ticket, 1, __ATOMIC_SEQ_CST) % nrqueues;
    return temp;
    #else
    // release build: no dead-actor sentinel check, relaxed ordering suffices
    // since only uniqueness-modulo-nrqueues matters, not ordering
    return __atomic_fetch_add( &__next_ticket, 1, __ATOMIC_RELAXED) % nrqueues;
    #endif
} // tickets
    243387
// TODO: update globals in this file to be static fields once the static fields project is done
static executor * __actor_executor_ = 0p;               // executor servicing all actors; set by start_actor_system
static bool __actor_executor_passed = false;            // was an executor passed to start_actor_system
static size_t __num_actors_ = 0;                                        // number of actor objects in system
static struct thread$ * __actor_executor_thd = 0p;              // used to wake executor after actors finish
// Base type for all actors; user actor types inherit from this (plan-9 style).
struct actor {
    size_t ticket;                                          // executor-queue handle; fixes this actor's queue for FIFO message execution
    Allocation allocation_;                                         // allocation action applied after each receive
    inline virtual_dtor;                                    // enables virtual destruction through base-type pointers
};
    253398
// Registers a new actor with the running actor system: assigns its queue
// ticket and bumps the live-actor count used for shutdown detection.
static inline void ?{}( actor & this ) with(this) {
    // Once an actor is allocated it must be sent a message or the actor system cannot stop. Hence, its receive
    // member must be called to end it
    verifyf( __actor_executor_, "Creating actor before calling start_actor_system() can cause undefined behaviour.\n" );
    allocation_ = Nodelete;
    ticket = __get_next_ticket( *__actor_executor_ );
    // relaxed is sufficient: only the final decrement-to-zero (in check_actor) is significant
    __atomic_fetch_add( &__num_actors_, 1, __ATOMIC_RELAXED );
    #ifdef ACTOR_STATS
    __atomic_fetch_add( &__num_actors_stats, 1, __ATOMIC_SEQ_CST );
    #endif
}
    263410
    264411static inline void check_actor( actor & this ) {
     
    276423        }
    277424
    278         if ( unlikely( __atomic_add_fetch( &__num_actors_, -1, __ATOMIC_SEQ_CST ) == 0 ) ) { // all actors have terminated
     425        if ( unlikely( __atomic_add_fetch( &__num_actors_, -1, __ATOMIC_RELAXED ) == 0 ) ) { // all actors have terminated
    279426            unpark( __actor_executor_thd );
    280427        }
     
// Base type for all actor messages; user message types inherit from this.
struct message {
    Allocation allocation_;                     // allocation action applied after delivery
    inline virtual_dtor;                        // enables virtual destruction through base-type pointers
};
    287435
// Default: caller retains ownership; message is never deleted by the system.
static inline void ?{}( message & this ) {
    this.allocation_ = Nodelete;
}
// Construct with an explicit allocation policy (Delete/Destroy/Nodelete).
static inline void ?{}( message & this, Allocation allocation ) {
    memcpy( &this.allocation_, &allocation, sizeof(allocation) ); // optimization to elide ctor
    verifyf( this.allocation_ != Finished, "The Finished Allocation status is not supported for message types.\n");
}
// Debug builds warn about messages that were allocated but never sent
// (check_message flips Nodelete -> Finished on delivery in debug mode).
static inline void ^?{}( message & this ) with(this) {
    CFA_DEBUG( if ( allocation_ == Nodelete ) printf("A message at location %p was allocated but never sent.\n", &this); )
}
    291446
    292447static inline void check_message( message & this ) {
    293448    switch ( this.allocation_ ) {                                               // analyze message status
    294         case Nodelete: break;
     449        case Nodelete: CFA_DEBUG(this.allocation_ = Finished); break;
    295450        case Delete: delete( &this ); break;
    296451        case Destroy: ^?{}(this); break;
     
    298453    } // switch
    299454}
// Changes a message's post-delivery allocation action (e.g. inside a receive routine).
static inline void set_allocation( message & this, Allocation state ) {
    this.allocation_ = state;
}
    300458
// Delivers one request: runs the receiver's receive routine on the message and
// applies the resulting allocation actions to the message and the actor.
static inline void deliver_request( request & this ) {
    // receive routine returns the actor's new allocation status
    this.receiver->allocation_ = this.fn( *this.receiver, *this.msg );
    // NOTE(review): message is checked before the actor — presumably because
    // check_actor may delete/unpark on last-actor termination; confirm the
    // ordering requirement before reordering these calls.
    check_message( *this.msg );
    check_actor( *this.receiver );
}
     464
// tries to atomically swap two queues and returns 0p if the swap failed
// returns ptr to newly owned queue if swap succeeds
// Protocol: a queue slot holding 0p means "being stolen"; the two CAS steps
// below first claim our own slot, then claim the victim's, rolling back on failure.
static inline work_queue * try_swap_queues( worker & this, unsigned int victim_idx, unsigned int my_idx ) with(this) {
    work_queue * my_queue = request_queues[my_idx];
    work_queue * other_queue = request_queues[victim_idx];

    // if either queue is 0p then they are in the process of being stolen
    if ( other_queue == 0p ) return 0p;

    // try to set our queue ptr to be 0p. If it fails someone moved our queue so return false
    if ( !__atomic_compare_exchange_n( &request_queues[my_idx], &my_queue, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST ) )
        return 0p;

    // try to set other queue ptr to be our queue ptr. If it fails someone moved the other queue so fix up then return false
    if ( !__atomic_compare_exchange_n( &request_queues[victim_idx], &other_queue, my_queue, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST ) ) {
        /* paranoid */ verify( request_queues[my_idx] == 0p );
        request_queues[my_idx] = my_queue; // reset my queue ptr back to appropriate val
        return 0p;
    }

    // we have successfully swapped and since our queue is 0p no one will touch it so write back new queue ptr non atomically
    request_queues[my_idx] = other_queue; // last write does not need to be atomic
    return other_queue;
}
     489
// once a worker to steal from has been chosen, choose queue to steal from
// Scans the victim's queue range starting at a random offset and attempts a
// single swap on the first non-empty, unowned queue; gives up after one attempt.
static inline void choose_queue( worker & this, unsigned int victim_id, unsigned int swap_idx ) with(this) {
    // have to calculate victim start and range since victim may be deleted before us in shutdown
    // (mirrors the distribution in the executor ctor: first `extras` workers get one extra queue)
    const unsigned int queues_per_worker = executor_->nrqueues / executor_->nworkers;
    const unsigned int extras = executor_->nrqueues % executor_->nworkers;
    unsigned int vic_start, vic_range;
    if ( extras > victim_id  ) {
        vic_range = queues_per_worker + 1;
        vic_start = vic_range * victim_id;
    } else {
        vic_start = extras + victim_id * queues_per_worker;
        vic_range = queues_per_worker;
    }
    unsigned int start_idx = prng( vic_range );         // random start to spread contention

    unsigned int tries = 0;
    work_queue * curr_steal_queue;

    for ( unsigned int i = start_idx; tries < vic_range; i = (i + 1) % vic_range ) {
        tries++;
        curr_steal_queue = request_queues[ i + vic_start ];
        // avoid empty queues and queues that are being operated on
        if ( curr_steal_queue == 0p || curr_steal_queue->being_processed || isEmpty( *curr_steal_queue->c_queue ) )
            continue;

        #ifdef ACTOR_STATS
        curr_steal_queue = try_swap_queues( this, i + vic_start, swap_idx );
        if ( curr_steal_queue ) {
            executor_->w_infos[id].msgs_stolen += curr_steal_queue->c_queue->count;
            executor_->w_infos[id].stolen++;
            // __atomic_add_fetch(&executor_->w_infos[victim_id].stolen_from, 1, __ATOMIC_RELAXED);
            // replaced_queue[swap_idx]++;
            // __atomic_add_fetch(&stolen_arr[ i + vic_start ], 1, __ATOMIC_RELAXED);
        } else {
            executor_->w_infos[id].failed_swaps++;
        }
        #else
        curr_steal_queue = try_swap_queues( this, i + vic_start, swap_idx );
        #endif // ACTOR_STATS

        // only one steal attempt per call, successful or not
        return;
    }

    return;
}
     535
// choose a worker to steal from
// Victim-selection policy is compile-time selected:
//   RAND   - uniformly random victim (excluding self)
//   SEARCH - victim with the oldest service timestamp
// NOTE(review): if neither RAND nor SEARCH is defined this routine is a no-op
// (both preprocessor branches drop out) — confirm a default is set elsewhere.
static inline void steal_work( worker & this, unsigned int swap_idx ) with(this) {
    #if RAND
    unsigned int victim = prng( executor_->nworkers );
    if ( victim == id ) victim = ( victim + 1 ) % executor_->nworkers;
    choose_queue( this, victim, swap_idx );
    #elif SEARCH
    unsigned long long min = MAX; // smaller timestamp means longer since service
    int min_id = 0; // use ints not uints to avoid integer underflow without hacky math
    int n_workers = executor_->nworkers;
    unsigned long long curr_stamp;
    int scount = 1;
    // scan all other workers starting after self, tracking the stalest stamp
    for ( int i = (id + 1) % n_workers; scount < n_workers; i = (i + 1) % n_workers, scount++ ) {
        curr_stamp = executor_->w_infos[i].stamp;
        if ( curr_stamp < min ) {
            min = curr_stamp;
            min_id = i;
        }
    }
    choose_queue( this, min_id, swap_idx );
    #endif
}
    307558
// Worker thread main loop: cycles over its assigned slice of request queues,
// gulping each non-empty queue into a local queue and delivering its requests.
// With __STEAL defined, repeated empty gulps trigger work stealing.
void main( worker & this ) with(this) {
    // #ifdef ACTOR_STATS
    // for ( i; executor_->nrqueues ) {
    //     replaced_queue[i] = 0;
    //     __atomic_store_n( &stolen_arr[i], 0, __ATOMIC_SEQ_CST );
    // }
    // #endif

    // threshold of empty queues we see before we go stealing
    const unsigned int steal_threshold = 2 * range;

    // Store variable data here instead of worker struct to avoid any potential false sharing
    unsigned int empty_count = 0;
    request & req;
    work_queue * curr_work_queue;

    Exit:
    for ( unsigned int i = 0;; i = (i + 1) % range ) { // cycle through set of request buffers
        curr_work_queue = request_queues[i + start];

        // check if queue is empty before trying to gulp it
        if ( isEmpty( *curr_work_queue->c_queue ) ) {
            #ifdef __STEAL
            empty_count++;
            if ( empty_count < steal_threshold ) continue;
            // past the threshold: fall through so the steal path below can run
            #else
            continue;
            #endif
        }
        // move entire queue contents into our local current_queue in one shot
        transfer( *curr_work_queue, &current_queue );
        #ifdef ACTOR_STATS
        executor_->w_infos[id].gulps++;
        #endif // ACTOR_STATS
        #ifdef __STEAL
        if ( isEmpty( *current_queue ) ) {
            if ( unlikely( no_steal ) ) continue;
            empty_count++;
            if ( empty_count < steal_threshold ) continue;
            empty_count = 0;

            // publish our service timestamp so SEARCH victims can rank us
            __atomic_store_n( &executor_->w_infos[id].stamp, rdtscl(), __ATOMIC_RELAXED );

            #ifdef ACTOR_STATS
            executor_->w_infos[id].try_steal++;
            #endif // ACTOR_STATS

            steal_work( this, start + prng( range ) );
            continue;
        }
        #endif // __STEAL
        // drain the gulped queue
        while ( ! isEmpty( *current_queue ) ) {
            #ifdef ACTOR_STATS
            executor_->w_infos[id].processed++;
            #endif
            &req = &remove( *current_queue );
            if ( !&req ) continue;
            if ( req.stop ) break Exit;         // sentinel request: shut this worker down
            deliver_request( req );
        }
        #ifdef __STEAL
        curr_work_queue->being_processed = false; // set done processing
        empty_count = 0; // we found work so reset empty counter
        #endif

        // potentially reclaim some of the current queue's vector space if it is unused
        reclaim( *current_queue );
    } // for
}
     
    328631
// Routes a request to the global executor using the actor's queue ticket.
// A ticket of MAX marks a dead/deleted actor (see __get_next_ticket debug path).
static inline void send( actor & this, request & req ) {
    verifyf( this.ticket != (unsigned long int)MAX, "Attempted to send message to deleted/dead actor\n" );
    send( *__actor_executor_, req, this.ticket );
}
    332636
// Zeroes all global actor-system statistics counters; called at each
// start_actor_system so stats cover exactly one system lifetime.
// No-op unless compiled with ACTOR_STATS.
static inline void __reset_stats() {
    #ifdef ACTOR_STATS
    __total_tries = 0;
    __total_stolen = 0;
    __all_gulps = 0;
    __total_failed_swaps = 0;
    __all_processed = 0;
    __num_actors_stats = 0;
    __all_msgs_stolen = 0;
    #endif
}
     648
    333649static inline void start_actor_system( size_t num_thds ) {
     650    __reset_stats();
    334651    __actor_executor_thd = active_thread();
    335652    __actor_executor_ = alloc();
     
    337654}
    338655
// TODO: potentially revisit getting number of processors
//  ( currently the value stored in active_cluster()->procs.total is often stale
//  and doesn't reflect how many procs are allocated )
// static inline void start_actor_system() { start_actor_system( active_cluster()->procs.total ); }
// Default entry point: starts the actor system with a single processor thread.
static inline void start_actor_system() { start_actor_system( 1 ); }
    340661
    341662static inline void start_actor_system( executor & this ) {
     663    __reset_stats();
    342664    __actor_executor_thd = active_thread();
    343665    __actor_executor_ = &this;
     
    354676    __actor_executor_passed = false;
    355677}
     678
// Default messages to send to any actor to change status
// assigned at creation to __base_msg_finished to avoid unused message warning
// Each singleton message makes the receiving actor adopt the named allocation action.
message __base_msg_finished @= { .allocation_ : Finished };
struct __DeleteMsg { inline message; } DeleteMsg = __base_msg_finished;
struct __DestroyMsg { inline message; } DestroyMsg = __base_msg_finished;
struct __FinishedMsg { inline message; } FinishedMsg = __base_msg_finished;

Allocation receive( actor & this, __DeleteMsg & msg ) { return Delete; }        // actor is deleted after this receive
Allocation receive( actor & this, __DestroyMsg & msg ) { return Destroy; }      // actor is destructed (not freed)
Allocation receive( actor & this, __FinishedMsg & msg ) { return Finished; }    // actor is finished; no action taken
     689
  • libcfa/src/concurrency/channel.hfa

    r2ed94a9 rb110bcc  
     1#pragma once
     2
    13#include <locks.hfa>
    2 
    3 struct no_reacq_lock {
    4     inline exp_backoff_then_block_lock;
     4#include <list.hfa>
     5#include <mutex_stmt.hfa>
     6
// link field used for threads waiting on channel
struct wait_link {
    // used to put wait_link on a dl queue
    inline dlink(wait_link);

    // waiting thread
    struct thread$ * t;

    // shadow field: points at the waiter's element slot; set to 0p on channel
    // close to signal the waiter was woken due to shutdown (see block/close)
    void * elem;
};
P9_EMBEDDED( wait_link, dlink(wait_link) )

// Construct a wait_link for thread t with its element slot elem.
static inline void ?{}( wait_link & this, thread$ * t, void * elem ) {
    this.t = t;
    this.elem = elem;
}
     24
// wake one thread from the list
// Pops the oldest waiter (FIFO) and unparks its thread; caller must hold the
// channel mutex and guarantee the list is non-empty.
static inline void wake_one( dlist( wait_link ) & queue ) {
    wait_link & popped = try_pop_front( queue );
    unpark( popped.t );
}
     30
// returns true if woken due to shutdown
// blocks thread on list and releases passed lock
// The wait_link lives on this thread's stack; the waker reads/writes w.elem
// while we are parked, and close() zeroes it to indicate shutdown.
static inline bool block( dlist( wait_link ) & queue, void * elem_ptr, go_mutex & lock ) {
    wait_link w{ active_thread(), elem_ptr };
    insert_last( queue, w );
    unlock( lock );         // release before park; waker completes our operation
    park();
    return w.elem == 0p;    // 0p only set by close() — woken due to shutdown
}
     40
// void * used for some fields since exceptions don't work with parametric polymorphism currently
// Raised (as resumption first, then termination) by channel ops on a closed channel.
exception channel_closed {
    // on failed insert elem is a ptr to the element attempting to be inserted
    // on failed remove elem ptr is 0p
    // on resumption of a failed insert this elem will be inserted
    // so a user may modify it in the resumption handler
    void * elem;

    // pointer to chan that is closed
    void * closed_chan;
};
vtable(channel_closed) channel_closed_vt;

// #define CHAN_STATS // define this to get channel stats printed in dtor
    1655
    1756forall( T ) {
// Bounded FIFO channel of T; aligned to 128 bytes to avoid false sharing
// between channels on adjacent cache lines.
struct __attribute__((aligned(128))) channel {
    size_t size, front, back, count;    // circular-buffer capacity, head, tail, occupancy
    T * buffer;
    dlist( wait_link ) prods, cons; // lists of blocked threads
    go_mutex mutex_lock;            // MX lock
    bool closed;                    // indicates channel close/open
    #ifdef CHAN_STATS
    size_t blocks, operations;      // counts total ops and ops resulting in a blocked thd
    #endif
};
    2568
     
    2770    size = _size;
    2871    front = back = count = 0;
    29     buffer = anew( size );
     72    buffer = aalloc( size );
    3073    prods{};
    3174    cons{};
    3275    mutex_lock{};
     76    closed = false;
     77    #ifdef CHAN_STATS
     78    blocks = 0;
     79    operations = 0;
     80    #endif
    3381}
    3482
    3583static inline void ?{}( channel(T) &c ){ ((channel(T) &)c){ 0 }; }
    36 static inline void ^?{}( channel(T) &c ) with(c) { delete( buffer ); }
     84static inline void ^?{}( channel(T) &c ) with(c) {
     85    #ifdef CHAN_STATS
     86    printf("Channel %p Blocks: %lu, Operations: %lu, %.2f%% of ops blocked\n", &c, blocks, operations, ((double)blocks)/operations * 100);
     87    #endif
     88    verifyf( cons`isEmpty && prods`isEmpty, "Attempted to delete channel with waiting threads (Deadlock).\n" );
     89    delete( buffer );
     90}
// Lightweight accessors. NOTE(review): these read shared fields without
// holding mutex_lock, so results are instantaneous snapshots only — confirm
// callers treat them as hints, not invariants.
static inline size_t get_count( channel(T) & chan ) with(chan) { return count; }
static inline size_t get_size( channel(T) & chan ) with(chan) { return size; }
static inline bool has_waiters( channel(T) & chan ) with(chan) { return !cons`isEmpty || !prods`isEmpty; }
static inline bool has_waiting_consumers( channel(T) & chan ) with(chan) { return !cons`isEmpty; }
static inline bool has_waiting_producers( channel(T) & chan ) with(chan) { return !prods`isEmpty; }
     96
// closes the channel and notifies all blocked threads
// Zeroing each waiter's elem before waking marks the wake-up as shutdown
// (block() returns true when its elem is 0p).
static inline void close( channel(T) & chan ) with(chan) {
    lock( mutex_lock );
    closed = true;

    // flush waiting consumers and producers
    while ( has_waiting_consumers( chan ) ) {
        cons`first.elem = 0p;
        wake_one( cons );
    }
    while ( has_waiting_producers( chan ) ) {
        prods`first.elem = 0p;
        wake_one( prods );
    }
    unlock(mutex_lock);
}
     113
     114static inline void is_closed( channel(T) & chan ) with(chan) { return closed; }
     115
// Hands a copy of elem to every currently-waiting consumer while the buffer is
// empty; never blocks and never buffers elem.
// NOTE(review): every waiting consumer receives the same elem value — confirm
// this fan-out semantic is the intent of "flush".
static inline void flush( channel(T) & chan, T elem ) with(chan) {
    lock( mutex_lock );
    while ( count == 0 && !cons`isEmpty ) {
        memcpy(cons`first.elem, (void *)&elem, sizeof(T)); // do waiting consumer work
        wake_one( cons );
    }
    unlock( mutex_lock );
}
     124
     125// handles buffer insert
     126static inline void __buf_insert( channel(T) & chan, T & elem ) with(chan) {
    44127    memcpy((void *)&buffer[back], (void *)&elem, sizeof(T));
    45128    count += 1;
     
    48131}
    49132
// does the buffer insert or hands elem directly to consumer if one is waiting
// (direct hand-off skips the buffer entirely); caller must hold mutex_lock
static inline void __do_insert( channel(T) & chan, T & elem ) with(chan) {
    if ( count == 0 && !cons`isEmpty ) {
        memcpy(cons`first.elem, (void *)&elem, sizeof(T)); // do waiting consumer work
        wake_one( cons );
    } else __buf_insert( chan, elem );
}
     140
// needed to avoid an extra copy in closed case
// Attempts a non-blocking insert under the lock; returns false if the buffer is full.
static inline bool __internal_try_insert( channel(T) & chan, T & elem ) with(chan) {
    lock( mutex_lock );
    #ifdef CHAN_STATS
    operations++;
    #endif
    if ( count == size ) { unlock( mutex_lock ); return false; }
    __do_insert( chan, elem );
    unlock( mutex_lock );
    return true;
}
     152
// attempts a nonblocking insert
// returns true if insert was successful, false otherwise
static inline bool try_insert( channel(T) & chan, T elem ) { return __internal_try_insert( chan, elem ); }
     156
// handles closed case of insert routine
// First raises a resumption (handler may consume/modify elem); if the insert
// still cannot complete without blocking, escalates to a termination throw.
static inline void __closed_insert( channel(T) & chan, T & elem ) with(chan) {
    channel_closed except{&channel_closed_vt, &elem, &chan };
    throwResume except; // throw closed resumption
    if ( !__internal_try_insert( chan, elem ) ) throw except; // if try to insert fails (would block), throw termination
}
    50163
    51164static inline void insert( channel(T) & chan, T elem ) with(chan) {
    52     lock( mutex_lock );
     165    // check for close before acquire mx
     166    if ( unlikely(closed) ) {
     167        __closed_insert( chan, elem );
     168        return;
     169    }
     170
     171    lock( mutex_lock );
     172
     173    #ifdef CHAN_STATS
     174    if ( !closed ) operations++;
     175    #endif
     176
     177    // if closed handle
     178    if ( unlikely(closed) ) {
     179        unlock( mutex_lock );
     180        __closed_insert( chan, elem );
     181        return;
     182    }
    53183
    54184    // have to check for the zero size channel case
    55     if ( size == 0 && !empty( cons ) ) {
    56         memcpy((void *)front( cons ), (void *)&elem, sizeof(T));
    57         notify_one( cons );
     185    if ( size == 0 && !cons`isEmpty ) {
     186        memcpy(cons`first.elem, (void *)&elem, sizeof(T));
     187        wake_one( cons );
    58188        unlock( mutex_lock );
    59         return;
     189        return true;
    60190    }
    61191
    62192    // wait if buffer is full, work will be completed by someone else
    63     if ( count == size ) {
    64         wait( prods, mutex_lock, (uintptr_t)&elem );
     193    if ( count == size ) {
     194        #ifdef CHAN_STATS
     195        blocks++;
     196        #endif
     197
     198        // check for if woken due to close
     199        if ( unlikely( block( prods, &elem, mutex_lock ) ) )
     200            __closed_insert( chan, elem );
    65201        return;
    66202    } // if
    67203
    68     if ( count == 0 && !empty( cons ) )
    69         // do waiting consumer work
    70         memcpy((void *)front( cons ), (void *)&elem, sizeof(T));
    71     else insert_( chan, elem );
     204    if ( count == 0 && !cons`isEmpty ) {
     205        memcpy(cons`first.elem, (void *)&elem, sizeof(T)); // do waiting consumer work
     206        wake_one( cons );
     207    } else __buf_insert( chan, elem );
    72208   
    73     notify_one( cons );
    74     unlock( mutex_lock );
    75 }
    76 
    77 static inline T remove( channel(T) & chan ) with(chan) {
    78     lock( mutex_lock );
    79     T retval;
    80 
    81     // have to check for the zero size channel case
    82     if ( size == 0 && !empty( prods ) ) {
    83         memcpy((void *)&retval, (void *)front( prods ), sizeof(T));
    84         notify_one( prods );
    85         unlock( mutex_lock );
    86         return retval;
    87     }
    88 
    89     // wait if buffer is empty, work will be completed by someone else
    90     if (count == 0) {
    91         wait( cons, mutex_lock, (uintptr_t)&retval );
    92         return retval;
    93     }
    94 
    95     // Remove from buffer
     209    unlock( mutex_lock );
     210    return;
     211}
     212
// handles buffer remove
// Copies the front element into retval and advances the circular buffer;
// caller must hold mutex_lock and ensure count > 0.
static inline void __buf_remove( channel(T) & chan, T & retval ) with(chan) {
    memcpy((void *)&retval, (void *)&buffer[front], sizeof(T));
    count -= 1;
    front = (front + 1) % size;
}
     219
// does the buffer remove and potentially does waiting producer work
// (the remove just opened a slot, so a blocked producer's element can be
// buffered and the producer woken); caller must hold mutex_lock
static inline void __do_remove( channel(T) & chan, T & retval ) with(chan) {
    __buf_remove( chan, retval );
    if (count == size - 1 && !prods`isEmpty ) {
        __buf_insert( chan, *(T *)prods`first.elem );  // do waiting producer work
        wake_one( prods );
    }
}
     228
// needed to avoid an extra copy in closed case and single return val case
// Attempts a non-blocking remove under the lock; returns false (retval
// untouched) if the buffer is empty.
static inline bool __internal_try_remove( channel(T) & chan, T & retval ) with(chan) {
    lock( mutex_lock );
    #ifdef CHAN_STATS
    operations++;
    #endif
    if ( count == 0 ) { unlock( mutex_lock ); return false; }
    __do_remove( chan, retval );
    unlock( mutex_lock );
    return true;
}
     240
     241// attempts a nonblocking remove
     242// returns [T, true] if remove was successful
     243// returns [T, false] if remove was unsuccessful (T uninit)
     244static inline [T, bool] try_remove( channel(T) & chan ) {
     245    T retval;
     246    return [ retval, __internal_try_remove( chan, retval ) ];
     247}
     248
     249static inline T try_remove( channel(T) & chan, T elem ) {
     250    T retval;
     251    __internal_try_remove( chan, retval );
    105252    return retval;
    106253}
    107254
     255// handles closed case of remove routine
     256static inline void __closed_remove( channel(T) & chan, T & retval ) with(chan) {
     257    channel_closed except{&channel_closed_vt, 0p, &chan };
     258    throwResume except; // throw resumption
     259    if ( !__internal_try_remove( chan, retval ) ) throw except; // if try to remove fails (would block), throw termination
     260}
     261
     262static inline T remove( channel(T) & chan ) with(chan) {
     263    T retval;
     264    if ( unlikely(closed) ) {
     265        __closed_remove( chan, retval );
     266        return retval;
     267    }
     268    lock( mutex_lock );
     269
     270    #ifdef CHAN_STATS
     271    if ( !closed ) operations++;
     272    #endif
     273
     274    if ( unlikely(closed) ) {
     275        unlock( mutex_lock );
     276        __closed_remove( chan, retval );
     277        return retval;
     278    }
     279
     280    // have to check for the zero size channel case
     281    if ( size == 0 && !prods`isEmpty ) {
     282        memcpy((void *)&retval, (void *)prods`first.elem, sizeof(T));
     283        wake_one( prods );
     284        unlock( mutex_lock );
     285        return retval;
     286    }
     287
     288    // wait if buffer is empty, work will be completed by someone else
     289    if (count == 0) {
     290        #ifdef CHAN_STATS
     291        blocks++;
     292        #endif
     293        // check if woken due to close
     294        if ( unlikely( block( cons, &retval, mutex_lock ) ) )
     295            __closed_remove( chan, retval );
     296        return retval;
     297    }
     298
     299    // Remove from buffer
     300    __do_remove( chan, retval );
     301
     302    unlock( mutex_lock );
     303    return retval;
     304}
    108305} // forall( T )
  • libcfa/src/concurrency/clib/cfathread.cfa

    r2ed94a9 rb110bcc  
    1616// #define EPOLL_FOR_SOCKETS
    1717
     18#include <string.h>
     19
    1820#include "fstream.hfa"
    1921#include "locks.hfa"
     
    2325#include "time.hfa"
    2426#include "stdlib.hfa"
    25 
     27#include "iofwd.hfa"
    2628#include "cfathread.h"
    27 
    28 extern "C" {
    29                 #include <string.h>
    30                 #include <errno.h>
    31 }
    3229
    3330extern void ?{}(processor &, const char[], cluster &, thread$ *);
    3431extern "C" {
    35       extern void __cfactx_invoke_thread(void (*main)(void *), void * this);
    36         extern int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
     32        extern void __cfactx_invoke_thread(void (*main)(void *), void * this);
    3733}
    3834
     
    472468}
    473469
    474 #include <iofwd.hfa>
    475 
    476470extern "C" {
    477         #include <unistd.h>
    478         #include <sys/types.h>
    479         #include <sys/socket.h>
    480 
    481471        //--------------------
    482472        // IO operations
     
    488478                , protocol);
    489479        }
    490         int cfathread_bind(int socket, const struct sockaddr *address, socklen_t address_len) {
     480        int cfathread_bind(int socket, __CONST_SOCKADDR_ARG address, socklen_t address_len) {
    491481                return bind(socket, address, address_len);
    492482        }
     
    496486        }
    497487
    498         int cfathread_accept(int socket, struct sockaddr *restrict address, socklen_t *restrict address_len) {
     488        int cfathread_accept(int socket, __SOCKADDR_ARG address, socklen_t *restrict address_len) {
    499489                #if defined(EPOLL_FOR_SOCKETS)
    500490                        int ret;
     
    513503        }
    514504
    515         int cfathread_connect(int socket, const struct sockaddr *address, socklen_t address_len) {
     505        int cfathread_connect(int socket, __CONST_SOCKADDR_ARG address, socklen_t address_len) {
    516506                #if defined(EPOLL_FOR_SOCKETS)
    517507                        int ret;
  • libcfa/src/concurrency/clib/cfathread.h

    r2ed94a9 rb110bcc  
    99// Author           : Thierry Delisle
    1010// Created On       : Tue Sep 22 15:31:20 2020
    11 // Last Modified By :
    12 // Last Modified On :
    13 // Update Count     :
     11// Last Modified By : Peter A. Buhr
     12// Last Modified On : Mon Mar 13 23:48:40 2023
     13// Update Count     : 7
    1414//
    1515
     16#pragma once
     17
    1618#if defined(__cforall) || defined(__cplusplus)
     19#include <unistd.h>
     20#include <errno.h>
     21#include <sys/socket.h>
     22
    1723extern "C" {
    1824#endif
    19         #include <asm/types.h>
    20         #include <errno.h>
    21         #include <unistd.h>
    22 
    23 
    2425        //--------------------
    2526        // Basic types
     
    7374        } cfathread_mutexattr_t;
    7475        typedef struct cfathread_mutex * cfathread_mutex_t;
    75         int cfathread_mutex_init(cfathread_mutex_t *restrict mut, const cfathread_mutexattr_t *restrict attr) __attribute__((nonnull (1)));
     76        int cfathread_mutex_init(cfathread_mutex_t * restrict mut, const cfathread_mutexattr_t * restrict attr) __attribute__((nonnull (1)));
    7677        int cfathread_mutex_destroy(cfathread_mutex_t *mut) __attribute__((nonnull (1)));
    7778        int cfathread_mutex_lock(cfathread_mutex_t *mut) __attribute__((nonnull (1)));
     
    9192        //--------------------
    9293        // IO operations
    93         struct sockaddr;
    94         struct msghdr;
    9594        int cfathread_socket(int domain, int type, int protocol);
    96         int cfathread_bind(int socket, const struct sockaddr *address, socklen_t address_len);
     95        int cfathread_bind(int socket, __CONST_SOCKADDR_ARG address, socklen_t address_len);
    9796        int cfathread_listen(int socket, int backlog);
    98         int cfathread_accept(int socket, struct sockaddr *restrict address, socklen_t *restrict address_len);
    99         int cfathread_connect(int socket, const struct sockaddr *address, socklen_t address_len);
     97        int cfathread_accept(int socket, __SOCKADDR_ARG address, socklen_t * restrict address_len);
     98        int cfathread_connect(int socket, __CONST_SOCKADDR_ARG address, socklen_t address_len);
    10099        int cfathread_dup(int fildes);
    101100        int cfathread_close(int fildes);
  • libcfa/src/concurrency/coroutine.cfa

    r2ed94a9 rb110bcc  
    1010// Created On       : Mon Nov 28 12:27:26 2016
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Tue Dec 15 12:06:04 2020
    13 // Update Count     : 23
     12// Last Modified On : Thu Feb 16 15:34:46 2023
     13// Update Count     : 24
    1414//
    1515
    1616#define __cforall_thread__
    17 #define _GNU_SOURCE
    1817
    1918#include "coroutine.hfa"
  • libcfa/src/concurrency/future.hfa

    r2ed94a9 rb110bcc  
    1414//
    1515
    16 // #pragma once
     16#pragma once
    1717
    1818#include "bits/locks.hfa"
  • libcfa/src/concurrency/invoke.h

    r2ed94a9 rb110bcc  
    1010// Created On       : Tue Jan 17 12:27:26 2016
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Tue Nov 29 20:42:21 2022
    13 // Update Count     : 56
    14 //
     12// Last Modified On : Tue Mar 14 13:39:31 2023
     13// Update Count     : 59
     14//
     15
     16// Do not use #pragma once, as this file is included twice in some places. It has its own guard system.
    1517
    1618#include "bits/containers.hfa"
  • libcfa/src/concurrency/io.cfa

    r2ed94a9 rb110bcc  
    1515
    1616#define __cforall_thread__
    17 #define _GNU_SOURCE
    1817
    1918#if defined(__CFA_DEBUG__)
     
    296295                                // make sure the target hasn't stopped existing since last time
    297296                                HELP: if(target < ctxs_count) {
    298                                         // calculate it's age and how young it could be before we give ip on helping
     297                                        // calculate it's age and how young it could be before we give up on helping
    299298                                        const __readyQ_avg_t cutoff = calc_cutoff(ctsc, ctx->cq.id, ctxs_count, io.data, io.tscs, __shard_factor.io, false);
    300299                                        const __readyQ_avg_t age = moving_average(ctsc, io.tscs[target].t.tv, io.tscs[target].t.ma, false);
  • libcfa/src/concurrency/io/call.cfa.in

    r2ed94a9 rb110bcc  
    3131Prelude = """#define __cforall_thread__
    3232
     33#include <unistd.h>
     34#include <errno.h>
     35#include <sys/socket.h>
     36#include <time.hfa>
     37
    3338#include "bits/defs.hfa"
    3439#include "kernel.hfa"
     
    4348        #include <assert.h>
    4449        #include <stdint.h>
    45         #include <errno.h>
    4650        #include <linux/io_uring.h>
    47 
    4851        #include "kernel/fwd.hfa"
    4952
     
    8285// I/O Forwards
    8386//=============================================================================================
    84 #include <time.hfa>
    85 
    86 // Some forward declarations
    87 #include <errno.h>
    88 #include <unistd.h>
    8987
    9088extern "C" {
    91         #include <asm/types.h>
    92         #include <sys/socket.h>
    93         #include <sys/syscall.h>
    94 
    9589#if defined(CFA_HAVE_PREADV2)
    9690        struct iovec;
    97         extern ssize_t preadv2 (int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
     91        extern ssize_t preadv2 (int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags);
    9892#endif
    9993#if defined(CFA_HAVE_PWRITEV2)
    10094        struct iovec;
    101         extern ssize_t pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
     95        extern ssize_t pwritev2(int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags);
    10296#endif
    10397
     
    114108        struct msghdr;
    115109        struct sockaddr;
    116         extern ssize_t sendmsg(int sockfd, const struct msghdr *msg, int flags);
    117         extern ssize_t recvmsg(int sockfd, struct msghdr *msg, int flags);
    118         extern ssize_t send(int sockfd, const void *buf, size_t len, int flags);
    119         extern ssize_t recv(int sockfd, void *buf, size_t len, int flags);
    120         extern int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
    121         extern int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
     110        extern ssize_t sendmsg(int sockfd, const struct msghdr * msg, int flags);
     111        extern ssize_t recvmsg(int sockfd, struct msghdr * msg, int flags);
     112        extern ssize_t send(int sockfd, const void * buf, size_t len, int flags);
     113        extern ssize_t recv(int sockfd, void * buf, size_t len, int flags);
    122114
    123115        extern int fallocate(int fd, int mode, off_t offset, off_t len);
    124116        extern int posix_fadvise(int fd, off_t offset, off_t len, int advice);
    125         extern int madvise(void *addr, size_t length, int advice);
    126 
    127         extern int openat(int dirfd, const char *pathname, int flags, mode_t mode);
     117        extern int madvise(void * addr, size_t length, int advice);
     118
     119        extern int openat(int dirfd, const char * pathname, int flags, mode_t mode);
    128120        extern int close(int fd);
    129121
    130         extern ssize_t read (int fd, void *buf, size_t count);
     122        extern ssize_t read (int fd, void * buf, size_t count);
    131123
    132124        struct epoll_event;
    133         extern int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
    134 
    135         extern ssize_t splice(int fd_in, __off64_t *off_in, int fd_out, __off64_t *off_out, size_t len, unsigned int flags);
     125        extern int epoll_ctl(int epfd, int op, int fd, struct epoll_event * event);
     126
     127        extern ssize_t splice(int fd_in, __off64_t * off_in, int fd_out, __off64_t * off_out, size_t len, unsigned int flags);
    136128        extern ssize_t tee(int fd_in, int fd_out, size_t len, unsigned int flags);
    137129}
     
    232224calls = [
    233225        # CFA_HAVE_IORING_OP_READV
    234         Call('READV', 'ssize_t preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags)', {
     226        Call('READV', 'ssize_t preadv2(int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags)', {
    235227                'fd'  : 'fd',
     228                'addr': '(typeof(sqe->addr))iov',
     229                'len' : 'iovcnt',
    236230                'off' : 'offset',
    237                 'addr': '(uintptr_t)iov',
    238                 'len' : 'iovcnt',
     231                'rw_flags' : 'flags'
    239232        }, define = 'CFA_HAVE_PREADV2'),
    240233        # CFA_HAVE_IORING_OP_WRITEV
    241         Call('WRITEV', 'ssize_t pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags)', {
     234        Call('WRITEV', 'ssize_t pwritev2(int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags)', {
    242235                'fd'  : 'fd',
     236                'addr': '(typeof(sqe->addr))iov',
     237                'len' : 'iovcnt',
    243238                'off' : 'offset',
    244                 'addr': '(uintptr_t)iov',
    245                 'len' : 'iovcnt'
     239                'rw_flags' : 'flags'
    246240        }, define = 'CFA_HAVE_PWRITEV2'),
    247241        # CFA_HAVE_IORING_OP_FSYNC
     
    250244        }),
    251245        # CFA_HAVE_IORING_OP_EPOLL_CTL
    252         Call('EPOLL_CTL', 'int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event)', {
     246        Call('EPOLL_CTL', 'int epoll_ctl(int epfd, int op, int fd, struct epoll_event * event)', {
    253247                'fd': 'epfd',
     248                'len': 'op',
    254249                'addr': 'fd',
    255                 'len': 'op',
    256                 'off': '(uintptr_t)event'
     250                'off': '(typeof(sqe->off))event'
    257251        }),
    258252        # CFA_HAVE_IORING_OP_SYNC_FILE_RANGE
     
    264258        }),
    265259        # CFA_HAVE_IORING_OP_SENDMSG
    266         Call('SENDMSG', 'ssize_t sendmsg(int sockfd, const struct msghdr *msg, int flags)', {
    267                 'fd': 'sockfd',
    268                 'addr': '(uintptr_t)(struct msghdr *)msg',
     260        Call('SENDMSG', 'ssize_t sendmsg(int sockfd, const struct msghdr * msg, int flags)', {
     261                'fd': 'sockfd',
     262                'addr': '(typeof(sqe->addr))(struct msghdr *)msg',
    269263                'len': '1',
    270264                'msg_flags': 'flags'
    271265        }),
    272266        # CFA_HAVE_IORING_OP_RECVMSG
    273         Call('RECVMSG', 'ssize_t recvmsg(int sockfd, struct msghdr *msg, int flags)', {
    274                 'fd': 'sockfd',
    275                 'addr': '(uintptr_t)(struct msghdr *)msg',
     267        Call('RECVMSG', 'ssize_t recvmsg(int sockfd, struct msghdr * msg, int flags)', {
     268                'fd': 'sockfd',
     269                'addr': '(typeof(sqe->addr))(struct msghdr *)msg',
    276270                'len': '1',
    277271                'msg_flags': 'flags'
    278272        }),
    279273        # CFA_HAVE_IORING_OP_SEND
    280         Call('SEND', 'ssize_t send(int sockfd, const void *buf, size_t len, int flags)', {
    281                 'fd': 'sockfd',
    282                 'addr': '(uintptr_t)buf',
     274        Call('SEND', 'ssize_t send(int sockfd, const void * buf, size_t len, int flags)', {
     275                'fd': 'sockfd',
     276                'addr': '(typeof(sqe->addr))buf',
    283277                'len': 'len',
    284278                'msg_flags': 'flags'
    285279        }),
    286280        # CFA_HAVE_IORING_OP_RECV
    287         Call('RECV', 'ssize_t recv(int sockfd, void *buf, size_t len, int flags)', {
    288                 'fd': 'sockfd',
    289                 'addr': '(uintptr_t)buf',
     281        Call('RECV', 'ssize_t recv(int sockfd, void * buf, size_t len, int flags)', {
     282                'fd': 'sockfd',
     283                'addr': '(typeof(sqe->addr))buf',
    290284                'len': 'len',
    291285                'msg_flags': 'flags'
    292286        }),
    293287        # CFA_HAVE_IORING_OP_ACCEPT
    294         Call('ACCEPT', 'int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags)', {
    295                 'fd': 'sockfd',
    296                 'addr': '(uintptr_t)addr',
    297                 'addr2': '(uintptr_t)addrlen',
     288        Call('ACCEPT', 'int accept4(int sockfd, __SOCKADDR_ARG addr, socklen_t * restrict addrlen, int flags)', {
     289                'fd': 'sockfd',
     290                'addr': '(typeof(sqe->addr))&addr',
     291                'addr2': '(typeof(sqe->addr2))addrlen',
    298292                'accept_flags': 'flags'
    299293        }),
    300294        # CFA_HAVE_IORING_OP_CONNECT
    301         Call('CONNECT', 'int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen)', {
    302                 'fd': 'sockfd',
    303                 'addr': '(uintptr_t)addr',
     295        Call('CONNECT', 'int connect(int sockfd, __CONST_SOCKADDR_ARG addr, socklen_t addrlen)', {
     296                'fd': 'sockfd',
     297                'addr': '(typeof(sqe->addr))&addr',
    304298                'off': 'addrlen'
    305299        }),
     
    307301        Call('FALLOCATE', 'int fallocate(int fd, int mode, off_t offset, off_t len)', {
    308302                'fd': 'fd',
    309                 'addr': '(uintptr_t)len',
    310303                'len': 'mode',
    311                 'off': 'offset'
     304                'off': 'offset',
     305                'addr': 'len'
    312306        }),
    313307        # CFA_HAVE_IORING_OP_FADVISE
     
    319313        }),
    320314        # CFA_HAVE_IORING_OP_MADVISE
    321         Call('MADVISE', 'int madvise(void *addr, size_t length, int advice)', {
    322                 'addr': '(uintptr_t)addr',
     315        Call('MADVISE', 'int madvise(void * addr, size_t length, int advice)', {
     316                'addr': '(typeof(sqe->addr))addr',
    323317                'len': 'length',
    324318                'fadvise_advice': 'advice'
    325319        }),
    326320        # CFA_HAVE_IORING_OP_OPENAT
    327         Call('OPENAT', 'int openat(int dirfd, const char *pathname, int flags, mode_t mode)', {
     321        Call('OPENAT', 'int openat(int dirfd, const char * pathname, int flags, mode_t mode)', {
    328322                'fd': 'dirfd',
    329                 'addr': '(uintptr_t)pathname',
    330                 'len': 'mode',
    331                 'open_flags': 'flags;'
     323                'addr': '(typeof(sqe->addr))pathname',
     324                'open_flags': 'flags;',
     325                'len': 'mode'
    332326        }),
    333327        # CFA_HAVE_IORING_OP_OPENAT2
    334         Call('OPENAT2', 'int openat2(int dirfd, const char *pathname, struct open_how * how, size_t size)', {
     328        Call('OPENAT2', 'int openat2(int dirfd, const char * pathname, struct open_how * how, size_t size)', {
    335329                'fd': 'dirfd',
    336                 'addr': 'pathname',
    337                 'len': 'sizeof(*how)',
    338                 'off': '(uintptr_t)how',
     330                'addr': '(typeof(sqe->addr))pathname',
     331                'off': '(typeof(sqe->off))how',
     332                'len': 'sizeof(*how)'
    339333        }, define = 'CFA_HAVE_OPENAT2'),
    340334        # CFA_HAVE_IORING_OP_CLOSE
     
    343337        }),
    344338        # CFA_HAVE_IORING_OP_STATX
    345         Call('STATX', 'int statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf)', {
     339        Call('STATX', 'int statx(int dirfd, const char * pathname, int flags, unsigned int mask, struct statx * statxbuf)', {
    346340                'fd': 'dirfd',
    347                 'off': '(uintptr_t)statxbuf',
    348                 'addr': 'pathname',
     341                'addr': '(typeof(sqe->addr))pathname',
     342                'statx_flags': 'flags',
    349343                'len': 'mask',
    350                 'statx_flags': 'flags'
     344                'off': '(typeof(sqe->off))statxbuf'
    351345        }, define = 'CFA_HAVE_STATX'),
    352346        # CFA_HAVE_IORING_OP_READ
    353347        Call('READ', 'ssize_t read(int fd, void * buf, size_t count)', {
    354348                'fd': 'fd',
    355                 'addr': '(uintptr_t)buf',
     349                'addr': '(typeof(sqe->addr))buf',
    356350                'len': 'count'
    357351        }),
     
    359353        Call('WRITE', 'ssize_t write(int fd, void * buf, size_t count)', {
    360354                'fd': 'fd',
    361                 'addr': '(uintptr_t)buf',
     355                'addr': '(typeof(sqe->addr))buf',
    362356                'len': 'count'
    363357        }),
    364358        # CFA_HAVE_IORING_OP_SPLICE
    365         Call('SPLICE', 'ssize_t splice(int fd_in, __off64_t *off_in, int fd_out, __off64_t *off_out, size_t len, unsigned int flags)', {
     359        Call('SPLICE', 'ssize_t splice(int fd_in, __off64_t * off_in, int fd_out, __off64_t * off_out, size_t len, unsigned int flags)', {
    366360                'splice_fd_in': 'fd_in',
    367                 'splice_off_in': 'off_in ? (__u64)*off_in : (__u64)-1',
     361                'splice_off_in': 'off_in ? (typeof(sqe->splice_off_in))*off_in : (typeof(sqe->splice_off_in))-1',
    368362                'fd': 'fd_out',
    369                 'off': 'off_out ? (__u64)*off_out : (__u64)-1',
     363                'off': 'off_out ? (typeof(sqe->off))*off_out : (typeof(sqe->off))-1',
    370364                'len': 'len',
    371365                'splice_flags': 'flags'
  • libcfa/src/concurrency/io/setup.cfa

    r2ed94a9 rb110bcc  
    1515
    1616#define __cforall_thread__
    17 #define _GNU_SOURCE
    1817
    1918#if defined(__CFA_DEBUG__)
  • libcfa/src/concurrency/iofwd.hfa

    r2ed94a9 rb110bcc  
    99// Author           : Thierry Delisle
    1010// Created On       : Thu Apr 23 17:31:00 2020
    11 // Last Modified By :
    12 // Last Modified On :
    13 // Update Count     :
     11// Last Modified By : Peter A. Buhr
     12// Last Modified On : Mon Mar 13 23:54:57 2023
     13// Update Count     : 1
    1414//
    1515
     
    1717
    1818#include <unistd.h>
     19#include <sys/socket.h>
     20
    1921extern "C" {
    2022        #include <asm/types.h>
     
    4850typedef __off64_t off64_t;
    4951
    50 struct cluster;
    51 struct io_context$;
    52 
    53 struct iovec;
    54 struct msghdr;
    55 struct sockaddr;
    56 struct statx;
    5752struct epoll_event;
    58 
    59 struct io_uring_sqe;
    6053
    6154//-----------------------------------------------------------------------
     
    8881// synchronous calls
    8982#if defined(CFA_HAVE_PREADV2)
    90         extern ssize_t cfa_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);
     83        extern ssize_t cfa_preadv2(int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);
    9184#endif
    9285#if defined(CFA_HAVE_PWRITEV2)
    93         extern ssize_t cfa_pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);
     86        extern ssize_t cfa_pwritev2(int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);
    9487#endif
    9588extern int cfa_fsync(int fd, __u64 submit_flags);
    96 extern int cfa_epoll_ctl(int epfd, int op, int fd, struct epoll_event *event, __u64 submit_flags);
     89extern int cfa_epoll_ctl(int epfd, int op, int fd, struct epoll_event * event, __u64 submit_flags);
    9790extern int cfa_sync_file_range(int fd, off64_t offset, off64_t nbytes, unsigned int flags, __u64 submit_flags);
    98 extern  ssize_t cfa_sendmsg(int sockfd, const struct msghdr *msg, int flags, __u64 submit_flags);
    99 extern ssize_t cfa_recvmsg(int sockfd, struct msghdr *msg, int flags, __u64 submit_flags);
    100 extern ssize_t cfa_send(int sockfd, const void *buf, size_t len, int flags, __u64 submit_flags);
    101 extern ssize_t cfa_recv(int sockfd, void *buf, size_t len, int flags, __u64 submit_flags);
    102 extern int cfa_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags, __u64 submit_flags);
    103 extern int cfa_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen, __u64 submit_flags);
     91extern  ssize_t cfa_sendmsg(int sockfd, const struct msghdr * msg, int flags, __u64 submit_flags);
     92extern ssize_t cfa_recvmsg(int sockfd, struct msghdr * msg, int flags, __u64 submit_flags);
     93extern ssize_t cfa_send(int sockfd, const void * buf, size_t len, int flags, __u64 submit_flags);
     94extern ssize_t cfa_recv(int sockfd, void * buf, size_t len, int flags, __u64 submit_flags);
     95extern int cfa_accept4(int sockfd, __SOCKADDR_ARG addr, socklen_t * restrict addrlen, int flags, __u64 submit_flags);
     96extern int cfa_connect(int sockfd, __CONST_SOCKADDR_ARG addr, socklen_t addrlen, __u64 submit_flags);
    10497extern int cfa_fallocate(int fd, int mode, off_t offset, off_t len, __u64 submit_flags);
    10598extern int cfa_posix_fadvise(int fd, off_t offset, off_t len, int advice, __u64 submit_flags);
    106 extern int cfa_madvise(void *addr, size_t length, int advice, __u64 submit_flags);
    107 extern int cfa_openat(int dirfd, const char *pathname, int flags, mode_t mode, __u64 submit_flags);
     99extern int cfa_madvise(void * addr, size_t length, int advice, __u64 submit_flags);
     100extern int cfa_openat(int dirfd, const char * pathname, int flags, mode_t mode, __u64 submit_flags);
    108101#if defined(CFA_HAVE_OPENAT2)
    109         extern int cfa_openat2(int dirfd, const char *pathname, struct open_how * how, size_t size, __u64 submit_flags);
     102        extern int cfa_openat2(int dirfd, const char * pathname, struct open_how * how, size_t size, __u64 submit_flags);
    110103#endif
    111104extern int cfa_close(int fd, __u64 submit_flags);
    112105#if defined(CFA_HAVE_STATX)
    113         extern int cfa_statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf, __u64 submit_flags);
     106        extern int cfa_statx(int dirfd, const char * pathname, int flags, unsigned int mask, struct statx * statxbuf, __u64 submit_flags);
    114107#endif
    115108extern ssize_t cfa_read(int fd, void * buf, size_t count, __u64 submit_flags);
    116109extern ssize_t cfa_write(int fd, void * buf, size_t count, __u64 submit_flags);
    117 extern ssize_t cfa_splice(int fd_in, __off64_t *off_in, int fd_out, __off64_t *off_out, size_t len, unsigned int flags, __u64 submit_flags);
     110extern ssize_t cfa_splice(int fd_in, __off64_t * off_in, int fd_out, __off64_t * off_out, size_t len, unsigned int flags, __u64 submit_flags);
    118111extern ssize_t cfa_tee(int fd_in, int fd_out, size_t len, unsigned int flags, __u64 submit_flags);
    119112
     
    121114// asynchronous calls
    122115#if defined(CFA_HAVE_PREADV2)
    123         extern void async_preadv2(io_future_t & future, int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);
     116        extern void async_preadv2(io_future_t & future, int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);
    124117#endif
    125118#if defined(CFA_HAVE_PWRITEV2)
    126         extern void async_pwritev2(io_future_t & future, int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);
     119        extern void async_pwritev2(io_future_t & future, int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);
    127120#endif
    128121extern void async_fsync(io_future_t & future, int fd, __u64 submit_flags);
    129 extern void async_epoll_ctl(io_future_t & future, int epfd, int op, int fd, struct epoll_event *event, __u64 submit_flags);
     122extern void async_epoll_ctl(io_future_t & future, int epfd, int op, int fd, struct epoll_event * event, __u64 submit_flags);
    130123extern void async_sync_file_range(io_future_t & future, int fd, off64_t offset, off64_t nbytes, unsigned int flags, __u64 submit_flags);
    131 extern void async_sendmsg(io_future_t & future, int sockfd, const struct msghdr *msg, int flags, __u64 submit_flags);
    132 extern void async_recvmsg(io_future_t & future, int sockfd, struct msghdr *msg, int flags, __u64 submit_flags);
    133 extern void async_send(io_future_t & future, int sockfd, const void *buf, size_t len, int flags, __u64 submit_flags);
    134 extern void async_recv(io_future_t & future, int sockfd, void *buf, size_t len, int flags, __u64 submit_flags);
    135 extern void async_accept4(io_future_t & future, int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags, __u64 submit_flags);
    136 extern void async_connect(io_future_t & future, int sockfd, const struct sockaddr *addr, socklen_t addrlen, __u64 submit_flags);
     124extern void async_sendmsg(io_future_t & future, int sockfd, const struct msghdr * msg, int flags, __u64 submit_flags);
     125extern void async_recvmsg(io_future_t & future, int sockfd, struct msghdr * msg, int flags, __u64 submit_flags);
     126extern void async_send(io_future_t & future, int sockfd, const void * buf, size_t len, int flags, __u64 submit_flags);
     127extern void async_recv(io_future_t & future, int sockfd, void * buf, size_t len, int flags, __u64 submit_flags);
     128extern void async_accept4(io_future_t & future, int sockfd, __SOCKADDR_ARG addr, socklen_t * restrict addrlen, int flags, __u64 submit_flags);
     129extern void async_connect(io_future_t & future, int sockfd, __CONST_SOCKADDR_ARG addr, socklen_t addrlen, __u64 submit_flags);
    137130extern void async_fallocate(io_future_t & future, int fd, int mode, off_t offset, off_t len, __u64 submit_flags);
    138131extern void async_posix_fadvise(io_future_t & future, int fd, off_t offset, off_t len, int advice, __u64 submit_flags);
    139 extern void async_madvise(io_future_t & future, void *addr, size_t length, int advice, __u64 submit_flags);
    140 extern void async_openat(io_future_t & future, int dirfd, const char *pathname, int flags, mode_t mode, __u64 submit_flags);
     132extern void async_madvise(io_future_t & future, void * addr, size_t length, int advice, __u64 submit_flags);
     133extern void async_openat(io_future_t & future, int dirfd, const char * pathname, int flags, mode_t mode, __u64 submit_flags);
    141134#if defined(CFA_HAVE_OPENAT2)
    142         extern void async_openat2(io_future_t & future, int dirfd, const char *pathname, struct open_how * how, size_t size, __u64 submit_flags);
     135        extern void async_openat2(io_future_t & future, int dirfd, const char * pathname, struct open_how * how, size_t size, __u64 submit_flags);
    143136#endif
    144137extern void async_close(io_future_t & future, int fd, __u64 submit_flags);
    145138#if defined(CFA_HAVE_STATX)
    146         extern void async_statx(io_future_t & future, int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf, __u64 submit_flags);
     139        extern void async_statx(io_future_t & future, int dirfd, const char * pathname, int flags, unsigned int mask, struct statx * statxbuf, __u64 submit_flags);
    147140#endif
    148141void async_read(io_future_t & future, int fd, void * buf, size_t count, __u64 submit_flags);
    149142extern void async_write(io_future_t & future, int fd, void * buf, size_t count, __u64 submit_flags);
    150 extern void async_splice(io_future_t & future, int fd_in, __off64_t *off_in, int fd_out, __off64_t *off_out, size_t len, unsigned int flags, __u64 submit_flags);
     143extern void async_splice(io_future_t & future, int fd_in, __off64_t * off_in, int fd_out, __off64_t * off_out, size_t len, unsigned int flags, __u64 submit_flags);
    151144extern void async_tee(io_future_t & future, int fd_in, int fd_out, size_t len, unsigned int flags, __u64 submit_flags);
    152145
  • libcfa/src/concurrency/kernel.cfa

    r2ed94a9 rb110bcc  
    1010// Created On       : Tue Jan 17 12:27:26 2017
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Wed Nov 30 18:14:08 2022
    13 // Update Count     : 76
     12// Last Modified On : Mon Jan  9 08:42:05 2023
     13// Update Count     : 77
    1414//
    1515
    1616#define __cforall_thread__
    17 #define _GNU_SOURCE
    1817
    1918// #define __CFA_DEBUG_PRINT_RUNTIME_CORE__
  • libcfa/src/concurrency/kernel/cluster.cfa

    r2ed94a9 rb110bcc  
    1515
    1616#define __cforall_thread__
    17 #define _GNU_SOURCE
    1817
    1918#include "bits/defs.hfa"
     
    6968        return max_cores_l;
    7069}
    71 
    72 #if   defined(CFA_HAVE_LINUX_LIBRSEQ)
    73         // No forward declaration needed
    74         #define __kernel_rseq_register rseq_register_current_thread
    75         #define __kernel_rseq_unregister rseq_unregister_current_thread
    76 #elif defined(CFA_HAVE_LINUX_RSEQ_H)
    77         static void __kernel_raw_rseq_register  (void);
    78         static void __kernel_raw_rseq_unregister(void);
    79 
    80         #define __kernel_rseq_register __kernel_raw_rseq_register
    81         #define __kernel_rseq_unregister __kernel_raw_rseq_unregister
    82 #else
    83         // No forward declaration needed
    84         // No initialization needed
    85         static inline void noop(void) {}
    86 
    87         #define __kernel_rseq_register noop
    88         #define __kernel_rseq_unregister noop
    89 #endif
    9070
    9171//=======================================================================
     
    11191// Lock-Free registering/unregistering of threads
    11292unsigned register_proc_id( void ) with(__scheduler_lock.lock) {
    113         __kernel_rseq_register();
    114 
    11593        bool * handle = (bool *)&kernelTLS().sched_lock;
    11694
     
    162140
    163141        __atomic_store_n(cell, 0p, __ATOMIC_RELEASE);
    164 
    165         __kernel_rseq_unregister();
    166142}
    167143
     
    505481        /* paranoid */ verify( mock_head(this)    == this.l.prev );
    506482}
    507 
    508 #if   defined(CFA_HAVE_LINUX_LIBRSEQ)
    509         // No definition needed
    510 #elif defined(CFA_HAVE_LINUX_RSEQ_H)
    511 
    512         #if defined( __x86_64 ) || defined( __i386 )
    513                 #define RSEQ_SIG        0x53053053
    514         #elif defined( __ARM_ARCH )
    515                 #ifdef __ARMEB__
    516                 #define RSEQ_SIG    0xf3def5e7      /* udf    #24035    ; 0x5de3 (ARMv6+) */
    517                 #else
    518                 #define RSEQ_SIG    0xe7f5def3      /* udf    #24035    ; 0x5de3 */
    519                 #endif
    520         #endif
    521 
    522         extern void __disable_interrupts_hard();
    523         extern void __enable_interrupts_hard();
    524 
    525         static void __kernel_raw_rseq_register  (void) {
    526                 /* paranoid */ verify( __cfaabi_rseq.cpu_id == RSEQ_CPU_ID_UNINITIALIZED );
    527 
    528                 // int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), 0, (sigset_t *)0p, _NSIG / 8);
    529                 int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), 0, RSEQ_SIG);
    530                 if(ret != 0) {
    531                         int e = errno;
    532                         switch(e) {
    533                         case EINVAL: abort("KERNEL ERROR: rseq register invalid argument");
    534                         case ENOSYS: abort("KERNEL ERROR: rseq register no supported");
    535                         case EFAULT: abort("KERNEL ERROR: rseq register with invalid argument");
    536                         case EBUSY : abort("KERNEL ERROR: rseq register already registered");
    537                         case EPERM : abort("KERNEL ERROR: rseq register sig  argument  on unregistration does not match the signature received on registration");
    538                         default: abort("KERNEL ERROR: rseq register unexpected return %d", e);
    539                         }
    540                 }
    541         }
    542 
    543         static void __kernel_raw_rseq_unregister(void) {
    544                 /* paranoid */ verify( __cfaabi_rseq.cpu_id >= 0 );
    545 
    546                 // int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, (sigset_t *)0p, _NSIG / 8);
    547                 int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
    548                 if(ret != 0) {
    549                         int e = errno;
    550                         switch(e) {
    551                         case EINVAL: abort("KERNEL ERROR: rseq unregister invalid argument");
    552                         case ENOSYS: abort("KERNEL ERROR: rseq unregister no supported");
    553                         case EFAULT: abort("KERNEL ERROR: rseq unregister with invalid argument");
    554                         case EBUSY : abort("KERNEL ERROR: rseq unregister already registered");
    555                         case EPERM : abort("KERNEL ERROR: rseq unregister sig  argument  on unregistration does not match the signature received on registration");
    556                         default: abort("KERNEL ERROR: rseq unregisteunexpected return %d", e);
    557                         }
    558                 }
    559         }
    560 #else
    561         // No definition needed
    562 #endif
  • libcfa/src/concurrency/kernel/cluster.hfa

    r2ed94a9 rb110bcc  
    146146}
    147147
    148 static struct {
    149         const unsigned readyq;
    150         const unsigned io;
     148const static struct {
     149        unsigned readyq;
     150        unsigned io;
    151151} __shard_factor = { 2, 1 };
    152152
  • libcfa/src/concurrency/kernel/private.hfa

    r2ed94a9 rb110bcc  
    1010// Created On       : Mon Feb 13 12:27:26 2017
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Wed Aug 12 08:21:33 2020
    13 // Update Count     : 9
     12// Last Modified On : Thu Mar  2 16:04:46 2023
     13// Update Count     : 11
    1414//
    1515
     
    2929
    3030extern "C" {
    31 #if   defined(CFA_HAVE_LINUX_LIBRSEQ)
    32         #include <rseq/rseq.h>
    33 #elif defined(CFA_HAVE_LINUX_RSEQ_H)
    34         #include <linux/rseq.h>
    35 #else
    36         #ifndef _GNU_SOURCE
    37         #error kernel/private requires gnu_source
    38         #endif
    3931        #include <sched.h>
    40 #endif
    4132}
    4233
     
    110101// Hardware
    111102
    112 #if   defined(CFA_HAVE_LINUX_LIBRSEQ)
    113         // No data needed
    114 #elif defined(CFA_HAVE_LINUX_RSEQ_H)
    115         extern "Cforall" {
    116                 extern __attribute__((aligned(64))) __thread volatile struct rseq __cfaabi_rseq;
    117         }
    118 #else
    119         // No data needed
    120 #endif
    121 
    122103static inline int __kernel_getcpu() {
    123104        /* paranoid */ verify( ! __preemption_enabled() );
    124 #if   defined(CFA_HAVE_LINUX_LIBRSEQ)
    125         return rseq_current_cpu();
    126 #elif defined(CFA_HAVE_LINUX_RSEQ_H)
    127         int r = __cfaabi_rseq.cpu_id;
    128         /* paranoid */ verify( r >= 0 );
    129         return r;
    130 #else
    131105        return sched_getcpu();
    132 #endif
    133106}
    134107
  • libcfa/src/concurrency/kernel/startup.cfa

    r2ed94a9 rb110bcc  
    1515
    1616#define __cforall_thread__
    17 #define _GNU_SOURCE
    1817
    1918// #define __CFA_DEBUG_PRINT_RUNTIME_CORE__
    2019
    2120// C Includes
    22 #include <errno.h>                                      // errno
     21#include <errno.h>                                                                              // errno
    2322#include <signal.h>
    24 #include <string.h>                                     // strerror
    25 #include <unistd.h>                                     // sysconf
    26 
     23#include <string.h>                                                                             // strerror
     24#include <unistd.h>
     25#include <limits.h>                                                                             // PTHREAD_STACK_MIN
    2726extern "C" {
    28         #include <limits.h>                             // PTHREAD_STACK_MIN
    29         #include <unistd.h>                             // syscall
    30         #include <sys/eventfd.h>                        // eventfd
    31         #include <sys/mman.h>                           // mprotect
    32         #include <sys/resource.h>                       // getrlimit
     27        #include <sys/eventfd.h>                                                        // eventfd
     28        #include <sys/mman.h>                                                           // mprotect
     29        #include <sys/resource.h>                                                       // getrlimit
    3330}
    3431
     
    3633#include "kernel/private.hfa"
    3734#include "iofwd.hfa"
    38 #include "startup.hfa"                                  // STARTUP_PRIORITY_XXX
     35#include "startup.hfa"                                                                  // STARTUP_PRIORITY_XXX
    3936#include "limits.hfa"
    4037#include "math.hfa"
     
    150147__scheduler_RWLock_t __scheduler_lock @= { 0 };
    151148
    152 #if   defined(CFA_HAVE_LINUX_LIBRSEQ)
    153         // No data needed
    154 #elif defined(CFA_HAVE_LINUX_RSEQ_H)
    155         extern "Cforall" {
    156                 __attribute__((aligned(64))) __thread volatile struct rseq __cfaabi_rseq @= {
    157                         .cpu_id : RSEQ_CPU_ID_UNINITIALIZED,
    158                 };
    159         }
    160 #else
    161         // No data needed
    162 #endif
    163 
    164149//-----------------------------------------------------------------------------
    165150// Struct to steal stack
  • libcfa/src/concurrency/locks.cfa

    r2ed94a9 rb110bcc  
    1616
    1717#define __cforall_thread__
    18 #define _GNU_SOURCE
    1918
    2019#include "locks.hfa"
  • libcfa/src/concurrency/locks.hfa

    r2ed94a9 rb110bcc  
    3232#include <fstream.hfa>
    3333
    34 
    3534// futex headers
    3635#include <linux/futex.h>      /* Definition of FUTEX_* constants */
     
    155154// futex_mutex
    156155
    157 // - No cond var support
    158156// - Kernel thd blocking alternative to the spinlock
    159157// - No ownership (will deadlock on reacq)
     
    185183        int state;
    186184
    187        
    188         // // linear backoff omitted for now
    189         // for( int spin = 4; spin < 1024; spin += spin) {
    190         //      state = 0;
    191         //      // if unlocked, lock and return
    192         //      if (internal_try_lock(this, state)) return;
    193         //      if (2 == state) break;
    194         //      for (int i = 0; i < spin; i++) Pause();
    195         // }
    196 
    197         // no contention try to acquire
    198         if (internal_try_lock(this, state)) return;
     185        for( int spin = 4; spin < 1024; spin += spin) {
     186                state = 0;
     187                // if unlocked, lock and return
     188                if (internal_try_lock(this, state)) return;
     189                if (2 == state) break;
     190                for (int i = 0; i < spin; i++) Pause();
     191        }
     192
     193        // // no contention try to acquire
     194        // if (internal_try_lock(this, state)) return;
    199195       
    200196        // if not in contended state, set to be in contended state
     
    209205
    210206static inline void unlock(futex_mutex & this) with(this) {
    211         // if uncontended do atomice unlock and then return
    212         if (__atomic_fetch_sub(&val, 1, __ATOMIC_RELEASE) == 1) return; // TODO: try acq/rel
     207        // if uncontended do atomic unlock and then return
     208    if (__atomic_exchange_n(&val, 0, __ATOMIC_RELEASE) == 1) return;
    213209       
    214210        // otherwise threads are blocked so we must wake one
    215         __atomic_store_n((int *)&val, 0, __ATOMIC_RELEASE);
    216211        futex((int *)&val, FUTEX_WAKE, 1);
    217212}
     
    222217// to set recursion count after getting signalled;
    223218static inline void on_wakeup( futex_mutex & f, size_t recursion ) {}
     219
     220//-----------------------------------------------------------------------------
     221// go_mutex
     222
     223// - Kernel thd blocking alternative to the spinlock
     224// - No ownership (will deadlock on reacq)
     225// - Golang's flavour of mutex
     226// - Impl taken from Golang: src/runtime/lock_futex.go
     227struct go_mutex {
     228        // lock state any state other than UNLOCKED is locked
     229        // enum LockState { UNLOCKED = 0, LOCKED = 1, SLEEPING = 2 };
     230       
     231        // stores a lock state
     232        int val;
     233};
     234
     235static inline void  ?{}( go_mutex & this ) with(this) { val = 0; }
     236
     237static inline bool internal_try_lock(go_mutex & this, int & compare_val, int new_val ) with(this) {
     238        return __atomic_compare_exchange_n((int*)&val, (int*)&compare_val, new_val, false, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE);
     239}
     240
     241static inline int internal_exchange(go_mutex & this, int swap ) with(this) {
     242        return __atomic_exchange_n((int*)&val, swap, __ATOMIC_ACQUIRE);
     243}
     244
     245// if this is called recursively IT WILL DEADLOCK!!!!!
     246static inline void lock(go_mutex & this) with(this) {
     247        int state, init_state;
     248
     249    // speculative grab
     250    state = internal_exchange(this, 1);
     251    if ( !state ) return; // state == 0
     252    init_state = state;
     253    for (;;) {
     254        for( int i = 0; i < 4; i++ ) {
     255            while( !val ) { // lock unlocked
     256                state = 0;
     257                if (internal_try_lock(this, state, init_state)) return;
     258            }
     259            for (int i = 0; i < 30; i++) Pause();
     260        }
     261
     262        while( !val ) { // lock unlocked
     263            state = 0;
     264            if (internal_try_lock(this, state, init_state)) return;
     265        }
     266        sched_yield();
     267       
     268        // if not in contended state, set to be in contended state
     269        state = internal_exchange(this, 2);
     270        if ( !state ) return; // state == 0
     271        init_state = 2;
     272        futex((int*)&val, FUTEX_WAIT, 2); // if val is not 2 this returns with EWOULDBLOCK
     273    }
     274}
     275
     276static inline void unlock( go_mutex & this ) with(this) {
     277        // if uncontended do atomic unlock and then return
     278    if (__atomic_exchange_n(&val, 0, __ATOMIC_RELEASE) == 1) return;
     279       
     280        // otherwise threads are blocked so we must wake one
     281        futex((int *)&val, FUTEX_WAKE, 1);
     282}
     283
     284static inline void on_notify( go_mutex & f, thread$ * t){ unpark(t); }
     285static inline size_t on_wait( go_mutex & f ) {unlock(f); return 0;}
     286static inline void on_wakeup( go_mutex & f, size_t recursion ) {}
    224287
    225288//-----------------------------------------------------------------------------
     
    253316static inline void on_wakeup(clh_lock & this, size_t recursion ) { lock(this); }
    254317
    255 
    256318//-----------------------------------------------------------------------------
    257319// Exponential backoff then block lock
     
    272334        this.lock_value = 0;
    273335}
    274 static inline void ^?{}( exp_backoff_then_block_lock & this ) {}
    275 // static inline void ?{}( exp_backoff_then_block_lock & this, exp_backoff_then_block_lock this2 ) = void;
    276 // static inline void ?=?( exp_backoff_then_block_lock & this, exp_backoff_then_block_lock this2 ) = void;
     336
     337static inline void  ^?{}( exp_backoff_then_block_lock & this ){}
    277338
    278339static inline bool internal_try_lock(exp_backoff_then_block_lock & this, size_t & compare_val) with(this) {
    279         if (__atomic_compare_exchange_n(&lock_value, &compare_val, 1, false, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
    280                 return true;
    281         }
    282         return false;
     340        return __atomic_compare_exchange_n(&lock_value, &compare_val, 1, false, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
    283341}
    284342
     
    286344
    287345static inline bool try_lock_contention(exp_backoff_then_block_lock & this) with(this) {
    288         if (__atomic_exchange_n(&lock_value, 2, __ATOMIC_ACQUIRE) == 0) {
    289                 return true;
    290         }
    291         return false;
     346        return !__atomic_exchange_n(&lock_value, 2, __ATOMIC_ACQUIRE);
    292347}
    293348
    294349static inline bool block(exp_backoff_then_block_lock & this) with(this) {
    295         lock( spinlock __cfaabi_dbg_ctx2 ); // TODO change to lockfree queue (MPSC)
    296         if (lock_value != 2) {
    297                 unlock( spinlock );
    298                 return true;
    299         }
    300         insert_last( blocked_threads, *active_thread() );
    301         unlock( spinlock );
     350    lock( spinlock __cfaabi_dbg_ctx2 );
     351    if (__atomic_load_n( &lock_value, __ATOMIC_SEQ_CST) != 2) {
     352        unlock( spinlock );
     353        return true;
     354    }
     355    insert_last( blocked_threads, *active_thread() );
     356    unlock( spinlock );
    302357        park( );
    303358        return true;
     
    307362        size_t compare_val = 0;
    308363        int spin = 4;
     364
    309365        // linear backoff
    310366        for( ;; ) {
     
    324380static inline void unlock(exp_backoff_then_block_lock & this) with(this) {
    325381    if (__atomic_exchange_n(&lock_value, 0, __ATOMIC_RELEASE) == 1) return;
    326         lock( spinlock __cfaabi_dbg_ctx2 );
    327         thread$ * t = &try_pop_front( blocked_threads );
    328         unlock( spinlock );
    329         unpark( t );
     382    lock( spinlock __cfaabi_dbg_ctx2 );
     383    thread$ * t = &try_pop_front( blocked_threads );
     384    unlock( spinlock );
     385    unpark( t );
    330386}
    331387
  • libcfa/src/concurrency/monitor.cfa

    r2ed94a9 rb110bcc  
    1010// Created On       : Thd Feb 23 12:27:26 2017
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Wed Dec  4 07:55:14 2019
    13 // Update Count     : 10
     12// Last Modified On : Sun Feb 19 17:00:59 2023
     13// Update Count     : 12
    1414//
    1515
    1616#define __cforall_thread__
    17 #define _GNU_SOURCE
    1817
    1918#include "monitor.hfa"
  • libcfa/src/concurrency/mutex.cfa

    r2ed94a9 rb110bcc  
    1212// Created On       : Fri May 25 01:37:11 2018
    1313// Last Modified By : Peter A. Buhr
    14 // Last Modified On : Wed Dec  4 09:16:39 2019
    15 // Update Count     : 1
     14// Last Modified On : Sun Feb 19 17:01:36 2023
     15// Update Count     : 3
    1616//
    1717
    1818#define __cforall_thread__
    19 #define _GNU_SOURCE
    2019
    2120#include "mutex.hfa"
  • libcfa/src/concurrency/mutex_stmt.hfa

    r2ed94a9 rb110bcc  
     1#pragma once
     2
    13#include "bits/algorithm.hfa"
    24#include "bits/defs.hfa"
     
    46//-----------------------------------------------------------------------------
    57// is_lock
    6 trait is_lock(L & | sized(L)) {
     8forall(L & | sized(L))
     9trait is_lock {
    710        // For acquiring a lock
    811        void lock( L & );
     
    2427    // Sort locks based on address
    2528    __libcfa_small_sort(this.lockarr, count);
    26 
    27     // acquire locks in order
    28     // for ( size_t i = 0; i < count; i++ ) {
    29     //     lock(*this.lockarr[i]);
    30     // }
    31 }
    32 
    33 static inline void ^?{}( __mutex_stmt_lock_guard & this ) with(this) {
    34     // for ( size_t i = count; i > 0; i-- ) {
    35     //     unlock(*lockarr[i - 1]);
    36     // }
    3729}
    3830
  • libcfa/src/concurrency/preemption.cfa

    r2ed94a9 rb110bcc  
    1010// Created On       : Mon Jun 5 14:20:42 2017
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Thu Feb 17 11:18:57 2022
    13 // Update Count     : 59
     12// Last Modified On : Mon Jan  9 08:42:59 2023
     13// Update Count     : 60
    1414//
    1515
    1616#define __cforall_thread__
    17 #define _GNU_SOURCE
    1817
    1918// #define __CFA_DEBUG_PRINT_PREEMPTION__
  • libcfa/src/concurrency/pthread.cfa

    r2ed94a9 rb110bcc  
    1515
    1616#define __cforall_thread__
    17 #define _GNU_SOURCE
    1817
    1918#include <signal.h>
     
    3534struct pthread_values{
    3635        inline Seqable;
    37         void* value;
     36        void * value;
    3837        bool in_use;
    3938};
     
    5150struct pthread_keys {
    5251        bool in_use;
    53         void (*destructor)( void * );
     52        void (* destructor)( void * );
    5453        Sequence(pthread_values) threads;
    5554};
    5655
    57 static void ?{}(pthread_keys& k){
     56static void ?{}(pthread_keys& k) {
    5857        k.threads{};
    5958}
     
    6261static pthread_keys cfa_pthread_keys_storage[PTHREAD_KEYS_MAX] __attribute__((aligned (16)));
    6362
    64 static void init_pthread_storage(){
    65         for (int i = 0; i < PTHREAD_KEYS_MAX; i++){
     63static void init_pthread_storage() {
     64        for ( int i = 0; i < PTHREAD_KEYS_MAX; i++ ) {
    6665                cfa_pthread_keys_storage[i]{};
    6766        }
     
    9695
    9796/* condvar helper routines */
    98 static void init(pthread_cond_t* pcond){
     97static void init(pthread_cond_t * pcond) {
    9998        static_assert(sizeof(pthread_cond_t) >= sizeof(cfa2pthr_cond_var_t),"sizeof(pthread_t) < sizeof(cfa2pthr_cond_var_t)");
    100         cfa2pthr_cond_var_t* _cond = (cfa2pthr_cond_var_t*)pcond;
     99        cfa2pthr_cond_var_t * _cond = (cfa2pthr_cond_var_t *)pcond;
    101100        ?{}(*_cond);
    102101}
    103102
    104 static cfa2pthr_cond_var_t* get(pthread_cond_t* pcond){
     103static cfa2pthr_cond_var_t * get(pthread_cond_t * pcond) {
    105104        static_assert(sizeof(pthread_cond_t) >= sizeof(cfa2pthr_cond_var_t),"sizeof(pthread_t) < sizeof(cfa2pthr_cond_var_t)");
    106         return (cfa2pthr_cond_var_t*)pcond;
    107 }
    108 
    109 static void destroy(pthread_cond_t* cond){
     105        return (cfa2pthr_cond_var_t *)pcond;
     106}
     107
     108static void destroy(pthread_cond_t * cond) {
    110109        static_assert(sizeof(pthread_cond_t) >= sizeof(cfa2pthr_cond_var_t),"sizeof(pthread_t) < sizeof(cfa2pthr_cond_var_t)");
    111110        ^?{}(*get(cond));
     
    116115
    117116/* mutex helper routines */
    118 static void mutex_check(pthread_mutex_t* t){
     117static void mutex_check(pthread_mutex_t * t) {
    119118        // Use double check to improve performance.
    120119        // Check is safe on x86; volatile prevents compiler reordering
    121         volatile pthread_mutex_t *const mutex_ = t;
     120        volatile pthread_mutex_t * const mutex_ = t;
    122121
    123122        // SKULLDUGGERY: not a portable way to access the kind field, /usr/include/x86_64-linux-gnu/bits/pthreadtypes.h
     
    136135
    137136
    138 static void init(pthread_mutex_t* plock){
     137static void init(pthread_mutex_t * plock) {
    139138        static_assert(sizeof(pthread_mutex_t) >= sizeof(simple_owner_lock),"sizeof(pthread_mutex_t) < sizeof(simple_owner_lock)");
    140         simple_owner_lock* _lock = (simple_owner_lock*)plock;
     139        simple_owner_lock * _lock = (simple_owner_lock *)plock;
    141140        ?{}(*_lock);
    142141}
    143142
    144 static simple_owner_lock* get(pthread_mutex_t* plock){
     143static simple_owner_lock * get(pthread_mutex_t * plock) {
    145144        static_assert(sizeof(pthread_mutex_t) >= sizeof(simple_owner_lock),"sizeof(pthread_mutex_t) < sizeof(simple_owner_lock)");
    146         return (simple_owner_lock*)plock;
    147 }
    148 
    149 static void destroy(pthread_mutex_t* plock){
     145        return (simple_owner_lock *)plock;
     146}
     147
     148static void destroy(pthread_mutex_t * plock) {
    150149        static_assert(sizeof(pthread_mutex_t) >= sizeof(simple_owner_lock),"sizeof(pthread_mutex_t) < sizeof(simple_owner_lock)");
    151150        ^?{}(*get(plock));
     
    153152
    154153//######################### Attr helpers #########################
    155 struct cfaPthread_attr_t {                                                              // thread attributes
     154typedef struct cfaPthread_attr_t {                                              // thread attributes
    156155                int contentionscope;
    157156                int detachstate;
    158157                size_t stacksize;
    159                 void *stackaddr;
     158                void * stackaddr;
    160159                int policy;
    161160                int inheritsched;
    162161                struct sched_param param;
    163 } typedef cfaPthread_attr_t;
    164 
    165 static const cfaPthread_attr_t default_attrs{
     162} cfaPthread_attr_t;
     163
     164static const cfaPthread_attr_t default_attrs {
    166165        0,
    167166        0,
    168         (size_t)65000,
    169         (void *)NULL,
     167        65_000,
     168        NULL,
    170169        0,
    171170        0,
     
    173172};
    174173
    175 static cfaPthread_attr_t* get(const pthread_attr_t* attr){
    176         static_assert(sizeof(pthread_attr_t) >= sizeof(cfaPthread_attr_t),"sizeof(pthread_attr_t) < sizeof(cfaPthread_attr_t)");
    177         return (cfaPthread_attr_t*)attr;
     174static cfaPthread_attr_t * get(const pthread_attr_t * attr) {
     175        static_assert(sizeof(pthread_attr_t) >= sizeof(cfaPthread_attr_t), "sizeof(pthread_attr_t) < sizeof(cfaPthread_attr_t)");
     176        return (cfaPthread_attr_t *)attr;
    178177}
    179178
     
    190189
    191190        // pthreads return value
    192         void *joinval;
     191        void * joinval;
    193192
    194193        // pthread attributes
    195194        pthread_attr_t pthread_attr;
    196195
    197         void *(*start_routine)(void *);
    198         void *start_arg;
     196        void *(* start_routine)(void *);
     197        void * start_arg;
    199198
    200199        // thread local data
    201         pthread_values* pthreadData;
     200        pthread_values * pthreadData;
    202201
    203202        // flag used for tryjoin
     
    207206/* thread part routines */
    208207//  cfaPthread entry point
    209 void main(cfaPthread& _thread) with(_thread){
    210         joinval =  start_routine(start_arg);
     208void main(cfaPthread & _thread) with(_thread) {
     209        joinval = start_routine(start_arg);
    211210        isTerminated = true;
    212211}
    213212
    214 static cfaPthread *lookup( pthread_t p ){
    215         static_assert(sizeof(pthread_t) >= sizeof(cfaPthread*),"sizeof(pthread_t) < sizeof(cfaPthread*)");
    216         return (cfaPthread*)p;
    217 }
    218 
    219 static void pthread_deletespecific_( pthread_values* values )  { // see uMachContext::invokeTask
    220         pthread_values* value;
    221         pthread_keys* key;
     213static cfaPthread * lookup( pthread_t p ) {
     214        static_assert(sizeof(pthread_t) >= sizeof(cfaPthread *),"sizeof(pthread_t) < sizeof(cfaPthread *)");
     215        return (cfaPthread *)p;
     216}
     217
     218static void pthread_deletespecific_( pthread_values * values )  { // see uMachContext::invokeTask
     219        pthread_values * value;
     220        pthread_keys * key;
    222221        bool destcalled = true;
    223         if (values != NULL){
     222        if (values != NULL) {
    224223                for ( int attempts = 0; attempts < PTHREAD_DESTRUCTOR_ITERATIONS && destcalled ; attempts += 1 ) {
    225224                        destcalled = false;
    226225                        lock(key_lock);
    227                         for (int i = 0; i < PTHREAD_KEYS_MAX; i++){
     226                        for ( int i = 0; i < PTHREAD_KEYS_MAX; i++ ) {
    228227                                // for each valid key
    229                                 if ( values[i].in_use){
     228                                if ( values[i].in_use) {
    230229                                        value = &values[i];
    231230                                        key = &cfa_pthread_keys[i];
     
    234233                                        // if  a  key  value  has  a  non-NULL  destructor pointer,  and  the  thread  has  a  non-NULL  value associated with that key,
    235234                                        // the value of the key is set to NULL, and then the function pointed to is called with the previously associated value as its sole argument.
    236                                         if (value->value != NULL && key->destructor != NULL){
     235                                        if (value->value != NULL && key->destructor != NULL) {
    237236                                                unlock(key_lock);
    238237                                                key->destructor(value->value); // run destructor
     
    249248}
    250249
    251 static void ^?{}(cfaPthread & mutex t){
     250static void ^?{}(cfaPthread & mutex t) {
    252251        // delete pthread local storage
    253252        pthread_values * values = t.pthreadData;
     
    255254}
    256255
    257 static void ?{}(cfaPthread &t, pthread_t* _thread, const pthread_attr_t * _attr,void *(*start_routine)(void *), void * arg) {
    258         static_assert(sizeof(pthread_t) >= sizeof(cfaPthread*), "pthread_t too small to hold a pointer: sizeof(pthread_t) < sizeof(cfaPthread*)");
     256static void ?{}(cfaPthread & t, pthread_t * _thread, const pthread_attr_t * _attr,void *(* start_routine)(void *), void * arg) {
     257        static_assert(sizeof(pthread_t) >= sizeof(cfaPthread *), "pthread_t too small to hold a pointer: sizeof(pthread_t) < sizeof(cfaPthread *)");
    259258
    260259        // set up user thread stackSize
     
    278277        //######################### Pthread Attrs #########################
    279278
    280         int pthread_attr_init(pthread_attr_t *attr) libcfa_public __THROW {
    281                 cfaPthread_attr_t* _attr = get(attr);
     279        int pthread_attr_init(pthread_attr_t * attr) libcfa_public __THROW {
     280                cfaPthread_attr_t * _attr = get(attr);
    282281                ?{}(*_attr, default_attrs);
    283282                return 0;
    284283        }
    285         int pthread_attr_destroy(pthread_attr_t *attr) libcfa_public __THROW {
     284        int pthread_attr_destroy(pthread_attr_t * attr) libcfa_public __THROW {
    286285                ^?{}(*get(attr));
    287286                return 0;
    288287        }
    289288
    290         int pthread_attr_setscope( pthread_attr_t *attr, int contentionscope ) libcfa_public __THROW {
     289        int pthread_attr_setscope( pthread_attr_t * attr, int contentionscope ) libcfa_public __THROW {
    291290                get( attr )->contentionscope = contentionscope;
    292291                return 0;
    293292        } // pthread_attr_setscope
    294293
    295         int pthread_attr_getscope( const pthread_attr_t *attr, int *contentionscope ) libcfa_public __THROW {
     294        int pthread_attr_getscope( const pthread_attr_t * attr, int * contentionscope ) libcfa_public __THROW {
    296295                *contentionscope = get( attr )->contentionscope;
    297296                return 0;
    298297        } // pthread_attr_getscope
    299298
    300         int pthread_attr_setdetachstate( pthread_attr_t *attr, int detachstate ) libcfa_public __THROW {
     299        int pthread_attr_setdetachstate( pthread_attr_t * attr, int detachstate ) libcfa_public __THROW {
    301300                get( attr )->detachstate = detachstate;
    302301                return 0;
    303302        } // pthread_attr_setdetachstate
    304303
    305         int pthread_attr_getdetachstate( const pthread_attr_t *attr, int *detachstate ) libcfa_public __THROW {
     304        int pthread_attr_getdetachstate( const pthread_attr_t * attr, int * detachstate ) libcfa_public __THROW {
    306305                *detachstate = get( attr )->detachstate;
    307306                return 0;
    308307        } // pthread_attr_getdetachstate
    309308
    310         int pthread_attr_setstacksize( pthread_attr_t *attr, size_t stacksize ) libcfa_public __THROW {
     309        int pthread_attr_setstacksize( pthread_attr_t * attr, size_t stacksize ) libcfa_public __THROW {
    311310                get( attr )->stacksize = stacksize;
    312311                return 0;
    313312        } // pthread_attr_setstacksize
    314313
    315         int pthread_attr_getstacksize( const pthread_attr_t *attr, size_t *stacksize ) libcfa_public __THROW {
     314        int pthread_attr_getstacksize( const pthread_attr_t * attr, size_t * stacksize ) libcfa_public __THROW {
    316315                *stacksize = get( attr )->stacksize;
    317316                return 0;
     
    326325        } // pthread_attr_setguardsize
    327326
    328         int pthread_attr_setstackaddr( pthread_attr_t *attr, void *stackaddr ) libcfa_public __THROW {
     327        int pthread_attr_setstackaddr( pthread_attr_t * attr, void * stackaddr ) libcfa_public __THROW {
    329328                get( attr )->stackaddr = stackaddr;
    330329                return 0;
    331330        } // pthread_attr_setstackaddr
    332331
    333         int pthread_attr_getstackaddr( const pthread_attr_t *attr, void **stackaddr ) libcfa_public __THROW {
     332        int pthread_attr_getstackaddr( const pthread_attr_t * attr, void ** stackaddr ) libcfa_public __THROW {
    334333                *stackaddr = get( attr )->stackaddr;
    335334                return 0;
    336335        } // pthread_attr_getstackaddr
    337336
    338         int pthread_attr_setstack( pthread_attr_t *attr, void *stackaddr, size_t stacksize ) libcfa_public __THROW {
     337        int pthread_attr_setstack( pthread_attr_t * attr, void * stackaddr, size_t stacksize ) libcfa_public __THROW {
    339338                get( attr )->stackaddr = stackaddr;
    340339                get( attr )->stacksize = stacksize;
     
    342341        } // pthread_attr_setstack
    343342
    344         int pthread_attr_getstack( const pthread_attr_t *attr, void **stackaddr, size_t *stacksize ) libcfa_public __THROW {
     343        int pthread_attr_getstack( const pthread_attr_t * attr, void ** stackaddr, size_t * stacksize ) libcfa_public __THROW {
    345344                *stackaddr = get( attr )->stackaddr;
    346345                *stacksize = get( attr )->stacksize;
     
    351350        // already running thread threadID. It shall be called on unitialized attr
    352351        // and destroyed with pthread_attr_destroy when no longer needed.
    353         int pthread_getattr_np( pthread_t threadID, pthread_attr_t *attr ) libcfa_public __THROW { // GNU extension
     352        int pthread_getattr_np( pthread_t threadID, pthread_attr_t * attr ) libcfa_public __THROW { // GNU extension
    354353                check_nonnull(attr);
    355354
     
    363362        //######################### Threads #########################
    364363
    365         int pthread_create(pthread_t * _thread, const pthread_attr_t * attr, void *(*start_routine)(void *), void * arg) libcfa_public __THROW {
    366                 cfaPthread *t = alloc();
     364        int pthread_create(pthread_t * _thread, const pthread_attr_t * attr, void *(* start_routine)(void *), void * arg) libcfa_public __THROW {
     365                cfaPthread * t = alloc();
    367366                (*t){_thread, attr, start_routine, arg};
    368367                return 0;
    369368        }
    370369
    371 
    372         int pthread_join(pthread_t _thread, void **value_ptr) libcfa_public __THROW {
     370        int pthread_join(pthread_t _thread, void ** value_ptr) libcfa_public __THROW {
    373371                // if thread is invalid
    374372                if (_thread == NULL) return EINVAL;
     
    376374
    377375                // get user thr pointer
    378                 cfaPthread* p = lookup(_thread);
     376                cfaPthread * p = lookup(_thread);
    379377                try {
    380378                        join(*p);
     
    389387        }
    390388
    391         int pthread_tryjoin_np(pthread_t _thread, void **value_ptr) libcfa_public __THROW {
     389        int pthread_tryjoin_np(pthread_t _thread, void ** value_ptr) libcfa_public __THROW {
    392390                // if thread is invalid
    393391                if (_thread == NULL) return EINVAL;
    394392                if (_thread == pthread_self()) return EDEADLK;
    395393
    396                 cfaPthread* p = lookup(_thread);
     394                cfaPthread * p = lookup(_thread);
    397395
    398396                // thread not finished ?
     
    412410        void pthread_exit(void * status) libcfa_public __THROW {
    413411                pthread_t pid = pthread_self();
    414                 cfaPthread* _thread = (cfaPthread*)pid;
     412                cfaPthread * _thread = (cfaPthread *)pid;
    415413                _thread->joinval = status;  // set return value
    416414                _thread->isTerminated = 1;  // set terminated flag
     
    426424        //######################### Mutex #########################
    427425
    428         int pthread_mutex_init(pthread_mutex_t *_mutex, const pthread_mutexattr_t *attr) libcfa_public __THROW {
     426        int pthread_mutex_init(pthread_mutex_t *_mutex, const pthread_mutexattr_t * attr) libcfa_public __THROW {
    429427                check_nonnull(_mutex);
    430428                init(_mutex);
     
    435433        int pthread_mutex_destroy(pthread_mutex_t *_mutex) libcfa_public __THROW {
    436434                check_nonnull(_mutex);
    437                 simple_owner_lock* _lock = get(_mutex);
    438                 if (_lock->owner != NULL){
     435                simple_owner_lock * _lock = get(_mutex);
     436                if (_lock->owner != NULL) {
    439437                        return EBUSY;
    440438                }
     
    446444                check_nonnull(_mutex);
    447445                mutex_check(_mutex);
    448                 simple_owner_lock* _lock = get(_mutex);
     446                simple_owner_lock * _lock = get(_mutex);
    449447                lock(*_lock);
    450448                return 0;
     
    453451        int pthread_mutex_unlock(pthread_mutex_t *_mutex) libcfa_public __THROW {
    454452                check_nonnull(_mutex);
    455                 simple_owner_lock* _lock = get(_mutex);
    456                 if (_lock->owner != active_thread()){
     453                simple_owner_lock * _lock = get(_mutex);
     454                if (_lock->owner != active_thread()) {
    457455                        return EPERM;
    458456                } // current thread does not hold the mutex
     
    463461        int pthread_mutex_trylock(pthread_mutex_t *_mutex) libcfa_public __THROW {
    464462                check_nonnull(_mutex);
    465                 simple_owner_lock* _lock = get(_mutex);
    466                 if (_lock->owner != active_thread() && _lock->owner != NULL){
     463                simple_owner_lock * _lock = get(_mutex);
     464                if (_lock->owner != active_thread() && _lock->owner != NULL) {
    467465                        return EBUSY;
    468466                }   // if mutex is owned
     
    474472
    475473        /* conditional variable routines */
    476         int pthread_cond_init(pthread_cond_t *cond, const pthread_condattr_t *attr) libcfa_public __THROW {
     474        int pthread_cond_init(pthread_cond_t * cond, const pthread_condattr_t * attr) libcfa_public __THROW {
    477475                check_nonnull(cond);
    478476                init(cond);
     
    480478        }  //pthread_cond_init
    481479
    482         int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *_mutex) libcfa_public __THROW {
     480        int pthread_cond_wait(pthread_cond_t * cond, pthread_mutex_t *_mutex) libcfa_public __THROW {
    483481                check_nonnull(_mutex);
    484482                check_nonnull(cond);
     
    494492        } // pthread_cond_timedwait
    495493
    496         int pthread_cond_signal(pthread_cond_t *cond) libcfa_public __THROW {
     494        int pthread_cond_signal(pthread_cond_t * cond) libcfa_public __THROW {
    497495                check_nonnull(cond);
    498496                return notify_one(*get(cond));
    499497        } // pthread_cond_signal
    500498
    501         int pthread_cond_broadcast(pthread_cond_t *cond) libcfa_public __THROW {
     499        int pthread_cond_broadcast(pthread_cond_t * cond) libcfa_public __THROW {
    502500                check_nonnull(cond);
    503501                return notify_all(*get(cond));
    504502        } // pthread_cond_broadcast
    505503
    506         int pthread_cond_destroy(pthread_cond_t *cond) libcfa_public __THROW {
     504        int pthread_cond_destroy(pthread_cond_t * cond) libcfa_public __THROW {
    507505                check_nonnull(cond);
    508506                destroy(cond);
     
    514512        //######################### Local storage #########################
    515513
    516         int pthread_once(pthread_once_t *once_control, void (*init_routine)(void)) libcfa_public __THROW {
     514        int pthread_once(pthread_once_t * once_control, void (* init_routine)(void)) libcfa_public __THROW {
    517515                static_assert(sizeof(pthread_once_t) >= sizeof(int),"sizeof(pthread_once_t) < sizeof(int)");
    518516                check_nonnull(once_control);
     
    527525        } // pthread_once
    528526
    529         int pthread_key_create( pthread_key_t *key, void (*destructor)( void * ) ) libcfa_public __THROW {
     527        int pthread_key_create( pthread_key_t * key, void (* destructor)( void * ) ) libcfa_public __THROW {
    530528                lock(key_lock);
    531529                for ( int i = 0; i < PTHREAD_KEYS_MAX; i += 1 ) {
     
    562560        }   // pthread_key_delete
    563561
    564         int pthread_setspecific( pthread_key_t key, const void *value ) libcfa_public __THROW {
     562        int pthread_setspecific( pthread_key_t key, const void * value ) libcfa_public __THROW {
    565563                // get current thread
    566                 cfaPthread* t = lookup(pthread_self());
     564                cfaPthread * t = lookup(pthread_self());
    567565                // if current thread's pthreadData is NULL; initialize it
    568                 pthread_values* values;
    569                 if (t->pthreadData == NULL){
     566                pthread_values * values;
     567                if (t->pthreadData == NULL) {
    570568                        values = anew( PTHREAD_KEYS_MAX);
    571569                        t->pthreadData = values;
    572                         for (int i = 0;i < PTHREAD_KEYS_MAX; i++){
     570                        for ( int i = 0;i < PTHREAD_KEYS_MAX; i++ ) {
    573571                                t->pthreadData[i].in_use = false;
    574572                        }   // for
     
    593591        } //pthread_setspecific
    594592
    595         void* pthread_getspecific(pthread_key_t key) libcfa_public __THROW {
     593        void * pthread_getspecific(pthread_key_t key) libcfa_public __THROW {
    596594                if (key >= PTHREAD_KEYS_MAX || ! cfa_pthread_keys[key].in_use) return NULL;
    597595
    598596                // get current thread
    599                 cfaPthread* t = lookup(pthread_self());
     597                cfaPthread * t = lookup(pthread_self());
    600598                if (t->pthreadData == NULL) return NULL;
    601599                lock(key_lock);
    602                 pthread_values &entry = ((pthread_values *)t->pthreadData)[key];
     600                pthread_values & entry = ((pthread_values *)t->pthreadData)[key];
    603601                if ( ! entry.in_use ) {
    604602                        unlock( key_lock );
    605603                        return NULL;
    606604                } // if
    607                 void *value = entry.value;
     605                void * value = entry.value;
    608606                unlock(key_lock);
    609607
     
    875873        //######################### Parallelism #########################
    876874
    877         int pthread_setaffinity_np( pthread_t /* __th */, size_t /* __cpusetsize */, __const cpu_set_t * /* __cpuset */ ) libcfa_public __THROW {
    878                 abort( "pthread_setaffinity_np" );
    879         } // pthread_setaffinity_np
    880 
    881         int pthread_getaffinity_np( pthread_t /* __th */, size_t /* __cpusetsize */, cpu_set_t * /* __cpuset */ ) libcfa_public __THROW {
    882                 abort( "pthread_getaffinity_np" );
    883         } // pthread_getaffinity_np
    884 
    885         int pthread_attr_setaffinity_np( pthread_attr_t * /* __attr */, size_t /* __cpusetsize */, __const cpu_set_t * /* __cpuset */ ) libcfa_public __THROW {
    886                 abort( "pthread_attr_setaffinity_np" );
    887         } // pthread_attr_setaffinity_np
    888 
    889         int pthread_attr_getaffinity_np( __const pthread_attr_t * /* __attr */, size_t /* __cpusetsize */, cpu_set_t * /* __cpuset */ ) libcfa_public __THROW {
    890                 abort( "pthread_attr_getaffinity_np" );
    891         } // pthread_attr_getaffinity_np
     875        // int pthread_setaffinity_np( pthread_t /* __th */, size_t /* __cpusetsize */, __const cpu_set_t * /* __cpuset */ ) libcfa_public __THROW {
     876        //      abort( "pthread_setaffinity_np" );
     877        // } // pthread_setaffinity_np
     878
     879        // int pthread_getaffinity_np( pthread_t /* __th */, size_t /* __cpusetsize */, cpu_set_t * /* __cpuset */ ) libcfa_public __THROW {
     880        //      abort( "pthread_getaffinity_np" );
     881        // } // pthread_getaffinity_np
     882
     883        // int pthread_attr_setaffinity_np( pthread_attr_t * /* __attr */, size_t /* __cpusetsize */, __const cpu_set_t * /* __cpuset */ ) libcfa_public __THROW {
     884        //      abort( "pthread_attr_setaffinity_np" );
     885        // } // pthread_attr_setaffinity_np
     886
     887        // int pthread_attr_getaffinity_np( __const pthread_attr_t * /* __attr */, size_t /* __cpusetsize */, cpu_set_t * /* __cpuset */ ) libcfa_public __THROW {
     888        //      abort( "pthread_attr_getaffinity_np" );
     889        // } // pthread_attr_getaffinity_np
    892890
    893891        //######################### Cancellation #########################
     
    906904        } // pthread_cancel
    907905
    908         int pthread_setcancelstate( int state, int *oldstate ) libcfa_public __THROW {
     906        int pthread_setcancelstate( int state, int * oldstate ) libcfa_public __THROW {
    909907                abort("pthread_setcancelstate not implemented");
    910908                return 0;
    911909        } // pthread_setcancelstate
    912910
    913         int pthread_setcanceltype( int type, int *oldtype ) libcfa_public __THROW {
     911        int pthread_setcanceltype( int type, int * oldtype ) libcfa_public __THROW {
    914912                abort("pthread_setcanceltype not implemented");
    915913                return 0;
     
    918916
    919917#pragma GCC diagnostic pop
    920 
  • libcfa/src/concurrency/ready_queue.cfa

    r2ed94a9 rb110bcc  
    1515
    1616#define __cforall_thread__
    17 #define _GNU_SOURCE
    1817
    1918// #define __CFA_DEBUG_PRINT_READY_QUEUE__
  • libcfa/src/concurrency/select.hfa

    r2ed94a9 rb110bcc  
     1#pragma once
     2
    13#include "containers/list.hfa"
    24#include <stdint.h>
  • libcfa/src/concurrency/thread.cfa

    r2ed94a9 rb110bcc  
    1010// Created On       : Tue Jan 17 12:27:26 2017
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Sun Dec 11 20:56:54 2022
    13 // Update Count     : 102
     12// Last Modified On : Mon Jan  9 08:42:33 2023
     13// Update Count     : 103
    1414//
    1515
    1616#define __cforall_thread__
    17 #define _GNU_SOURCE
    1817
    1918#include "thread.hfa"
  • libcfa/src/containers/array.hfa

    r2ed94a9 rb110bcc  
    99
    1010
    11 //
    12 // Single-dim array sruct (with explicit packing and atom)
    13 //
    14 
     11//
     12// The `array` macro is the public interface.
     13// It computes the type of a dense (trivially strided) array.
     14// All user-declared objects are dense arrays.
     15//
     16// The `arpk` (ARray with PacKing info explicit) type is, generally, a slice with _any_ striding.
     17// This type is meant for internal use.
     18// CFA programmers should not instantiate it directly, nor access its field.
     19// CFA programmers should call ?[?] on it.
     20// Yet user-given `array(stuff)` expands to `arpk(stuff')`.
     21// The comments here explain the resulting internals.
     22//
     23// Just as a plain-C "multidimesional" array is really array-of-array-of-...,
     24// so does arpk generally show up as arpk-of-arpk-of...
     25//
     26// In the example of `array(float, 3, 4, 5) a;`,
     27// `typeof(a)` is an `arpk` instantiation.
     28// These comments explain _its_ arguments, i.e. those of the topmost `arpk` level.
     29//
     30// [N]    : the number of elements in `a`; 3 in the example
     31// S      : carries the stride size (distance in bytes between &myA[0] and &myA[1]), in sizeof(S);
     32//          same as Timmed when striding is trivial, same as Timmed in the example
     33// Timmed : (T-immediate) the inner type; conceptually, `typeof(a)` is "arpk of Timmed";
     34//          array(float, 4, 5) in the example
     35// Tbase  : (T-base) the deepest element type that is not arpk; float in the example
     36//
    1537forall( [N], S & | sized(S), Timmed &, Tbase & ) {
     38
     39    //
     40    // Single-dim array sruct (with explicit packing and atom)
     41    //
    1642    struct arpk {
    1743        S strides[N];
  • libcfa/src/containers/list.hfa

    r2ed94a9 rb110bcc  
    3232static inline tytagref(void, T) ?`inner ( T & this ) { tytagref( void, T ) ret = {this}; return ret; }
    3333
    34 // use this on every case of plan-9 inheritance, to make embedded a closure of plan-9 inheritance
    35 #define P9_EMBEDDED( derived, immedBase ) \
    36 forall( Tbase &, TdiscardPath & | { tytagref( TdiscardPath, Tbase ) ?`inner( immedBase & ); } ) \
    37     static inline tytagref(immedBase, Tbase) ?`inner( derived & this ) { \
     34
     35//
     36// P9_EMBEDDED: Use on every case of plan-9 inheritance, to make "implements embedded" be a closure of plan-9 inheritance.
     37//
     38// struct foo {
     39//    int a, b, c;
     40//    inline (bar);
     41// };
     42// P9_EMBEDDED( foo, bar )
     43//
     44
     45// usual version, for structs that are top-level declarations
     46#define P9_EMBEDDED(        derived, immedBase ) P9_EMBEDDED_DECL_( derived, immedBase, static ) P9_EMBEDDED_BDY_( immedBase )
     47
     48// special version, for structs that are declared in functions
     49#define P9_EMBEDDED_INFUNC( derived, immedBase ) P9_EMBEDDED_DECL_( derived, immedBase,        ) P9_EMBEDDED_BDY_( immedBase )
     50
     51// forward declarations of both the above; generally not needed
     52// may help you control where the P9_EMBEEDED cruft goes, in case "right after the stuct" isn't where you want it
     53#define P9_EMBEDDED_FWD(        derived, immedBase )      P9_EMBEDDED_DECL_( derived, immedBase, static ) ;
     54#define P9_EMBEDDED_FWD_INFUNC( derived, immedBase ) auto P9_EMBEDDED_DECL_( derived, immedBase,        ) ;
     55
     56// private helpers
     57#define P9_EMBEDDED_DECL_( derived, immedBase, STORAGE ) \
     58    forall( Tbase &, TdiscardPath & | { tytagref( TdiscardPath, Tbase ) ?`inner( immedBase & ); } ) \
     59    STORAGE inline tytagref(immedBase, Tbase) ?`inner( derived & this )
     60   
     61#define P9_EMBEDDED_BDY_( immedBase ) { \
    3862        immedBase & ib = this; \
    3963        Tbase & b = ib`inner; \
  • libcfa/src/containers/vector2.hfa

    r2ed94a9 rb110bcc  
    99// Author           : Michael Brooks
    1010// Created On       : Thu Jun 23 22:00:00 2021
    11 // Last Modified By : Michael Brooks
    12 // Last Modified On : Thu Jun 23 22:00:00 2021
    13 // Update Count     : 1
    14 //
     11// Last Modified By : Peter A. Buhr
     12// Last Modified On : Tue Mar 14 08:40:53 2023
     13// Update Count     : 2
     14//
     15
     16#pragma once
    1517
    1618#include <stdlib.hfa>
  • libcfa/src/interpose.cfa

    r2ed94a9 rb110bcc  
    1010// Created On       : Wed Mar 29 16:10:31 2017
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Thu Jan  5 22:23:57 2023
    13 // Update Count     : 180
    14 //
    15 
    16 #include <stdarg.h>                                                                             // va_start, va_end
     12// Last Modified On : Mon Mar 27 21:09:03 2023
     13// Update Count     : 196
     14//
     15
    1716#include <stdio.h>
    18 #include <string.h>                                                                             // strlen
    1917#include <unistd.h>                                                                             // _exit, getpid
    20 #include <signal.h>
    2118extern "C" {
    2219#include <dlfcn.h>                                                                              // dlopen, dlsym
     
    2421}
    2522
    26 #include "bits/debug.hfa"
    2723#include "bits/defs.hfa"
    2824#include "bits/signal.hfa"                                                              // sigHandler_?
     
    4036
    4137typedef void (* generic_fptr_t)(void);
     38
    4239static generic_fptr_t do_interpose_symbol( void * library, const char symbol[], const char version[] ) {
    43         const char * error;
    44 
    4540        union { generic_fptr_t fptr; void * ptr; } originalFunc;
    4641
    4742        #if defined( _GNU_SOURCE )
    48                 if ( version ) {
    49                         originalFunc.ptr = dlvsym( library, symbol, version );
    50                 } else {
    51                         originalFunc.ptr = dlsym( library, symbol );
    52                 }
     43        if ( version ) {
     44                originalFunc.ptr = dlvsym( library, symbol, version );
     45        } else {
     46                originalFunc.ptr = dlsym( library, symbol );
     47        } // if
    5348        #else
    54                 originalFunc.ptr = dlsym( library, symbol );
     49        originalFunc.ptr = dlsym( library, symbol );
    5550        #endif // _GNU_SOURCE
    5651
    57         error = dlerror();
    58         if ( error ) abort( "interpose_symbol : internal error, %s\n", error );
    59 
     52        if ( ! originalFunc.ptr ) {                                                     // == nullptr
     53                abort( "interpose_symbol : internal error, %s\n", dlerror() );
     54        } // if
    6055        return originalFunc.fptr;
    6156}
    6257
    6358static generic_fptr_t interpose_symbol( const char symbol[], const char version[] ) {
    64         const char * error;
    65 
    66         static void * library;
    67         static void * pthread_library;
    68         if ( ! library ) {
    69                 #if defined( RTLD_NEXT )
    70                         library = RTLD_NEXT;
    71                 #else
    72                         // missing RTLD_NEXT => must hard-code library name, assuming libstdc++
    73                         library = dlopen( "libc.so.6", RTLD_LAZY );
    74                         error = dlerror();
    75                         if ( error ) {
    76                                 abort( "interpose_symbol : failed to open libc, %s\n", error );
    77                         }
    78                 #endif
     59        void * library;
     60
     61        #if defined( RTLD_NEXT )
     62        library = RTLD_NEXT;
     63        #else
     64        // missing RTLD_NEXT => must hard-code library name, assuming libstdc++
     65        library = dlopen( "libc.so.6", RTLD_LAZY );
     66        if ( ! library ) {                                                                      // == nullptr
     67                abort( "interpose_symbol : failed to open libc, %s\n", dlerror() );
    7968        } // if
    80         if ( ! pthread_library ) {
    81                 #if defined( RTLD_NEXT )
    82                         pthread_library = RTLD_NEXT;
    83                 #else
    84                         // missing RTLD_NEXT => must hard-code library name, assuming libstdc++
    85                         pthread_library = dlopen( "libpthread.so", RTLD_LAZY );
    86                         error = dlerror();
    87                         if ( error ) {
    88                                 abort( "interpose_symbol : failed to open libpthread, %s\n", error );
    89                         }
    90                 #endif
    91         } // if
    92 
    93         return do_interpose_symbol(library, symbol, version);
     69        #endif // RTLD_NEXT
     70
     71        return do_interpose_symbol( library, symbol, version );
    9472}
    9573
     
    12199                preload_libgcc();
    122100
    123 #pragma GCC diagnostic push
    124 #pragma GCC diagnostic ignored "-Wdiscarded-qualifiers"
     101                #pragma GCC diagnostic push
     102                #pragma GCC diagnostic ignored "-Wdiscarded-qualifiers"
    125103                INTERPOSE_LIBC( abort, version );
    126104                INTERPOSE_LIBC( exit , version );
    127 #pragma GCC diagnostic pop
     105                #pragma GCC diagnostic pop
    128106
    129107                if(__cfathreadabi_interpose_startup) __cfathreadabi_interpose_startup( do_interpose_symbol );
     108
     109                // SKULLDUGGERY: In Ubuntu 22.04, someone augmented signal.h to allow SIGSTKSZ to be "sysconf(_SC_SIGSTKSZ)" in
     110                // sigstksz.h, as well as 8192 in sigstack.h. HOWEVER, they forgot to provide a mechanism to tell signal.h to
     111                // use sigstack.h rather than sigstksz.h. (I'm not happy.) By undefining _GNU_SOURCE before signal.h and
     112                // redefining it afterwards, you can get 8192, but then nothing works correctly inside of signal.h without
     113                // _GNU_SOURCE defined.  So what is needed is a way to get signal.h to use sigstack.h WITH _GNU_SOURCE defined.
     114                // Basically something is wrong with features.h and its use in signal.h.
     115
     116                #undef SIGSTKSZ
     117                #define SIGSTKSZ 8192
    130118
    131119                // As a precaution (and necessity), errors that result in termination are delivered on a separate stack because
     
    293281        va_start( args, fmt );
    294282        __abort( false, fmt, args );
    295     // CONTROL NEVER REACHES HERE!
     283        // CONTROL NEVER REACHES HERE!
    296284        va_end( args );
    297285}
    298286
    299287void abort( bool signalAbort, const char fmt[], ... ) {
    300     va_list args;
    301     va_start( args, fmt );
    302     __abort( signalAbort, fmt, args );
    303     // CONTROL NEVER REACHES HERE!
    304     va_end( args );
     288        va_list args;
     289        va_start( args, fmt );
     290        __abort( signalAbort, fmt, args );
     291        // CONTROL NEVER REACHES HERE!
     292        va_end( args );
    305293}
    306294
  • libcfa/src/interpose_thread.cfa

    r2ed94a9 rb110bcc  
    1414//
    1515
    16 #include <stdarg.h>                                                                             // va_start, va_end
    17 #include <stdio.h>
    18 #include <string.h>                                                                             // strlen
     16// BUG in 32-bit gcc with interpose: fixed in >= gcc-9.5, gcc-10.4, gcc-12.2
     17#ifdef __i386__                                                                                 // 32-bit architecture
     18#undef _GNU_SOURCE
     19#endif // __i386__
     20
    1921#include <signal.h>
    2022#include <pthread.h>
     23#include <signal.h>
    2124extern "C" {
    2225#include <dlfcn.h>                                                                              // dlopen, dlsym
    23 #include <execinfo.h>                                                                   // backtrace, messages
    2426}
    2527
    26 #include "bits/debug.hfa"
    2728#include "bits/defs.hfa"
    28 #include <assert.h>
    2929
    3030//=============================================================================================
     
    3434typedef void (* generic_fptr_t)(void);
    3535
    36 generic_fptr_t interpose_symbol(
     36generic_fptr_t libcfa_public interpose_symbol(
    3737        generic_fptr_t (*do_interpose_symbol)( void * library, const char symbol[], const char version[] ),
    3838        const char symbol[],
    3939        const char version[]
    40 ) libcfa_public {
    41         const char * error;
     40) {
     41        void * library;
    4242
    43         static void * library;
    44         if ( ! library ) {
    45                 #if defined( RTLD_NEXT )
    46                         library = RTLD_NEXT;
    47                 #else
    48                         // missing RTLD_NEXT => must hard-code library name, assuming libstdc++
    49                         library = dlopen( "libpthread.so", RTLD_LAZY );
    50                         error = dlerror();
    51                         if ( error ) {
    52                                 abort( "interpose_symbol : failed to open libpthread, %s\n", error );
    53                         }
    54                 #endif
     43        #if defined( RTLD_NEXT )
     44        library = RTLD_NEXT;
     45        #else
     46        // missing RTLD_NEXT => must hard-code library name, assuming libstdc++
     47        library = dlopen( "libpthread.so", RTLD_LAZY );
     48        if ( ! library ) {                                                                      // == nullptr
     49                abort( "interpose_symbol : failed to open libpthread, %s\n", dlerror() );
    5550        } // if
     51        #endif // RTLD_NEXT
    5652
    57         return do_interpose_symbol(library, symbol, version);
     53        return do_interpose_symbol( library, symbol, version );
    5854}
    5955
     
    8379#pragma GCC diagnostic push
    8480#pragma GCC diagnostic ignored "-Wdiscarded-qualifiers"
    85                 INTERPOSE( pthread_create , version );
    86                 INTERPOSE( pthread_join , version );
    87                 INTERPOSE( pthread_self , version );
    88                 INTERPOSE( pthread_attr_init , version );
    89                 INTERPOSE( pthread_attr_destroy , version );
    90                 INTERPOSE( pthread_attr_setstack , version );
    91                 INTERPOSE( pthread_attr_getstacksize , version );
    92                 INTERPOSE( pthread_sigmask , version );
    93                 INTERPOSE( pthread_sigqueue , version );
    94                 INTERPOSE( pthread_once , version );
     81                INTERPOSE( pthread_create, version );
     82                INTERPOSE( pthread_join, version );
     83                INTERPOSE( pthread_self, version );
     84                INTERPOSE( pthread_attr_init, version );
     85                INTERPOSE( pthread_attr_destroy, version );
     86                INTERPOSE( pthread_attr_setstack, version );
     87                INTERPOSE( pthread_attr_getstacksize, version );
     88                INTERPOSE( pthread_sigmask, version );
     89                INTERPOSE( pthread_sigqueue, version );
     90                INTERPOSE( pthread_once, version );
    9591#pragma GCC diagnostic pop
    9692        }
  • libcfa/src/iostream.cfa

    r2ed94a9 rb110bcc  
    1010// Created On       : Wed May 27 17:56:53 2015
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Sat Aug 27 15:04:15 2022
    13 // Update Count     : 1358
     12// Last Modified On : Mon Jan  9 09:27:58 2023
     13// Update Count     : 1361
    1414//
    1515
     
    667667                        } /* if */ \
    668668                        if ( ! f.flags.nobsdp || (exp10 < SUFFIXES_START) || (exp10 > SUFFIXES_END) ) { \
    669                                 len2 = snprintf( &buf[len], size - len, "e%d", exp10 ); \
     669                                len2 = snprintf( &buf[len], size - len, "e%d", (int)exp10 /* ambiguity with function exp10 */ ); \
    670670                        } else { \
    671671                                len2 = snprintf( &buf[len], size - len, "%s", suffixes[(exp10 - SUFFIXES_START) / 3] ); \
  • libcfa/src/limits.cfa

    r2ed94a9 rb110bcc  
    1010// Created On       : Wed Apr  6 18:06:52 2016
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Thu Jan  5 22:27:40 2023
    13 // Update Count     : 84
     12// Last Modified On : Fri Feb 17 12:25:39 2023
     13// Update Count     : 87
    1414//
    1515
    16 #define _GNU_SOURCE                                                                             // access long double M_*l in math.h
    1716#include <limits.h>
    1817#include <float.h>
  • libcfa/src/stdlib.cfa

    r2ed94a9 rb110bcc  
    1010// Created On       : Thu Jan 28 17:10:29 2016
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Fri Dec  9 15:11:30 2022
    13 // Update Count     : 631
     12// Last Modified On : Thu Feb 16 16:31:34 2023
     13// Update Count     : 633
    1414//
    1515
     
    2020//---------------------------------------
    2121
    22 #define _XOPEN_SOURCE 600                                                               // posix_memalign, *rand48
    2322#include <string.h>                                                                             // memcpy, memset
    2423//#include <math.h>                                                                             // fabsf, fabs, fabsl
  • libcfa/src/vec/vec.hfa

    r2ed94a9 rb110bcc  
    1818#include <math.hfa>
    1919
    20 trait fromint(T) {
     20forall(T)
     21trait fromint {
    2122    void ?{}(T&, int);
    2223};
    23 trait zeroinit(T) {
     24forall(T)
     25trait zeroinit {
    2426    void ?{}(T&, zero_t);
    2527};
    26 trait zero_assign(T) {
     28forall(T)
     29trait zero_assign {
    2730    T ?=?(T&, zero_t);
    2831};
    29 trait subtract(T) {
     32forall(T)
     33trait subtract {
    3034    T ?-?(T, T);
    3135};
    32 trait negate(T) {
     36forall(T)
     37trait negate {
    3338    T -?(T);
    3439};
    35 trait add(T) {
     40forall(T)
     41trait add {
    3642    T ?+?(T, T);
    3743};
    38 trait multiply(T) {
     44forall(T)
     45trait multiply {
    3946    T ?*?(T, T);
    4047};
    41 trait divide(T) {
     48forall(T)
     49trait divide {
    4250    T ?/?(T, T);
    4351};
    44 trait lessthan(T) {
     52forall(T)
     53trait lessthan {
    4554    int ?<?(T, T);
    4655};
    47 trait equality(T) {
     56forall(T)
     57trait equality {
    4858    int ?==?(T, T);
    4959};
    50 trait sqrt(T) {
     60forall(T)
     61trait sqrt {
    5162    T sqrt(T);
    5263};
Note: See TracChangeset for help on using the changeset viewer.