Changeset 24d6572 for libcfa/src
- Timestamp:
- Jun 12, 2023, 2:45:32 PM (2 years ago)
- Branches:
- ast-experimental, master
- Children:
- 62d62db
- Parents:
- 34b4268 (diff), 251ce80 (diff)
Note: this is a merge changeset, the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.
- Location:
- libcfa/src
- Files:
- 6 added
- 52 edited
-
libcfa/src/Makefile.am
r34b4268 r24d6572 11 11 ## Created On : Sun May 31 08:54:01 2015 12 12 ## Last Modified By : Peter A. Buhr 13 ## Last Modified On : Fri Jul 16 16:00:40 202114 ## Update Count : 25 513 ## Last Modified On : Thu May 25 15:20:04 2023 14 ## Update Count : 259 15 15 ############################################################################### 16 16 … … 48 48 math.hfa \ 49 49 time_t.hfa \ 50 virtual_dtor.hfa \ 50 51 bits/algorithm.hfa \ 51 52 bits/align.hfa \ … … 58 59 bits/queue.hfa \ 59 60 bits/sequence.hfa \ 61 concurrency/atomic.hfa \ 60 62 concurrency/iofwd.hfa \ 61 63 concurrency/barrier.hfa \ … … 113 115 concurrency/once.hfa \ 114 116 concurrency/kernel/fwd.hfa \ 115 concurrency/mutex_stmt.hfa 117 concurrency/mutex_stmt.hfa \ 118 concurrency/channel.hfa \ 119 concurrency/actor.hfa 116 120 117 121 inst_thread_headers_src = \ … … 124 128 concurrency/monitor.hfa \ 125 129 concurrency/mutex.hfa \ 130 concurrency/select.hfa \ 126 131 concurrency/thread.hfa 127 132 -
libcfa/src/algorithms/range_iterator.hfa
r34b4268 r24d6572 9 9 // Author : Thierry Delisle 10 10 // Created On : Tue Nov 30 13:06:22 2021 11 // Last Modified By : 12 // Last Modified On : 13 // Update Count : 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Mon Mar 13 23:10:35 2023 13 // Update Count : 1 14 14 // 15 16 #pragma once 15 17 16 18 generator RangeIter { -
libcfa/src/bitmanip.hfa
r34b4268 r24d6572 11 11 // Created On : Sat Mar 14 18:12:27 2020 12 12 // Last Modified By : Peter A. Buhr 13 // Last Modified On : Sat Oct 8 08:28:15 202214 // Update Count : 14 213 // Last Modified On : Mon Jan 9 09:02:43 2023 14 // Update Count : 144 15 15 // 16 16 17 17 #pragma once 18 19 #include "bits/debug.hfa" // verify 18 20 19 21 // Reference: Bit Twiddling Hacks: http://graphics.stanford.edu/%7Eseander/bithacks.html#CountBitsSetNaive -
libcfa/src/bits/containers.hfa
r34b4268 r24d6572 10 10 // Created On : Tue Oct 31 16:38:50 2017 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Wed Jan 15 07:42:35 202013 // Update Count : 2 812 // Last Modified On : Thu Feb 2 11:33:08 2023 13 // Update Count : 29 14 14 15 15 #pragma once … … 69 69 70 70 #ifdef __cforall 71 trait is_node(T &) { 71 forall( T & ) 72 trait is_node { 72 73 T *& get_next( T & ); 73 74 }; -
libcfa/src/bits/random.hfa
r34b4268 r24d6572 10 10 // Created On : Fri Jan 14 07:18:11 2022 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Sun Dec 11 18:43:58 202213 // Update Count : 1 7112 // Last Modified On : Mon Mar 20 21:45:24 2023 13 // Update Count : 186 14 14 // 15 15 16 16 #pragma once 17 17 18 #include <stdint.h> 18 #include <stdint.h> // uintXX_t 19 19 20 20 #define GLUE2( x, y ) x##y … … 24 24 #ifdef __x86_64__ // 64-bit architecture 25 25 // 64-bit generators 26 #define LEHMER6426 //#define LEHMER64 27 27 //#define XORSHIFT_12_25_27 28 //#define XOSHIRO256PP28 #define XOSHIRO256PP 29 29 //#define KISS_64 30 // #define SPLITMIX_64 30 31 31 32 // 32-bit generators 32 #define XORSHIFT_6_21_7 33 //#define XOSHIRO128PP 33 //#define XORSHIFT_6_21_7 34 #define XOSHIRO128PP 35 // #define SPLITMIX_32 34 36 #else // 32-bit architecture 35 37 // 64-bit generators 36 #define XORSHIFT_13_7_17 38 //#define XORSHIFT_13_7_17 39 #define XOSHIRO256PP 40 // #define SPLITMIX_64 37 41 38 42 // 32-bit generators 39 #define XORSHIFT_6_21_7 43 //#define XORSHIFT_6_21_7 44 #define XOSHIRO128PP 45 // #define SPLITMIX_32 40 46 #endif // __x86_64__ 41 47 42 48 // Define C/CFA PRNG name and random-state. 43 44 // SKULLDUGGERY: typedefs name struct and typedef with the same name to deal with CFA typedef numbering problem.45 49 46 50 #ifdef XOSHIRO256PP 47 51 #define PRNG_NAME_64 xoshiro256pp 48 52 #define PRNG_STATE_64_T GLUE(PRNG_NAME_64,_t) 49 typedef struct PRNG_STATE_64_T { uint64_t s[4]; } PRNG_STATE_64_T;53 typedef struct { uint64_t s0, s1, s2, s3; } PRNG_STATE_64_T; 50 54 #endif // XOSHIRO256PP 51 55 … … 53 57 #define PRNG_NAME_32 xoshiro128pp 54 58 #define PRNG_STATE_32_T GLUE(PRNG_NAME_32,_t) 55 typedef struct PRNG_STATE_32_T { uint32_t s[4]; } PRNG_STATE_32_T;59 typedef struct { uint32_t s0, s1, s2, s3; } PRNG_STATE_32_T; 56 60 #endif // XOSHIRO128PP 57 61 … … 81 85 #endif // XORSHIFT_12_25_27 82 86 87 #ifdef SPLITMIX_64 88 #define PRNG_NAME_64 splitmix64 89 #define PRNG_STATE_64_T uint64_t 90 #endif // SPLITMIX32 91 92 #ifdef SPLITMIX_32 93 #define PRNG_NAME_32 splitmix32 94 #define PRNG_STATE_32_T uint32_t 95 #endif // SPLITMIX32 96 83 97 #ifdef KISS_64 84 98 #define PRNG_NAME_64 kiss_64 85 99 #define PRNG_STATE_64_T GLUE(PRNG_NAME_64,_t) 86 typedef struct PRNG_STATE_64_T{ uint64_t z, w, jsr, jcong; } PRNG_STATE_64_T;100 typedef struct { uint64_t z, w, jsr, jcong; } PRNG_STATE_64_T; 87 101 #endif // KISS_^64 88 102 … … 90 104 #define PRNG_NAME_32 xorwow 91 105 #define PRNG_STATE_32_T GLUE(PRNG_NAME_32,_t) 92 typedef struct PRNG_STATE_32_T{ uint32_t a, b, c, d, counter; } PRNG_STATE_32_T;106 typedef struct { uint32_t a, b, c, d, counter; } PRNG_STATE_32_T; 93 107 #endif // XOSHIRO128PP 94 108 … … 110 124 111 125 // ALL PRNG ALGORITHMS ARE OPTIMIZED SO THAT THE PRNG LOGIC CAN HAPPEN IN PARALLEL WITH THE USE OF THE RESULT. 112 // Therefore, the set_seed routine primes the PRNG by calling it with the state so the seed is not return as the 113 // first random value. 126 // Specifically, the current random state is copied for returning, before computing the next value. As a consequence, 127 // the set_seed routine primes the PRNG by calling it with the state so the seed is not return as the first random 128 // value. 
129 114 130 115 131 #ifdef __cforall // don't include in C code (invoke.h) 132 133 // https://rosettacode.org/wiki/Pseudo-random_numbers/Splitmix64 134 // 135 // Splitmix64 is not recommended for demanding random number requirements, but is often used to calculate initial states 136 // for other more complex pseudo-random number generators (see https://prng.di.unimi.it). 137 // Also https://rosettacode.org/wiki/Pseudo-random_numbers/Splitmix64. 138 static inline uint64_t splitmix64( uint64_t & state ) { 139 state += 0x9e3779b97f4a7c15; 140 uint64_t z = state; 141 z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9; 142 z = (z ^ (z >> 27)) * 0x94d049bb133111eb; 143 return z ^ (z >> 31); 144 } // splitmix64 145 146 static inline void splitmix64_set_seed( uint64_t & state , uint64_t seed ) { 147 state = seed; 148 splitmix64( state ); // prime 149 } // splitmix64_set_seed 150 151 // https://github.com/bryc/code/blob/master/jshash/PRNGs.md#splitmix32 152 // 153 // Splitmix32 is not recommended for demanding random number requirements, but is often used to calculate initial states 154 // for other more complex pseudo-random number generators (see https://prng.di.unimi.it). 155 156 static inline uint32_t splitmix32( uint32_t & state ) { 157 state += 0x9e3779b9; 158 uint64_t z = state; 159 z = (z ^ (z >> 15)) * 0x85ebca6b; 160 z = (z ^ (z >> 13)) * 0xc2b2ae35; 161 return z ^ (z >> 16); 162 } // splitmix32 163 164 static inline void splitmix32_set_seed( uint32_t & state, uint64_t seed ) { 165 state = seed; 166 splitmix32( state ); // prime 167 } // splitmix32_set_seed 168 169 #ifdef __SIZEOF_INT128__ 170 //-------------------------------------------------- 171 static inline uint64_t lehmer64( __uint128_t & state ) { 172 __uint128_t ret = state; 173 state *= 0x_da94_2042_e4dd_58b5; 174 return ret >> 64; 175 } // lehmer64 176 177 static inline void lehmer64_set_seed( __uint128_t & state, uint64_t seed ) { 178 // The seed needs to be coprime with the 2^64 modulus to get the largest period, so no factors of 2 in the seed. 179 state = splitmix64( seed ); // prime 180 } // lehmer64_set_seed 181 182 //-------------------------------------------------- 183 static inline uint64_t wyhash64( uint64_t & state ) { 184 uint64_t ret = state; 185 state += 0x_60be_e2be_e120_fc15; 186 __uint128_t tmp; 187 tmp = (__uint128_t) ret * 0x_a3b1_9535_4a39_b70d; 188 uint64_t m1 = (tmp >> 64) ^ tmp; 189 tmp = (__uint128_t)m1 * 0x_1b03_7387_12fa_d5c9; 190 uint64_t m2 = (tmp >> 64) ^ tmp; 191 return m2; 192 } // wyhash64 193 194 static inline void wyhash64_set_seed( uint64_t & state, uint64_t seed ) { 195 state = splitmix64( seed ); // prime 196 } // wyhash64_set_seed 197 #endif // __SIZEOF_INT128__ 116 198 117 199 // https://prng.di.unimi.it/xoshiro256starstar.c … … 126 208 127 209 #ifndef XOSHIRO256PP 128 typedef struct xoshiro256pp_t { uint64_t s[4]; } xoshiro256pp_t;210 typedef struct { uint64_t s0, s1, s2, s3; } xoshiro256pp_t; 129 211 #endif // ! 
XOSHIRO256PP 130 212 131 213 static inline uint64_t xoshiro256pp( xoshiro256pp_t & rs ) with(rs) { 132 inline uint64_t rotl( const uint64_t x, int k) {214 inline uint64_t rotl( const uint64_t x, int k ) { 133 215 return (x << k) | (x >> (64 - k)); 134 216 } // rotl 135 217 136 const uint64_t result = rotl( s [0] + s[3], 23 ) + s[0];137 const uint64_t t = s [1]<< 17;138 139 s [2] ^= s[0];140 s [3] ^= s[1];141 s [1] ^= s[2];142 s [0] ^= s[3];143 s [2]^= t;144 s [3] = rotl( s[3], 45 );218 const uint64_t result = rotl( s0 + s3, 23 ) + s0; 219 const uint64_t t = s1 << 17; 220 221 s2 ^= s0; 222 s3 ^= s1; 223 s1 ^= s2; 224 s0 ^= s3; 225 s2 ^= t; 226 s3 = rotl( s3, 45 ); 145 227 return result; 146 228 } // xoshiro256pp 147 229 148 static inline void xoshiro256pp_set_seed( xoshiro256pp_t & state, uint64_t seed ) { 149 state = (xoshiro256pp_t){ {seed, seed, seed, seed} }; 150 xoshiro256pp( state ); 230 static inline void xoshiro256pp_set_seed( xoshiro256pp_t & state, uint64_t seed ) { 231 // To attain repeatable seeding, compute seeds separately because the order of argument evaluation is undefined. 232 uint64_t seed1 = splitmix64( seed ); // prime 233 uint64_t seed2 = splitmix64( seed ); 234 uint64_t seed3 = splitmix64( seed ); 235 uint64_t seed4 = splitmix64( seed ); 236 state = (xoshiro256pp_t){ seed1, seed2, seed3, seed4 }; 151 237 } // xoshiro256pp_set_seed 152 238 … … 161 247 162 248 #ifndef XOSHIRO128PP 163 typedef struct xoshiro128pp_t { uint32_t s[4]; } xoshiro128pp_t;249 typedef struct { uint32_t s0, s1, s2, s3; } xoshiro128pp_t; 164 250 #endif // ! XOSHIRO128PP 165 251 … … 169 255 } // rotl 170 256 171 const uint32_t result = rotl( s [0] + s[3], 7 ) + s[0];172 const uint32_t t = s [1]<< 9;173 174 s [2] ^= s[0];175 s [3] ^= s[1];176 s [1] ^= s[2];177 s [0] ^= s[3];178 s [2]^= t;179 s [3] = rotl( s[3], 11 );257 const uint32_t result = rotl( s0 + s3, 7 ) + s0; 258 const uint32_t t = s1 << 9; 259 260 s2 ^= s0; 261 s3 ^= s1; 262 s1 ^= s2; 263 s0 ^= s3; 264 s2 ^= t; 265 s3 = rotl( s3, 11 ); 180 266 return result; 181 267 } // xoshiro128pp 182 268 183 269 static inline void xoshiro128pp_set_seed( xoshiro128pp_t & state, uint32_t seed ) { 184 state = (xoshiro128pp_t){ {seed, seed, seed, seed} }; 185 xoshiro128pp( state ); // prime 270 // To attain repeatable seeding, compute seeds separately because the order of argument evaluation is undefined. 271 uint32_t seed1 = splitmix32( seed ); // prime 272 uint32_t seed2 = splitmix32( seed ); 273 uint32_t seed3 = splitmix32( seed ); 274 uint32_t seed4 = splitmix32( seed ); 275 state = (xoshiro128pp_t){ seed1, seed2, seed3, seed4 }; 186 276 } // xoshiro128pp_set_seed 187 188 #ifdef __SIZEOF_INT128__189 // Pipelined to allow out-of-order overlap with reduced dependencies. 
Critically, the current random state is190 // returned (copied), and then compute and store the next random value.191 //--------------------------------------------------192 static inline uint64_t lehmer64( __uint128_t & state ) {193 __uint128_t ret = state;194 state *= 0xda942042e4dd58b5;195 return ret >> 64;196 } // lehmer64197 198 static inline void lehmer64_set_seed( __uint128_t & state, uint64_t seed ) {199 state = seed;200 lehmer64( state );201 } // lehmer64_set_seed202 203 //--------------------------------------------------204 static inline uint64_t wyhash64( uint64_t & state ) {205 uint64_t ret = state;206 state += 0x_60be_e2be_e120_fc15;207 __uint128_t tmp;208 tmp = (__uint128_t) ret * 0x_a3b1_9535_4a39_b70d;209 uint64_t m1 = (tmp >> 64) ^ tmp;210 tmp = (__uint128_t)m1 * 0x_1b03_7387_12fa_d5c9;211 uint64_t m2 = (tmp >> 64) ^ tmp;212 return m2;213 } // wyhash64214 215 static inline void wyhash64_set_seed( uint64_t & state, uint64_t seed ) {216 state = seed;217 wyhash64( state ); // prime218 } // wyhash64_set_seed219 #endif // __SIZEOF_INT128__220 277 221 278 //-------------------------------------------------- … … 229 286 230 287 static inline void xorshift_13_7_17_set_seed( uint64_t & state, uint64_t seed ) { 231 state = seed; 232 xorshift_13_7_17( state ); // prime 288 state = splitmix64( seed ); // prime 233 289 } // xorshift_13_7_17_set_seed 234 290 … … 247 303 248 304 static inline void xorshift_6_21_7_set_seed( uint32_t & state, uint32_t seed ) { 249 state = seed; 250 xorshift_6_21_7( state ); // prime 305 state = splitmix32( seed ); // prime 251 306 } // xorshift_6_21_7_set_seed 252 307 … … 262 317 263 318 static inline void xorshift_12_25_27_set_seed( uint64_t & state, uint64_t seed ) { 264 state = seed; 265 xorshift_12_25_27( state ); // prime 319 state = splitmix64( seed ); // prime 266 320 } // xorshift_12_25_27_set_seed 267 321 … … 269 323 // The state must be seeded with a nonzero value. 270 324 #ifndef KISS_64 271 typedef struct kiss_64_t{ uint64_t z, w, jsr, jcong; } kiss_64_t;325 typedef struct { uint64_t z, w, jsr, jcong; } kiss_64_t; 272 326 #endif // ! KISS_64 273 327 274 static inline uint64_t kiss_64( kiss_64_t & state ) with(state) {275 kiss_64_t ret = state;328 static inline uint64_t kiss_64( kiss_64_t & rs ) with(rs) { 329 kiss_64_t ret = rs; 276 330 z = 36969 * (z & 65535) + (z >> 16); 277 331 w = 18000 * (w & 65535) + (w >> 16); 278 jsr ^= (jsr << 17);279 332 jsr ^= (jsr << 13); 333 jsr ^= (jsr >> 17); 280 334 jsr ^= (jsr << 5); 281 335 jcong = 69069 * jcong + 1234567; … … 283 337 } // kiss_64 284 338 285 static inline void kiss_64_set_seed( kiss_64_t & state, uint64_t seed ) with(state) { 286 z = 1; w = 1; jsr = 4; jcong = seed; 287 kiss_64( state ); // prime 339 static inline void kiss_64_set_seed( kiss_64_t & rs, uint64_t seed ) with(rs) { 340 z = 1; w = 1; jsr = 4; jcong = splitmix64( seed ); // prime 288 341 } // kiss_64_set_seed 289 342 … … 291 344 // The state array must be initialized to non-zero in the first four words. 292 345 #ifndef XORWOW 293 typedef struct xorwow_t{ uint32_t a, b, c, d, counter; } xorwow_t;346 typedef struct { uint32_t a, b, c, d, counter; } xorwow_t; 294 347 #endif // ! XORWOW 295 348 296 static inline uint32_t xorwow( xorwow_t & state ) with(state) {349 static inline uint32_t xorwow( xorwow_t & rs ) with(rs) { 297 350 // Algorithm "xorwow" from p. 5 of Marsaglia, "Xorshift RNGs". 
298 351 uint32_t ret = a + counter; … … 312 365 } // xorwow 313 366 314 static inline void xorwow_set_seed( xorwow_t & state, uint32_t seed ) { 315 state = (xorwow_t){ seed, seed, seed, seed, 0 }; 316 xorwow( state ); // prime 367 static inline void xorwow_set_seed( xorwow_t & rs, uint32_t seed ) { 368 // To attain repeatable seeding, compute seeds separately because the order of argument evaluation is undefined. 369 uint32_t seed1 = splitmix32( seed ); // prime 370 uint32_t seed2 = splitmix32( seed ); 371 uint32_t seed3 = splitmix32( seed ); 372 uint32_t seed4 = splitmix32( seed ); 373 rs = (xorwow_t){ seed1, seed2, seed3, seed4, 0 }; 317 374 } // xorwow_set_seed 318 375 … … 320 377 // Used in __tls_rand_fwd 321 378 #define M (1_l64u << 48_l64u) 322 #define A (25 214903917_l64u)323 #define AI (18 446708753438544741_l64u)379 #define A (25_214_903_917_l64u) 380 #define AI (18_446_708_753_438_544_741_l64u) 324 381 #define C (11_l64u) 325 382 #define D (16_l64u) 326 383 327 384 // Bi-directional LCG random-number generator 328 static inline uint32_t LCGBI_fwd( uint64_t & state) {329 state = (A * state+ C) & (M - 1);330 return state>> D;385 static inline uint32_t LCGBI_fwd( uint64_t & rs ) { 386 rs = (A * rs + C) & (M - 1); 387 return rs >> D; 331 388 } // LCGBI_fwd 332 389 333 static inline uint32_t LCGBI_bck( uint64_t & state) {334 unsigned int r = state>> D;335 state = AI * (state- C) & (M - 1);390 static inline uint32_t LCGBI_bck( uint64_t & rs ) { 391 unsigned int r = rs >> D; 392 rs = AI * (rs - C) & (M - 1); 336 393 return r; 337 394 } // LCGBI_bck -
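
The seeding scheme adopted above primes every generator through splitmix64/splitmix32 instead of copying the raw seed into each state word, and computes each seed word in its own statement so the result never depends on argument-evaluation order. A minimal standalone C sketch of that approach, using only the algorithms quoted in this diff (not the CFA wrappers):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t splitmix64( uint64_t * state ) {
	uint64_t z = (*state += 0x9e3779b97f4a7c15);
	z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9;
	z = (z ^ (z >> 27)) * 0x94d049bb133111eb;
	return z ^ (z >> 31);
}

static uint64_t rotl( uint64_t x, int k ) { return (x << k) | (x >> (64 - k)); }

static uint64_t xoshiro256pp( uint64_t s[4] ) {			// algorithm as quoted in the diff
	uint64_t result = rotl( s[0] + s[3], 23 ) + s[0];
	uint64_t t = s[1] << 17;
	s[2] ^= s[0]; s[3] ^= s[1]; s[1] ^= s[2]; s[0] ^= s[3];
	s[2] ^= t; s[3] = rotl( s[3], 45 );
	return result;
}

int main( void ) {
	uint64_t seed = 42, s[4];
	for ( int i = 0; i < 4; i += 1 ) s[i] = splitmix64( &seed );		// prime each state word separately
	for ( int i = 0; i < 3; i += 1 ) printf( "%016" PRIx64 "\n", xoshiro256pp( s ) );
	return 0;
}
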
libcfa/src/bits/weakso_locks.cfa
r34b4268 r24d6572 15 15 // Update Count : 16 16 // 17 18 17 #include "bits/weakso_locks.hfa" 19 20 18 #pragma GCC visibility push(default) 21 19 … … 27 25 void unlock( blocking_lock & ) {} 28 26 void on_notify( blocking_lock &, struct thread$ * ) {} 29 size_t on_wait( blocking_lock & ) { return 0; }27 size_t on_wait( blocking_lock &, void (*pp_fn)( void * ), void * pp_datum ) { return 0; } 30 28 void on_wakeup( blocking_lock &, size_t ) {} 31 29 size_t wait_count( blocking_lock & ) { return 0; } 30 bool register_select( blocking_lock & this, select_node & node ) { return false; } 31 bool unregister_select( blocking_lock & this, select_node & node ) { return false; } 32 void on_selected( blocking_lock & this, select_node & node ) {} 33 -
libcfa/src/bits/weakso_locks.hfa
r34b4268 r24d6572 23 23 #include "containers/list.hfa" 24 24 25 struct thread$;25 struct select_node; 26 26 27 27 //----------------------------------------------------------------------------- … … 32 32 33 33 // List of blocked threads 34 dlist( thread$) blocked_threads;34 dlist( select_node ) blocked_threads; 35 35 36 36 // Count of current blocked threads … … 57 57 void unlock( blocking_lock & this ) OPTIONAL_THREAD; 58 58 void on_notify( blocking_lock & this, struct thread$ * t ) OPTIONAL_THREAD; 59 size_t on_wait( blocking_lock & this ) OPTIONAL_THREAD;59 size_t on_wait( blocking_lock & this, void (*pp_fn)( void * ), void * pp_datum ) OPTIONAL_THREAD; 60 60 void on_wakeup( blocking_lock & this, size_t ) OPTIONAL_THREAD; 61 61 size_t wait_count( blocking_lock & this ) OPTIONAL_THREAD; 62 bool register_select( blocking_lock & this, select_node & node ) OPTIONAL_THREAD; 63 bool unregister_select( blocking_lock & this, select_node & node ) OPTIONAL_THREAD; 64 void on_selected( blocking_lock & this, select_node & node ) OPTIONAL_THREAD; 62 65 63 66 //---------- … … 72 75 static inline bool try_lock ( multiple_acquisition_lock & this ) { return try_lock( (blocking_lock &)this ); } 73 76 static inline void unlock ( multiple_acquisition_lock & this ) { unlock ( (blocking_lock &)this ); } 74 static inline size_t on_wait ( multiple_acquisition_lock & this ) { return on_wait ( (blocking_lock &)this); }77 static inline size_t on_wait ( multiple_acquisition_lock & this, void (*pp_fn)( void * ), void * pp_datum ) { return on_wait ( (blocking_lock &)this, pp_fn, pp_datum ); } 75 78 static inline void on_wakeup( multiple_acquisition_lock & this, size_t v ) { on_wakeup ( (blocking_lock &)this, v ); } 76 79 static inline void on_notify( multiple_acquisition_lock & this, struct thread$ * t ){ on_notify( (blocking_lock &)this, t ); } 80 static inline bool register_select( multiple_acquisition_lock & this, select_node & node ) { return register_select( (blocking_lock &)this, node ); } 81 static inline bool unregister_select( multiple_acquisition_lock & this, select_node & node ) { return unregister_select( (blocking_lock &)this, node ); } 82 static inline void on_selected( multiple_acquisition_lock & this, select_node & node ) { on_selected( (blocking_lock &)this, node ); } -
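
The interface change above gives on_wait a pre-park hook (pp_fn/pp_datum) and adds the register_select/unregister_select/on_selected trio used by the waituntil machinery. The fragment below is only a hedged C illustration of the pre-park contract the new signature suggests; every name in it is invented for the sketch and none of it is the library's implementation.

#include <stddef.h>

struct wait_node { struct wait_node * next; int published; };

static void publish_node( void * datum ) {			// pp_fn: runs after the lock records the waiter,
	struct wait_node * n = datum;					// but before the calling thread blocks
	n->published = 1;
}

// Caller-side shape: hand the hook and its datum to on_wait, then block.
static size_t wait_shape( size_t (*on_wait)( void (*)(void *), void * ), struct wait_node * node ) {
	size_t recursion = on_wait( publish_node, node );	// lock state handed off, hook has run
	// park();  -- the real code blocks here and is resumed by on_notify
	return recursion;									// restored later via on_wakeup
}
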
libcfa/src/common.hfa
r34b4268 r24d6572 32 32 } // extern "C" 33 33 static inline __attribute__((always_inline)) { 34 unsigned char abs( signed char v ) { return abs( (int)v ); }34 unsigned char abs( signed char v ) { return (int)abs( (int)v ); } 35 35 // use default C routine for int 36 36 unsigned long int abs( long int v ) { return labs( v ); } … … 70 70 unsigned int min( unsigned int v1, unsigned int v2 ) { return v1 < v2 ? v1 : v2; } 71 71 long int min( long int v1, long int v2 ) { return v1 < v2 ? v1 : v2; } 72 unsigned long int min( unsigned long int v1, unsigned int v2 ) { return v1 < v2 ? v1 : v2; }72 unsigned long int min( unsigned long int v1, unsigned long int v2 ) { return v1 < v2 ? v1 : v2; } 73 73 long long int min( long long int v1, long long int v2 ) { return v1 < v2 ? v1 : v2; } 74 unsigned long long int min( unsigned long long int v1, unsigned int v2 ) { return v1 < v2 ? v1 : v2; }74 unsigned long long int min( unsigned long long int v1, unsigned long long int v2 ) { return v1 < v2 ? v1 : v2; } 75 75 forall( T | { int ?<?( T, T ); } ) // generic 76 76 T min( T v1, T v2 ) { return v1 < v2 ? v1 : v2; } -
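
The min/max fix above corrects a second parameter that was declared unsigned int instead of unsigned long int. A small C illustration of why that mismatch matters, assuming an LP64 target where long is 64 bits:

#include <stdio.h>

static unsigned long min_bad( unsigned long v1, unsigned int v2 )  { return v1 < v2 ? v1 : v2; }	// old signature
static unsigned long min_ok ( unsigned long v1, unsigned long v2 ) { return v1 < v2 ? v1 : v2; }	// fixed signature

int main( void ) {
	unsigned long a = 5, b = 0x100000001UL;				// b > UINT_MAX
	printf( "mismatched: %lu\n", min_bad( a, b ) );		// b silently truncated to 1, prints 1
	printf( "fixed     : %lu\n", min_ok( a, b ) );		// prints 5
	return 0;
}
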
libcfa/src/concurrency/clib/cfathread.cfa
r34b4268 r24d6572 16 16 // #define EPOLL_FOR_SOCKETS 17 17 18 #include <string.h> 19 18 20 #include "fstream.hfa" 19 21 #include "locks.hfa" … … 23 25 #include "time.hfa" 24 26 #include "stdlib.hfa" 25 27 #include "iofwd.hfa" 26 28 #include "cfathread.h" 27 28 extern "C" {29 #include <string.h>30 #include <errno.h>31 }32 29 33 30 extern void ?{}(processor &, const char[], cluster &, thread$ *); 34 31 extern "C" { 35 extern void __cfactx_invoke_thread(void (*main)(void *), void * this); 36 extern int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags); 32 extern void __cfactx_invoke_thread(void (*main)(void *), void * this); 37 33 } 38 34 … … 439 435 // Mutex 440 436 struct cfathread_mutex { 441 linear_backoff_then_block_lock impl;437 exp_backoff_then_block_lock impl; 442 438 }; 443 439 int cfathread_mutex_init(cfathread_mutex_t *restrict mut, const cfathread_mutexattr_t *restrict) __attribute__((nonnull (1))) { *mut = new(); return 0; } … … 454 450 // Condition 455 451 struct cfathread_condition { 456 condition_variable( linear_backoff_then_block_lock) impl;452 condition_variable(exp_backoff_then_block_lock) impl; 457 453 }; 458 454 int cfathread_cond_init(cfathread_cond_t *restrict cond, const cfathread_condattr_t *restrict) __attribute__((nonnull (1))) { *cond = new(); return 0; } … … 472 468 } 473 469 474 #include <iofwd.hfa>475 476 470 extern "C" { 477 #include <unistd.h>478 #include <sys/types.h>479 #include <sys/socket.h>480 481 471 //-------------------- 482 472 // IO operations … … 488 478 , protocol); 489 479 } 490 int cfathread_bind(int socket, const struct sockaddr *address, socklen_t address_len) {480 int cfathread_bind(int socket, __CONST_SOCKADDR_ARG address, socklen_t address_len) { 491 481 return bind(socket, address, address_len); 492 482 } … … 496 486 } 497 487 498 int cfathread_accept(int socket, struct sockaddr *restrictaddress, socklen_t *restrict address_len) {488 int cfathread_accept(int socket, __SOCKADDR_ARG address, socklen_t *restrict address_len) { 499 489 #if defined(EPOLL_FOR_SOCKETS) 500 490 int ret; … … 513 503 } 514 504 515 int cfathread_connect(int socket, const struct sockaddr *address, socklen_t address_len) {505 int cfathread_connect(int socket, __CONST_SOCKADDR_ARG address, socklen_t address_len) { 516 506 #if defined(EPOLL_FOR_SOCKETS) 517 507 int ret; -
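
cfathread's mutex and condition now sit on exp_backoff_then_block_lock. The name describes a common pattern; the fragment below is only a generic portable-C sketch of that pattern (spin with a growing pause budget, then yield), not the CFA lock, which parks the thread on its cluster instead of yielding.

#include <sched.h>
#include <stdatomic.h>

static atomic_flag guard = ATOMIC_FLAG_INIT;

static void eb_lock( void ) {
	unsigned spins = 4;
	while ( atomic_flag_test_and_set_explicit( &guard, memory_order_acquire ) ) {
		for ( unsigned i = 0; i < spins; i += 1 )
			atomic_signal_fence( memory_order_seq_cst );	// keep the pause loop from being elided
		if ( spins < (1u << 12) ) spins <<= 1;				// exponential backoff while contended
		else sched_yield();									// "then block": give up the processor
	}
}

static void eb_unlock( void ) {
	atomic_flag_clear_explicit( &guard, memory_order_release );
}
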
libcfa/src/concurrency/clib/cfathread.h
r34b4268 r24d6572 9 9 // Author : Thierry Delisle 10 10 // Created On : Tue Sep 22 15:31:20 2020 11 // Last Modified By : 12 // Last Modified On : 13 // Update Count : 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Mon Mar 13 23:48:40 2023 13 // Update Count : 7 14 14 // 15 15 16 #pragma once 17 16 18 #if defined(__cforall) || defined(__cplusplus) 19 #include <unistd.h> 20 #include <errno.h> 21 #include <sys/socket.h> 22 17 23 extern "C" { 18 24 #endif 19 #include <asm/types.h>20 #include <errno.h>21 #include <unistd.h>22 23 24 25 //-------------------- 25 26 // Basic types … … 73 74 } cfathread_mutexattr_t; 74 75 typedef struct cfathread_mutex * cfathread_mutex_t; 75 int cfathread_mutex_init(cfathread_mutex_t * restrict mut, const cfathread_mutexattr_t *restrict attr) __attribute__((nonnull (1)));76 int cfathread_mutex_init(cfathread_mutex_t * restrict mut, const cfathread_mutexattr_t * restrict attr) __attribute__((nonnull (1))); 76 77 int cfathread_mutex_destroy(cfathread_mutex_t *mut) __attribute__((nonnull (1))); 77 78 int cfathread_mutex_lock(cfathread_mutex_t *mut) __attribute__((nonnull (1))); … … 91 92 //-------------------- 92 93 // IO operations 93 struct sockaddr;94 struct msghdr;95 94 int cfathread_socket(int domain, int type, int protocol); 96 int cfathread_bind(int socket, const struct sockaddr *address, socklen_t address_len);95 int cfathread_bind(int socket, __CONST_SOCKADDR_ARG address, socklen_t address_len); 97 96 int cfathread_listen(int socket, int backlog); 98 int cfathread_accept(int socket, struct sockaddr *restrict address, socklen_t *restrict address_len);99 int cfathread_connect(int socket, const struct sockaddr *address, socklen_t address_len);97 int cfathread_accept(int socket, __SOCKADDR_ARG address, socklen_t * restrict address_len); 98 int cfathread_connect(int socket, __CONST_SOCKADDR_ARG address, socklen_t address_len); 100 99 int cfathread_dup(int fildes); 101 100 int cfathread_close(int fildes); -
libcfa/src/concurrency/coroutine.cfa
r34b4268 r24d6572 10 10 // Created On : Mon Nov 28 12:27:26 2016 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : T ue Dec 15 12:06:04 202013 // Update Count : 2 312 // Last Modified On : Thu Feb 16 15:34:46 2023 13 // Update Count : 24 14 14 // 15 15 16 16 #define __cforall_thread__ 17 #define _GNU_SOURCE18 17 19 18 #include "coroutine.hfa" -
libcfa/src/concurrency/coroutine.hfa
r34b4268 r24d6572 10 10 // Created On : Mon Nov 28 12:27:26 2016 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Thu Jan 6 16:33:16 202213 // Update Count : 1 212 // Last Modified On : Thu Feb 2 11:31:42 2023 13 // Update Count : 13 14 14 // 15 15 … … 38 38 // Anything that implements this trait can be resumed. 39 39 // Anything that is resumed is a coroutine. 40 trait is_coroutine(T & | IS_RESUMPTION_EXCEPTION(CoroutineCancelled(T))) { 40 forall( T & | IS_RESUMPTION_EXCEPTION(CoroutineCancelled(T)) ) 41 trait is_coroutine { 41 42 void main(T & this); 42 43 coroutine$ * get_coroutine(T & this); -
libcfa/src/concurrency/future.hfa
r34b4268 r24d6572 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // io/types.hfa --8 // 9 // Author : Thierry Delisle & Peiran Hong 7 // concurrency/future.hfa -- 8 // 9 // Author : Thierry Delisle & Peiran Hong & Colby Parsons 10 10 // Created On : Wed Jan 06 17:33:18 2021 11 11 // Last Modified By : … … 18 18 #include "bits/locks.hfa" 19 19 #include "monitor.hfa" 20 20 #include "select.hfa" 21 #include "locks.hfa" 22 23 //---------------------------------------------------------------------------- 24 // future 25 // I don't use future_t here since I need to use a lock for this future 26 // since it supports multiple consumers 27 // future_t is lockfree and uses atomics which aren't needed given we use locks here 21 28 forall( T ) { 29 // enum { FUTURE_EMPTY = 0, FUTURE_FULFILLED = 1 }; // Enums seem to be broken so feel free to add this back afterwards 30 31 // temporary enum replacement 32 const int FUTURE_EMPTY = 0; 33 const int FUTURE_FULFILLED = 1; 34 22 35 struct future { 36 int state; 37 T result; 38 dlist( select_node ) waiters; 39 futex_mutex lock; 40 }; 41 42 struct future_node { 43 inline select_node; 44 T * my_result; 45 }; 46 47 static inline { 48 49 void ?{}( future_node(T) & this, thread$ * blocked_thread, T * my_result ) { 50 ((select_node &)this){ blocked_thread }; 51 this.my_result = my_result; 52 } 53 54 void ?{}( future(T) & this ) { 55 this.waiters{}; 56 this.state = FUTURE_EMPTY; 57 this.lock{}; 58 } 59 60 // Reset future back to original state 61 void reset( future(T) & this ) with(this) 62 { 63 lock( lock ); 64 if( ! waiters`isEmpty ) 65 abort("Attempting to reset a future with blocked waiters"); 66 state = FUTURE_EMPTY; 67 unlock( lock ); 68 } 69 70 // check if the future is available 71 // currently no mutual exclusion because I can't see when you need this call to be synchronous or protected 72 bool available( future(T) & this ) { return __atomic_load_n( &this.state, __ATOMIC_RELAXED ); } 73 74 75 // memcpy wrapper to help copy values 76 void copy_T( T & from, T & to ) { 77 memcpy((void *)&to, (void *)&from, sizeof(T)); 78 } 79 80 // internal helper to signal waiters off of the future 81 void _internal_flush( future(T) & this ) with(this) { 82 while( ! waiters`isEmpty ) { 83 if ( !__handle_waituntil_OR( waiters ) ) // handle special waituntil OR case 84 break; // if handle_OR returns false then waiters is empty so break 85 select_node &s = try_pop_front( waiters ); 86 87 if ( s.clause_status == 0p ) // poke in result so that woken threads do not need to reacquire any locks 88 copy_T( result, *(((future_node(T) &)s).my_result) ); 89 90 wake_one( waiters, s ); 91 } 92 } 93 94 // Fulfil the future, returns whether or not someone was unblocked 95 bool fulfil( future(T) & this, T val ) with(this) { 96 lock( lock ); 97 if( state != FUTURE_EMPTY ) 98 abort("Attempting to fulfil a future that has already been fulfilled"); 99 100 copy_T( val, result ); 101 102 bool ret_val = ! 
waiters`isEmpty; 103 state = FUTURE_FULFILLED; 104 _internal_flush( this ); 105 unlock( lock ); 106 return ret_val; 107 } 108 109 // Wait for the future to be fulfilled 110 // Also return whether the thread had to block or not 111 [T, bool] get( future(T) & this ) with( this ) { 112 lock( lock ); 113 T ret_val; 114 if( state == FUTURE_FULFILLED ) { 115 copy_T( result, ret_val ); 116 unlock( lock ); 117 return [ret_val, false]; 118 } 119 120 future_node(T) node = { active_thread(), &ret_val }; 121 insert_last( waiters, ((select_node &)node) ); 122 unlock( lock ); 123 park( ); 124 125 return [ret_val, true]; 126 } 127 128 // Wait for the future to be fulfilled 129 T get( future(T) & this ) { 130 [T, bool] tt; 131 tt = get(this); 132 return tt.0; 133 } 134 135 // Gets value if it is available and returns [ val, true ] 136 // otherwise returns [ default_val, false] 137 // will not block 138 [T, bool] try_get( future(T) & this ) with(this) { 139 lock( lock ); 140 T ret_val; 141 if( state == FUTURE_FULFILLED ) { 142 copy_T( result, ret_val ); 143 unlock( lock ); 144 return [ret_val, true]; 145 } 146 unlock( lock ); 147 148 return [ret_val, false]; 149 } 150 151 bool register_select( future(T) & this, select_node & s ) with(this) { 152 lock( lock ); 153 154 // check if we can complete operation. If so race to establish winner in special OR case 155 if ( !s.park_counter && state != FUTURE_EMPTY ) { 156 if ( !__make_select_node_available( s ) ) { // we didn't win the race so give up on registering 157 unlock( lock ); 158 return false; 159 } 160 } 161 162 // future not ready -> insert select node and return 163 if( state == FUTURE_EMPTY ) { 164 insert_last( waiters, s ); 165 unlock( lock ); 166 return false; 167 } 168 169 __make_select_node_available( s ); 170 unlock( lock ); 171 return true; 172 } 173 174 bool unregister_select( future(T) & this, select_node & s ) with(this) { 175 if ( ! 
s`isListed ) return false; 176 lock( lock ); 177 if ( s`isListed ) remove( s ); 178 unlock( lock ); 179 return false; 180 } 181 182 void on_selected( future(T) & this, select_node & node ) {} 183 } 184 } 185 186 //-------------------------------------------------------------------------------------------------------- 187 // These futures below do not support select statements so they may not have as many features as 'future' 188 // however the 'single_future' is cheap and cheerful and is most likely more performant than 'future' 189 // since it uses raw atomics and no locks 190 // 191 // As far as 'multi_future' goes I can't see many use cases as it will be less performant than 'future' 192 // since it is monitor based and also is not compatible with select statements 193 //-------------------------------------------------------------------------------------------------------- 194 195 forall( T ) { 196 struct single_future { 23 197 inline future_t; 24 198 T result; … … 27 201 static inline { 28 202 // Reset future back to original state 29 void reset( future(T) & this) { reset( (future_t&)this ); }203 void reset(single_future(T) & this) { reset( (future_t&)this ); } 30 204 31 205 // check if the future is available 32 bool available( future(T) & this ) { return available( (future_t&)this ); }206 bool available( single_future(T) & this ) { return available( (future_t&)this ); } 33 207 34 208 // Mark the future as abandoned, meaning it will be deleted by the server 35 209 // This doesn't work beause of the potential need for a destructor 36 void abandon( future(T) & this );210 void abandon( single_future(T) & this ); 37 211 38 212 // Fulfil the future, returns whether or not someone was unblocked 39 thread$ * fulfil( future(T) & this, T result ) {213 thread$ * fulfil( single_future(T) & this, T result ) { 40 214 this.result = result; 41 215 return fulfil( (future_t&)this ); … … 44 218 // Wait for the future to be fulfilled 45 219 // Also return whether the thread had to block or not 46 [T, bool] wait( future(T) & this ) {220 [T, bool] wait( single_future(T) & this ) { 47 221 bool r = wait( (future_t&)this ); 48 222 return [this.result, r]; … … 50 224 51 225 // Wait for the future to be fulfilled 52 T wait( future(T) & this ) {226 T wait( single_future(T) & this ) { 53 227 [T, bool] tt; 54 228 tt = wait(this); -
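
The new future is a locked, multi-consumer variant: one state word, a stored result, and a waiter list protected by a futex_mutex, plus register_select/unregister_select hooks for waituntil. The pthreads fragment below is an analogous plain-C sketch of just the fulfil/get protocol (broadcast instead of the select_node wake-up path); it is for illustration only, not the CFA code.

#include <pthread.h>
#include <stdbool.h>

typedef struct {
	pthread_mutex_t lock;			// protects state and result, like the future's futex_mutex
	pthread_cond_t  ready;
	bool            fulfilled;		// FUTURE_EMPTY / FUTURE_FULFILLED analogue
	int             result;
} simple_future;

static void future_init( simple_future * f ) {
	pthread_mutex_init( &f->lock, NULL );
	pthread_cond_init( &f->ready, NULL );
	f->fulfilled = false;
}

static void fulfil( simple_future * f, int val ) {	// the CFA version also reports whether a waiter was unblocked
	pthread_mutex_lock( &f->lock );
	f->result = val;
	f->fulfilled = true;
	pthread_cond_broadcast( &f->ready );			// wake every consumer (multi-consumer future)
	pthread_mutex_unlock( &f->lock );
}

static int get( simple_future * f ) {
	pthread_mutex_lock( &f->lock );
	while ( ! f->fulfilled ) pthread_cond_wait( &f->ready, &f->lock );
	int val = f->result;
	pthread_mutex_unlock( &f->lock );
	return val;
}
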
libcfa/src/concurrency/invoke.h
r34b4268 r24d6572 10 10 // Created On : Tue Jan 17 12:27:26 2016 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Tue Nov 29 20:42:21 2022 13 // Update Count : 56 14 // 12 // Last Modified On : Tue Mar 14 13:39:31 2023 13 // Update Count : 59 14 // 15 16 // No not use #pragma once was this file is included twice in some places. It has its own guard system. 15 17 16 18 #include "bits/containers.hfa" … … 215 217 struct __thread_user_link cltr_link; 216 218 217 // used to store state between clh lock/unlock218 volatile bool * clh_prev;219 220 // used to point to this thd's current clh node221 volatile bool * clh_node;222 223 219 struct processor * last_proc; 220 221 // ptr used during handover between blocking lists to allow for stack allocation of intrusive nodes 222 // main use case is wait-morphing to allow a different node to be used to block on condvar vs lock 223 void * link_node; 224 224 225 225 PRNG_STATE_T random_state; // fast random numbers -
libcfa/src/concurrency/io.cfa
r34b4268 r24d6572 15 15 16 16 #define __cforall_thread__ 17 #define _GNU_SOURCE18 17 19 18 #if defined(__CFA_DEBUG__) … … 85 84 static io_context$ * __ioarbiter_allocate( io_arbiter$ & this, __u32 idxs[], __u32 want ); 86 85 static void __ioarbiter_submit( io_context$ * , __u32 idxs[], __u32 have, bool lazy ); 87 static void __ioarbiter_flush ( io_context$ & );86 static void __ioarbiter_flush ( io_context$ &, bool kernel ); 88 87 static inline void __ioarbiter_notify( io_context$ & ctx ); 89 88 //============================================================================================= … … 94 93 extern void __kernel_unpark( thread$ * thrd, unpark_hint ); 95 94 95 static inline void __post(oneshot & this, bool kernel, unpark_hint hint) { 96 thread$ * t = post( this, false ); 97 if(kernel) __kernel_unpark( t, hint ); 98 else unpark( t, hint ); 99 } 100 101 // actual system call of io uring 102 // wrap so everything that needs to happen around it is always done 103 // i.e., stats, book keeping, sqe reclamation, etc. 96 104 static void ioring_syscsll( struct io_context$ & ctx, unsigned int min_comp, unsigned int flags ) { 97 105 __STATS__( true, io.calls.flush++; ) 98 106 int ret; 99 107 for() { 108 // do the system call in a loop, repeat on interrupts 100 109 ret = syscall( __NR_io_uring_enter, ctx.fd, ctx.sq.to_submit, min_comp, flags, (sigset_t *)0p, _NSIG / 8); 101 110 if( ret < 0 ) { … … 120 129 /* paranoid */ verify( ctx.sq.to_submit >= ret ); 121 130 122 ctx.sq.to_submit -= ret; 131 // keep track of how many still need submitting 132 __atomic_fetch_sub(&ctx.sq.to_submit, ret, __ATOMIC_SEQ_CST); 123 133 124 134 /* paranoid */ verify( ctx.sq.to_submit <= *ctx.sq.num ); … … 129 139 /* paranoid */ verify( ! __preemption_enabled() ); 130 140 141 // mark that there is no pending io left 131 142 __atomic_store_n(&ctx.proc->io.pending, false, __ATOMIC_RELAXED); 132 143 } 133 144 145 // try to acquire an io context for draining, helping means we never *need* to drain, we can always do it later 134 146 static bool try_acquire( io_context$ * ctx ) __attribute__((nonnull(1))) { 135 147 /* paranoid */ verify( ! __preemption_enabled() ); … … 138 150 139 151 { 152 // if there is nothing to drain there is no point in acquiring anything 140 153 const __u32 head = *ctx->cq.head; 141 154 const __u32 tail = *ctx->cq.tail; … … 144 157 } 145 158 146 // Drain the queue 147 if(!__atomic_try_acquire(&ctx->cq.lock)) { 159 // try a simple spinlock acquire, it's likely there are completions to drain 160 if(!__atomic_try_acquire(&ctx->cq.try_lock)) { 161 // some other processor already has it 148 162 __STATS__( false, io.calls.locked++; ) 149 163 return false; 150 164 } 151 165 166 // acquired!! 152 167 return true; 153 168 } 154 169 170 // actually drain the completion 155 171 static bool __cfa_do_drain( io_context$ * ctx, cluster * cltr ) __attribute__((nonnull(1, 2))) { 156 172 /* paranoid */ verify( ! __preemption_enabled() ); 157 173 /* paranoid */ verify( ready_schedule_islocked() ); 158 /* paranoid */ verify( ctx->cq.lock == true ); 159 174 /* paranoid */ verify( ctx->cq.try_lock == true ); 175 176 // get all the invariants and initial state 160 177 const __u32 mask = *ctx->cq.mask; 161 178 const __u32 num = *ctx->cq.num; … … 166 183 for() { 167 184 // re-read the head and tail in case it already changed. 
185 // count the difference between the two 168 186 const __u32 head = *ctx->cq.head; 169 187 const __u32 tail = *ctx->cq.tail; … … 171 189 __STATS__( false, io.calls.drain++; io.calls.completed += count; ) 172 190 191 // for everything between head and tail, drain it 173 192 for(i; count) { 174 193 unsigned idx = (head + i) & mask; … … 177 196 /* paranoid */ verify(&cqe); 178 197 198 // find the future in the completion 179 199 struct io_future_t * future = (struct io_future_t *)(uintptr_t)cqe.user_data; 180 200 // __cfadbg_print_safe( io, "Kernel I/O : Syscall completed : cqe %p, result %d for %p\n", &cqe, cqe.res, future ); 181 201 202 // don't directly fulfill the future, preemption is disabled so we need to use kernel_unpark 182 203 __kernel_unpark( fulfil( *future, cqe.res, false ), UNPARK_LOCAL ); 183 204 } 184 205 206 // update the timestamps accordingly 207 // keep a local copy so we can update the relaxed copy 185 208 ts_next = ctx->cq.ts = rdtscl(); 186 209 … … 190 213 ctx->proc->idle_wctx.drain_time = ts_next; 191 214 215 // we finished draining the completions... unless the ring buffer was full and there are more secret completions in the kernel. 192 216 if(likely(count < num)) break; 193 217 218 // the ring buffer was full, there could be more stuff in the kernel. 194 219 ioring_syscsll( *ctx, 0, IORING_ENTER_GETEVENTS); 195 220 } … … 199 224 /* paranoid */ verify( ! __preemption_enabled() ); 200 225 201 __atomic_unlock(&ctx->cq.lock); 202 226 // everything is drained, we can release the lock 227 __atomic_unlock(&ctx->cq.try_lock); 228 229 // update the relaxed timestamp 203 230 touch_tsc( cltr->sched.io.tscs, ctx->cq.id, ts_prev, ts_next, false ); 204 231 … … 206 233 } 207 234 235 // call from a processor to flush 236 // contains all the bookkeeping a proc must do, not just the barebones flushing logic 237 void __cfa_do_flush( io_context$ & ctx, bool kernel ) { 238 /* paranoid */ verify( ! 
__preemption_enabled() ); 239 240 // flush any external requests 241 ctx.sq.last_external = false; // clear the external bit, the arbiter will reset it if needed 242 __ioarbiter_flush( ctx, kernel ); 243 244 // if submitting must be submitted, do the system call 245 if(ctx.sq.to_submit != 0) { 246 ioring_syscsll(ctx, 0, 0); 247 } 248 } 249 250 // call from a processor to drain 251 // contains all the bookkeeping a proc must do, not just the barebones draining logic 208 252 bool __cfa_io_drain( struct processor * proc ) { 209 253 bool local = false; 210 254 bool remote = false; 211 255 256 // make sure no ones creates/destroys io contexts 212 257 ready_schedule_lock(); 213 258 … … 217 262 /* paranoid */ verify( ctx ); 218 263 264 // Help if needed 219 265 with(cltr->sched) { 220 266 const size_t ctxs_count = io.count; … … 230 276 const unsigned long long ctsc = rdtscl(); 231 277 278 // only help once every other time 279 // pick a target when not helping 232 280 if(proc->io.target == UINT_MAX) { 233 281 uint64_t chaos = __tls_rand(); 282 // choose who to help and whether to accept helping far processors 234 283 unsigned ext = chaos & 0xff; 235 284 unsigned other = (chaos >> 8) % (ctxs_count); 236 285 286 // if the processor is on the same cache line or is lucky ( 3 out of 256 odds ) help it 237 287 if(ext < 3 || __atomic_load_n(&caches[other / __shard_factor.io].id, __ATOMIC_RELAXED) == this_cache) { 238 288 proc->io.target = other; … … 240 290 } 241 291 else { 292 // a target was picked last time, help it 242 293 const unsigned target = proc->io.target; 243 294 /* paranoid */ verify( io.tscs[target].t.tv != ULLONG_MAX ); 295 // make sure the target hasn't stopped existing since last time 244 296 HELP: if(target < ctxs_count) { 297 // calculate it's age and how young it could be before we give up on helping 245 298 const __readyQ_avg_t cutoff = calc_cutoff(ctsc, ctx->cq.id, ctxs_count, io.data, io.tscs, __shard_factor.io, false); 246 299 const __readyQ_avg_t age = moving_average(ctsc, io.tscs[target].t.tv, io.tscs[target].t.ma, false); 247 300 __cfadbg_print_safe(io, "Kernel I/O: Help attempt on %u from %u, age %'llu vs cutoff %'llu, %s\n", target, ctx->cq.id, age, cutoff, age > cutoff ? "yes" : "no"); 301 // is the target older than the cutoff, recall 0 is oldest and bigger ints are younger 248 302 if(age <= cutoff) break HELP; 249 303 250 if(!try_acquire(io.data[target])) break HELP; 251 304 // attempt to help the submission side 305 __cfa_do_flush( *io.data[target], true ); 306 307 // attempt to help the completion side 308 if(!try_acquire(io.data[target])) break HELP; // already acquire no help needed 309 310 // actually help 252 311 if(!__cfa_do_drain( io.data[target], cltr )) break HELP; 253 312 313 // track we did help someone 254 314 remote = true; 255 315 __STATS__( true, io.calls.helped++; ) 256 316 } 317 318 // reset the target 257 319 proc->io.target = UINT_MAX; 258 320 } 259 321 } 260 261 322 262 323 // Drain the local queue … … 270 331 271 332 ready_schedule_unlock(); 333 334 // return true if some completion entry, local or remote, was drained 272 335 return local || remote; 273 336 } 274 337 338 339 340 // call from a processor to flush 341 // contains all the bookkeeping a proc must do, not just the barebones flushing logic 275 342 bool __cfa_io_flush( struct processor * proc ) { 276 343 /* paranoid */ verify( ! 
__preemption_enabled() ); … … 278 345 /* paranoid */ verify( proc->io.ctx ); 279 346 280 io_context$ & ctx = *proc->io.ctx; 281 282 __ioarbiter_flush( ctx ); 283 284 if(ctx.sq.to_submit != 0) { 285 ioring_syscsll(ctx, 0, 0); 286 287 } 288 347 __cfa_do_flush( *proc->io.ctx, false ); 348 349 // also drain since some stuff will immediately complete 289 350 return __cfa_io_drain( proc ); 290 351 } … … 393 454 //============================================================================================= 394 455 // submission 395 static inline void __submit_only( struct io_context$ * ctx, __u32 idxs[], __u32 have) { 456 // barebones logic to submit a group of sqes 457 static inline void __submit_only( struct io_context$ * ctx, __u32 idxs[], __u32 have, bool lock) { 458 if(!lock) 459 lock( ctx->ext_sq.lock __cfaabi_dbg_ctx2 ); 396 460 // We can proceed to the fast path 397 461 // Get the right objects … … 408 472 // Make the sqes visible to the submitter 409 473 __atomic_store_n(sq.kring.tail, tail + have, __ATOMIC_RELEASE); 410 sq.to_submit += have; 411 474 __atomic_fetch_add(&sq.to_submit, have, __ATOMIC_SEQ_CST); 475 476 // set the bit to mark things need to be flushed 412 477 __atomic_store_n(&ctx->proc->io.pending, true, __ATOMIC_RELAXED); 413 478 __atomic_store_n(&ctx->proc->io.dirty , true, __ATOMIC_RELAXED); 414 } 415 479 480 if(!lock) 481 unlock( ctx->ext_sq.lock ); 482 } 483 484 // submission logic + maybe flushing 416 485 static inline void __submit( struct io_context$ * ctx, __u32 idxs[], __u32 have, bool lazy) { 417 486 __sub_ring_t & sq = ctx->sq; 418 __submit_only(ctx, idxs, have );487 __submit_only(ctx, idxs, have, false); 419 488 420 489 if(sq.to_submit > 30) { … … 428 497 } 429 498 499 // call from a processor to flush 500 // might require arbitration if the thread was migrated after the allocation 430 501 void cfa_io_submit( struct io_context$ * inctx, __u32 idxs[], __u32 have, bool lazy ) __attribute__((nonnull (1))) libcfa_public { 431 502 // __cfadbg_print_safe(io, "Kernel I/O : attempting to submit %u (%s)\n", have, lazy ? "lazy" : "eager"); … … 441 512 if( ctx == inctx ) // We have the right instance? 442 513 { 514 // yes! 
fast submit 443 515 __submit(ctx, idxs, have, lazy); 444 516 … … 507 579 __atomic_store_n(&ctx.sq.free_ring.tail, ftail + count, __ATOMIC_SEQ_CST); 508 580 581 // notify the allocator that new allocations can be made 509 582 __ioarbiter_notify(ctx); 510 583 … … 557 630 } 558 631 632 // notify the arbiter that new allocations are available 559 633 static void __ioarbiter_notify( io_arbiter$ & this, io_context$ * ctx ) { 560 634 /* paranoid */ verify( !empty(this.pending.queue) ); 561 635 /* paranoid */ verify( __preemption_enabled() ); 636 637 // mutual exclusion is needed 562 638 lock( this.pending.lock __cfaabi_dbg_ctx2 ); 563 639 { 640 __cfadbg_print_safe(io, "Kernel I/O : notifying\n"); 641 642 // as long as there are pending allocations try to satisfy them 643 // for simplicity do it in FIFO order 564 644 while( !empty(this.pending.queue) ) { 565 __cfadbg_print_safe(io, "Kernel I/O : notifying\n");645 // get first pending allocs 566 646 __u32 have = ctx->sq.free_ring.tail - ctx->sq.free_ring.head; 567 647 __pending_alloc & pa = (__pending_alloc&)head( this.pending.queue ); 568 648 649 // check if we have enough to satisfy the request 569 650 if( have > pa.want ) goto DONE; 651 652 // if there are enough allocations it means we can drop the request 570 653 drop( this.pending.queue ); 571 654 572 655 /* paranoid */__attribute__((unused)) bool ret = 573 656 657 // actually do the alloc 574 658 __alloc(ctx, pa.idxs, pa.want); 575 659 576 660 /* paranoid */ verify( ret ); 577 661 662 // write out which context statisfied the request and post 663 // this 578 664 pa.ctx = ctx; 579 580 665 post( pa.waitctx ); 581 666 } … … 585 670 } 586 671 unlock( this.pending.lock ); 587 } 588 672 673 /* paranoid */ verify( __preemption_enabled() ); 674 } 675 676 // short hand to avoid the mutual exclusion of the pending is empty regardless 589 677 static void __ioarbiter_notify( io_context$ & ctx ) { 590 if(!empty( ctx.arbiter->pending )) { 591 __ioarbiter_notify( *ctx.arbiter, &ctx ); 592 } 593 } 594 595 // Simply append to the pending 678 if(empty( ctx.arbiter->pending )) return; 679 __ioarbiter_notify( *ctx.arbiter, &ctx ); 680 } 681 682 // Submit from outside the local processor: append to the outstanding list 596 683 static void __ioarbiter_submit( io_context$ * ctx, __u32 idxs[], __u32 have, bool lazy ) { 597 684 __cfadbg_print_safe(io, "Kernel I/O : submitting %u from the arbiter to context %u\n", have, ctx->fd); … … 599 686 __cfadbg_print_safe(io, "Kernel I/O : waiting to submit %u\n", have); 600 687 688 // create the intrusive object to append 601 689 __external_io ei; 602 690 ei.idxs = idxs; … … 604 692 ei.lazy = lazy; 605 693 694 // enqueue the io 606 695 bool we = enqueue(ctx->ext_sq, (__outstanding_io&)ei); 607 696 697 // mark pending 608 698 __atomic_store_n(&ctx->proc->io.pending, true, __ATOMIC_SEQ_CST); 609 699 700 // if this is the first to be enqueued, signal the processor in an attempt to speed up flushing 701 // if it's not the first enqueue, a signal is already in transit 610 702 if( we ) { 611 703 sigval_t value = { PREEMPT_IO }; 612 704 __cfaabi_pthread_sigqueue(ctx->proc->kernel_thread, SIGUSR1, value); 613 } 614 705 __STATS__( false, io.flush.signal += 1; ) 706 } 707 __STATS__( false, io.submit.extr += 1; ) 708 709 // to avoid dynamic allocation/memory reclamation headaches, wait for it to have been submitted 615 710 wait( ei.waitctx ); 616 711 … … 618 713 } 619 714 620 static void __ioarbiter_flush( io_context$ & ctx ) { 621 if(!empty( ctx.ext_sq )) { 622 __STATS__( false, 
io.flush.external += 1; ) 623 624 __cfadbg_print_safe(io, "Kernel I/O : arbiter flushing\n"); 625 626 lock( ctx.ext_sq.lock __cfaabi_dbg_ctx2 ); 627 { 628 while( !empty(ctx.ext_sq.queue) ) { 629 __external_io & ei = (__external_io&)drop( ctx.ext_sq.queue ); 630 631 __submit_only(&ctx, ei.idxs, ei.have); 632 633 post( ei.waitctx ); 634 } 635 636 ctx.ext_sq.empty = true; 715 // flush the io arbiter: move all external io operations to the submission ring 716 static void __ioarbiter_flush( io_context$ & ctx, bool kernel ) { 717 // if there are no external operations just return 718 if(empty( ctx.ext_sq )) return; 719 720 // stats and logs 721 __STATS__( false, io.flush.external += 1; ) 722 __cfadbg_print_safe(io, "Kernel I/O : arbiter flushing\n"); 723 724 // this can happen from multiple processors, mutual exclusion is needed 725 lock( ctx.ext_sq.lock __cfaabi_dbg_ctx2 ); 726 { 727 // pop each operation one at a time. 728 // There is no wait morphing because of the io sq ring 729 while( !empty(ctx.ext_sq.queue) ) { 730 // drop the element from the queue 731 __external_io & ei = (__external_io&)drop( ctx.ext_sq.queue ); 732 733 // submit it 734 __submit_only(&ctx, ei.idxs, ei.have, true); 735 736 // wake the thread that was waiting on it 737 // since this can both be called from kernel and user, check the flag before posting 738 __post( ei.waitctx, kernel, UNPARK_LOCAL ); 637 739 } 638 unlock(ctx.ext_sq.lock ); 740 741 // mark the queue as empty 742 ctx.ext_sq.empty = true; 743 ctx.sq.last_external = true; 744 } 745 unlock(ctx.ext_sq.lock ); 746 } 747 748 extern "C" { 749 // debug functions used for gdb 750 // io_uring doesn't yet support gdb soe the kernel-shared data structures aren't viewable in gdb 751 // these functions read the data that gdb can't and should be removed once the support is added 752 static __u32 __cfagdb_cq_head( io_context$ * ctx ) __attribute__((nonnull(1),used,noinline)) { return *ctx->cq.head; } 753 static __u32 __cfagdb_cq_tail( io_context$ * ctx ) __attribute__((nonnull(1),used,noinline)) { return *ctx->cq.tail; } 754 static __u32 __cfagdb_cq_mask( io_context$ * ctx ) __attribute__((nonnull(1),used,noinline)) { return *ctx->cq.mask; } 755 static __u32 __cfagdb_sq_head( io_context$ * ctx ) __attribute__((nonnull(1),used,noinline)) { return *ctx->sq.kring.head; } 756 static __u32 __cfagdb_sq_tail( io_context$ * ctx ) __attribute__((nonnull(1),used,noinline)) { return *ctx->sq.kring.tail; } 757 static __u32 __cfagdb_sq_mask( io_context$ * ctx ) __attribute__((nonnull(1),used,noinline)) { return *ctx->sq.mask; } 758 759 // fancier version that reads an sqe and copies it out. 760 static struct io_uring_sqe __cfagdb_sq_at( io_context$ * ctx, __u32 at ) __attribute__((nonnull(1),used,noinline)) { 761 __u32 ax = at & *ctx->sq.mask; 762 __u32 ix = ctx->sq.kring.array[ax]; 763 return ctx->sq.sqes[ix]; 639 764 } 640 765 } -
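
Much of the new commentary in io.cfa describes the flush/drain protocol around the raw io_uring_enter system call quoted above. One common shape for that retry loop is sketched below; this is a hedged illustration (the library's exact error handling is not shown in the diff) and assumes a kernel that provides __NR_io_uring_enter.

#include <errno.h>
#include <signal.h>
#include <sys/syscall.h>
#include <unistd.h>

// Returns the number of sqes the kernel consumed, 0 if the ring is busy, or a
// negative errno for real failures; interrupted calls are simply retried.
static int enter_ring( int ring_fd, unsigned to_submit, unsigned min_complete, unsigned flags ) {
	for ( ;; ) {
		int ret = syscall( __NR_io_uring_enter, ring_fd, to_submit, min_complete,
		                   flags, (sigset_t *)0, _NSIG / 8 );
		if ( ret >= 0 ) return ret;
		switch ( errno ) {
		  case EINTR:  continue;		// interrupted by a signal: retry
		  case EAGAIN:
		  case EBUSY:  return 0;		// ring busy: leave the work for a later flush
		  default:     return -errno;	// genuine error
		}
	}
}
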
libcfa/src/concurrency/io/call.cfa.in
r34b4268 r24d6572 31 31 Prelude = """#define __cforall_thread__ 32 32 33 #include <unistd.h> 34 #include <errno.h> 35 #include <sys/socket.h> 36 #include <time.hfa> 37 33 38 #include "bits/defs.hfa" 34 39 #include "kernel.hfa" … … 43 48 #include <assert.h> 44 49 #include <stdint.h> 45 #include <errno.h>46 50 #include <linux/io_uring.h> 47 48 51 #include "kernel/fwd.hfa" 49 52 … … 82 85 // I/O Forwards 83 86 //============================================================================================= 84 #include <time.hfa>85 86 // Some forward declarations87 #include <errno.h>88 #include <unistd.h>89 87 90 88 extern "C" { 91 #include <asm/types.h>92 #include <sys/socket.h>93 #include <sys/syscall.h>94 95 89 #if defined(CFA_HAVE_PREADV2) 96 90 struct iovec; 97 extern ssize_t preadv2 (int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags);91 extern ssize_t preadv2 (int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags); 98 92 #endif 99 93 #if defined(CFA_HAVE_PWRITEV2) 100 94 struct iovec; 101 extern ssize_t pwritev2(int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags);95 extern ssize_t pwritev2(int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags); 102 96 #endif 103 97 … … 114 108 struct msghdr; 115 109 struct sockaddr; 116 extern ssize_t sendmsg(int sockfd, const struct msghdr *msg, int flags); 117 extern ssize_t recvmsg(int sockfd, struct msghdr *msg, int flags); 118 extern ssize_t send(int sockfd, const void *buf, size_t len, int flags); 119 extern ssize_t recv(int sockfd, void *buf, size_t len, int flags); 120 extern int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags); 121 extern int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen); 110 extern ssize_t sendmsg(int sockfd, const struct msghdr * msg, int flags); 111 extern ssize_t recvmsg(int sockfd, struct msghdr * msg, int flags); 112 extern ssize_t send(int sockfd, const void * buf, size_t len, int flags); 113 extern ssize_t recv(int sockfd, void * buf, size_t len, int flags); 122 114 123 115 extern int fallocate(int fd, int mode, off_t offset, off_t len); 124 116 extern int posix_fadvise(int fd, off_t offset, off_t len, int advice); 125 extern int madvise(void * addr, size_t length, int advice);126 127 extern int openat(int dirfd, const char * pathname, int flags, mode_t mode);117 extern int madvise(void * addr, size_t length, int advice); 118 119 extern int openat(int dirfd, const char * pathname, int flags, mode_t mode); 128 120 extern int close(int fd); 129 121 130 extern ssize_t read (int fd, void * buf, size_t count);122 extern ssize_t read (int fd, void * buf, size_t count); 131 123 132 124 struct epoll_event; 133 extern int epoll_ctl(int epfd, int op, int fd, struct epoll_event * event);134 135 extern ssize_t splice(int fd_in, __off64_t * off_in, int fd_out, __off64_t *off_out, size_t len, unsigned int flags);125 extern int epoll_ctl(int epfd, int op, int fd, struct epoll_event * event); 126 127 extern ssize_t splice(int fd_in, __off64_t * off_in, int fd_out, __off64_t * off_out, size_t len, unsigned int flags); 136 128 extern ssize_t tee(int fd_in, int fd_out, size_t len, unsigned int flags); 137 129 } … … 232 224 calls = [ 233 225 # CFA_HAVE_IORING_OP_READV 234 Call('READV', 'ssize_t preadv2(int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags)', {226 Call('READV', 'ssize_t preadv2(int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags)', { 235 227 'fd' : 'fd', 228 'addr': 
'(typeof(sqe->addr))iov', 229 'len' : 'iovcnt', 236 230 'off' : 'offset', 237 'addr': '(uintptr_t)iov', 238 'len' : 'iovcnt', 231 'rw_flags' : 'flags' 239 232 }, define = 'CFA_HAVE_PREADV2'), 240 233 # CFA_HAVE_IORING_OP_WRITEV 241 Call('WRITEV', 'ssize_t pwritev2(int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags)', {234 Call('WRITEV', 'ssize_t pwritev2(int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags)', { 242 235 'fd' : 'fd', 236 'addr': '(typeof(sqe->addr))iov', 237 'len' : 'iovcnt', 243 238 'off' : 'offset', 244 'addr': '(uintptr_t)iov', 245 'len' : 'iovcnt' 239 'rw_flags' : 'flags' 246 240 }, define = 'CFA_HAVE_PWRITEV2'), 247 241 # CFA_HAVE_IORING_OP_FSYNC … … 250 244 }), 251 245 # CFA_HAVE_IORING_OP_EPOLL_CTL 252 Call('EPOLL_CTL', 'int epoll_ctl(int epfd, int op, int fd, struct epoll_event * event)', {246 Call('EPOLL_CTL', 'int epoll_ctl(int epfd, int op, int fd, struct epoll_event * event)', { 253 247 'fd': 'epfd', 248 'len': 'op', 254 249 'addr': 'fd', 255 'len': 'op', 256 'off': '(uintptr_t)event' 250 'off': '(typeof(sqe->off))event' 257 251 }), 258 252 # CFA_HAVE_IORING_OP_SYNC_FILE_RANGE … … 264 258 }), 265 259 # CFA_HAVE_IORING_OP_SENDMSG 266 Call('SENDMSG', 'ssize_t sendmsg(int sockfd, const struct msghdr * msg, int flags)', {267 'fd': 'sockfd', 268 'addr': '( uintptr_t)(struct msghdr *)msg',260 Call('SENDMSG', 'ssize_t sendmsg(int sockfd, const struct msghdr * msg, int flags)', { 261 'fd': 'sockfd', 262 'addr': '(typeof(sqe->addr))(struct msghdr *)msg', 269 263 'len': '1', 270 264 'msg_flags': 'flags' 271 265 }), 272 266 # CFA_HAVE_IORING_OP_RECVMSG 273 Call('RECVMSG', 'ssize_t recvmsg(int sockfd, struct msghdr * msg, int flags)', {274 'fd': 'sockfd', 275 'addr': '( uintptr_t)(struct msghdr *)msg',267 Call('RECVMSG', 'ssize_t recvmsg(int sockfd, struct msghdr * msg, int flags)', { 268 'fd': 'sockfd', 269 'addr': '(typeof(sqe->addr))(struct msghdr *)msg', 276 270 'len': '1', 277 271 'msg_flags': 'flags' 278 272 }), 279 273 # CFA_HAVE_IORING_OP_SEND 280 Call('SEND', 'ssize_t send(int sockfd, const void * buf, size_t len, int flags)', {281 'fd': 'sockfd', 282 'addr': '( uintptr_t)buf',274 Call('SEND', 'ssize_t send(int sockfd, const void * buf, size_t len, int flags)', { 275 'fd': 'sockfd', 276 'addr': '(typeof(sqe->addr))buf', 283 277 'len': 'len', 284 278 'msg_flags': 'flags' 285 279 }), 286 280 # CFA_HAVE_IORING_OP_RECV 287 Call('RECV', 'ssize_t recv(int sockfd, void * buf, size_t len, int flags)', {288 'fd': 'sockfd', 289 'addr': '( uintptr_t)buf',281 Call('RECV', 'ssize_t recv(int sockfd, void * buf, size_t len, int flags)', { 282 'fd': 'sockfd', 283 'addr': '(typeof(sqe->addr))buf', 290 284 'len': 'len', 291 285 'msg_flags': 'flags' 292 286 }), 293 287 # CFA_HAVE_IORING_OP_ACCEPT 294 Call('ACCEPT', 'int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags)', {295 'fd': 'sockfd', 296 'addr': '( uintptr_t)addr',297 'addr2': '( uintptr_t)addrlen',288 Call('ACCEPT', 'int accept4(int sockfd, __SOCKADDR_ARG addr, socklen_t * restrict addrlen, int flags)', { 289 'fd': 'sockfd', 290 'addr': '(typeof(sqe->addr))&addr', 291 'addr2': '(typeof(sqe->addr2))addrlen', 298 292 'accept_flags': 'flags' 299 293 }), 300 294 # CFA_HAVE_IORING_OP_CONNECT 301 Call('CONNECT', 'int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen)', {302 'fd': 'sockfd', 303 'addr': '( uintptr_t)addr',295 Call('CONNECT', 'int connect(int sockfd, __CONST_SOCKADDR_ARG addr, socklen_t addrlen)', { 296 'fd': 'sockfd', 297 'addr': 
'(typeof(sqe->addr))&addr', 304 298 'off': 'addrlen' 305 299 }), … … 307 301 Call('FALLOCATE', 'int fallocate(int fd, int mode, off_t offset, off_t len)', { 308 302 'fd': 'fd', 309 'addr': '(uintptr_t)len',310 303 'len': 'mode', 311 'off': 'offset' 304 'off': 'offset', 305 'addr': 'len' 312 306 }), 313 307 # CFA_HAVE_IORING_OP_FADVISE … … 319 313 }), 320 314 # CFA_HAVE_IORING_OP_MADVISE 321 Call('MADVISE', 'int madvise(void * addr, size_t length, int advice)', {322 'addr': '( uintptr_t)addr',315 Call('MADVISE', 'int madvise(void * addr, size_t length, int advice)', { 316 'addr': '(typeof(sqe->addr))addr', 323 317 'len': 'length', 324 318 'fadvise_advice': 'advice' 325 319 }), 326 320 # CFA_HAVE_IORING_OP_OPENAT 327 Call('OPENAT', 'int openat(int dirfd, const char * pathname, int flags, mode_t mode)', {321 Call('OPENAT', 'int openat(int dirfd, const char * pathname, int flags, mode_t mode)', { 328 322 'fd': 'dirfd', 329 'addr': '( uintptr_t)pathname',330 ' len': 'mode',331 ' open_flags': 'flags;'323 'addr': '(typeof(sqe->addr))pathname', 324 'open_flags': 'flags;', 325 'len': 'mode' 332 326 }), 333 327 # CFA_HAVE_IORING_OP_OPENAT2 334 Call('OPENAT2', 'int openat2(int dirfd, const char * pathname, struct open_how * how, size_t size)', {328 Call('OPENAT2', 'int openat2(int dirfd, const char * pathname, struct open_how * how, size_t size)', { 335 329 'fd': 'dirfd', 336 'addr': ' pathname',337 ' len': 'sizeof(*how)',338 ' off': '(uintptr_t)how',330 'addr': '(typeof(sqe->addr))pathname', 331 'off': '(typeof(sqe->off))how', 332 'len': 'sizeof(*how)' 339 333 }, define = 'CFA_HAVE_OPENAT2'), 340 334 # CFA_HAVE_IORING_OP_CLOSE … … 343 337 }), 344 338 # CFA_HAVE_IORING_OP_STATX 345 Call('STATX', 'int statx(int dirfd, const char * pathname, int flags, unsigned int mask, struct statx *statxbuf)', {339 Call('STATX', 'int statx(int dirfd, const char * pathname, int flags, unsigned int mask, struct statx * statxbuf)', { 346 340 'fd': 'dirfd', 347 ' off': '(uintptr_t)statxbuf',348 ' addr': 'pathname',341 'addr': '(typeof(sqe->addr))pathname', 342 'statx_flags': 'flags', 349 343 'len': 'mask', 350 ' statx_flags': 'flags'344 'off': '(typeof(sqe->off))statxbuf' 351 345 }, define = 'CFA_HAVE_STATX'), 352 346 # CFA_HAVE_IORING_OP_READ 353 347 Call('READ', 'ssize_t read(int fd, void * buf, size_t count)', { 354 348 'fd': 'fd', 355 'addr': '( uintptr_t)buf',349 'addr': '(typeof(sqe->addr))buf', 356 350 'len': 'count' 357 351 }), … … 359 353 Call('WRITE', 'ssize_t write(int fd, void * buf, size_t count)', { 360 354 'fd': 'fd', 361 'addr': '( uintptr_t)buf',355 'addr': '(typeof(sqe->addr))buf', 362 356 'len': 'count' 363 357 }), 364 358 # CFA_HAVE_IORING_OP_SPLICE 365 Call('SPLICE', 'ssize_t splice(int fd_in, __off64_t * off_in, int fd_out, __off64_t *off_out, size_t len, unsigned int flags)', {359 Call('SPLICE', 'ssize_t splice(int fd_in, __off64_t * off_in, int fd_out, __off64_t * off_out, size_t len, unsigned int flags)', { 366 360 'splice_fd_in': 'fd_in', 367 'splice_off_in': 'off_in ? ( __u64)*off_in : (__u64)-1',361 'splice_off_in': 'off_in ? (typeof(sqe->splice_off_in))*off_in : (typeof(sqe->splice_off_in))-1', 368 362 'fd': 'fd_out', 369 'off': 'off_out ? ( __u64)*off_out : (__u64)-1',363 'off': 'off_out ? (typeof(sqe->off))*off_out : (typeof(sqe->off))-1', 370 364 'len': 'len', 371 365 'splice_flags': 'flags' -
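The table above maps each forwarded call onto fields of a raw io_uring submission-queue entry (fd, addr, len, off, and the various *_flags members). As a point of reference, here is a minimal plain-C sketch of the same idea for read(2), using only the structures from <linux/io_uring.h>; the helper name prep_read_sqe is illustrative and is not part of the generated code.

    #include <linux/io_uring.h>
    #include <string.h>
    #include <stdint.h>
    #include <sys/types.h>

    // Fill one submission-queue entry so the kernel performs read(fd, buf, count) at 'offset'.
    // Mirrors the 'fd'/'addr'/'len'/'off' mapping used by the generator above.
    void prep_read_sqe( struct io_uring_sqe * sqe, int fd, void * buf, unsigned count, off_t offset ) {
        memset( sqe, 0, sizeof(*sqe) );             // unused fields must be zero
        sqe->opcode = IORING_OP_READ;               // operation selector
        sqe->fd     = fd;                           // 'fd' field
        sqe->addr   = (uint64_t)(uintptr_t)buf;     // 'addr' field: destination buffer
        sqe->len    = count;                        // 'len' field: byte count
        sqe->off    = (uint64_t)offset;             // 'off' field: file offset
    }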
libcfa/src/concurrency/io/setup.cfa
r34b4268 r24d6572 15 15 16 16 #define __cforall_thread__ 17 #define _GNU_SOURCE18 17 19 18 #if defined(__CFA_DEBUG__) … … 216 215 217 216 // completion queue 218 cq. lock= false;217 cq.try_lock = false; 219 218 cq.id = MAX; 220 219 cq.ts = rdtscl(); -
libcfa/src/concurrency/io/types.hfa
r34b4268 r24d6572 37 37 //----------------------------------------------------------------------- 38 38 // Ring Data structure 39 struct __sub_ring_t { 39 // represent the io_uring submission ring which contains operations that will be sent to io_uring for processing 40 struct __sub_ring_t { 41 // lock needed because remote processors might need to flush the instance 42 __spinlock_t lock; 43 40 44 struct { 41 45 // Head and tail of the ring (associated with array) … … 58 62 59 63 // number of sqes to submit on next system call. 60 __u32 to_submit;64 volatile __u32 to_submit; 61 65 62 66 // number of entries and mask to go with it … … 77 81 void * ring_ptr; 78 82 size_t ring_sz; 79 }; 80 83 84 // for debug purposes, whether or not the last flush was due to an arbiter flush 85 bool last_external; 86 }; 87 88 // represent the io_uring completion ring which contains operations that have completed 81 89 struct __cmp_ring_t { 82 volatile bool lock; 83 90 // needed because remote processors can help drain the buffer 91 volatile bool try_lock; 92 93 // id of the ring, used for the helping/topology algorithms 84 94 unsigned id; 85 95 96 // timestamp from last time it was drained 86 97 unsigned long long ts; 87 98 … … 105 116 }; 106 117 118 // struct representing an io operation that still needs processing 119 // actual operations are expected to inherit from this 107 120 struct __outstanding_io { 121 // intrusive link fields 108 122 inline Colable; 123 124 // primitive on which to block until the io is processed 109 125 oneshot waitctx; 110 126 }; 111 127 static inline __outstanding_io *& Next( __outstanding_io * n ) { return (__outstanding_io *)Next( (Colable *)n ); } 112 128 129 // queue of operations that are outstanding 113 130 struct __outstanding_io_queue { 131 // spinlock for protection 132 // TODO: changing to a lock that blocks, I haven't examined whether it should be a kernel or user lock 114 133 __spinlock_t lock; 134 135 // the actual queue 115 136 Queue(__outstanding_io) queue; 137 138 // volatile used to avoid the need for taking the lock if it's empty 116 139 volatile bool empty; 117 140 }; 118 141 142 // struct representing an operation that was submitted 119 143 struct __external_io { 144 // inherits from outstanding io 120 145 inline __outstanding_io; 146 147 // pointer and count to an array of ids to be submitted 121 148 __u32 * idxs; 122 149 __u32 have; 150 151 // whether or not these can be accumulated before flushing the buffer 123 152 bool lazy; 124 153 }; 125 154 126 155 // complete io_context, contains all the data for io submission and completion 127 156 struct __attribute__((aligned(64))) io_context$ { 157 // arbiter, used in cases where threads are migrated at unfortunate moments 128 158 io_arbiter$ * arbiter; 159 160 // which processor the context is tied to 129 161 struct processor * proc; 130 162 163 // queue of io submissions that haven't been processed.
131 164 __outstanding_io_queue ext_sq; 132 165 166 // io_uring ring data structures 133 167 struct __sub_ring_t sq; 134 168 struct __cmp_ring_t cq; 169 170 // flags the io_uring rings were created with 135 171 __u32 ring_flags; 172 173 // file descriptor that identifies the io_uring instance 136 174 int fd; 137 175 }; 138 176 177 // short hand to check when the io_context was last processed (io drained) 139 178 static inline unsigned long long ts(io_context$ *& this) { 140 179 const __u32 head = *this->cq.head; 141 180 const __u32 tail = *this->cq.tail; 142 181 182 // if there are no pending completions, just pretend it's infinitely recent 143 183 if(head == tail) return ULLONG_MAX; 144 184 … … 146 186 } 147 187 188 // structure representing allocations that couldn't succeed locally 148 189 struct __pending_alloc { 190 // inherit from outstanding io 149 191 inline __outstanding_io; 192 193 // array and size of the desired allocation 150 194 __u32 * idxs; 151 195 __u32 want; 196 197 // output param, the context the io was allocated from 152 198 io_context$ * ctx; 153 199 }; 154 200 201 // arbiter that handles cases where the context tied to the local processor is unable to satisfy the io 155 202 monitor __attribute__((aligned(64))) io_arbiter$ { 203 // contains a queue of io for pending allocations 156 204 __outstanding_io_queue pending; 157 205 }; -
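The ts() helper above peeks at the completion ring's head and tail to decide how stale an io_context is. For orientation, here is a stand-alone C11 sketch of the same head/tail discipline used when a completion ring is actually drained; the field and function names are generic stand-ins, not the CFA ones.

    #include <stdatomic.h>

    struct cqe { long long result; };                   // one completion entry (simplified)

    struct cring {
        _Atomic unsigned head;                          // consumer index, advanced by the drainer
        _Atomic unsigned tail;                          // producer index, advanced by the kernel
        unsigned mask;                                  // ring size - 1 (size is a power of two)
        struct cqe * cqes;                              // array of completion entries
    };

    // Consume every completion currently published between head and tail,
    // then release the slots back to the producer by publishing the new head.
    unsigned drain( struct cring * cq ) {
        unsigned head = atomic_load_explicit( &cq->head, memory_order_relaxed );
        unsigned tail = atomic_load_explicit( &cq->tail, memory_order_acquire );
        unsigned count = tail - head;
        for ( unsigned i = 0; i < count; i += 1 ) {
            struct cqe * e = &cq->cqes[ (head + i) & cq->mask ];
            (void)e->result;                            // deliver the result to the waiting operation
        }
        atomic_store_explicit( &cq->head, head + count, memory_order_release );
        return count;
    }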
libcfa/src/concurrency/iofwd.hfa
r34b4268 r24d6572 9 9 // Author : Thierry Delisle 10 10 // Created On : Thu Apr 23 17:31:00 2020 11 // Last Modified By : 12 // Last Modified On : 13 // Update Count : 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Mon Mar 13 23:54:57 2023 13 // Update Count : 1 14 14 // 15 15 … … 17 17 18 18 #include <unistd.h> 19 #include <sys/socket.h> 20 19 21 extern "C" { 20 22 #include <asm/types.h> … … 48 50 typedef __off64_t off64_t; 49 51 50 struct cluster;51 struct io_context$;52 53 struct iovec;54 struct msghdr;55 struct sockaddr;56 struct statx;57 52 struct epoll_event; 58 59 struct io_uring_sqe;60 53 61 54 //----------------------------------------------------------------------- … … 88 81 // synchronous calls 89 82 #if defined(CFA_HAVE_PREADV2) 90 extern ssize_t cfa_preadv2(int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);83 extern ssize_t cfa_preadv2(int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags, __u64 submit_flags); 91 84 #endif 92 85 #if defined(CFA_HAVE_PWRITEV2) 93 extern ssize_t cfa_pwritev2(int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);86 extern ssize_t cfa_pwritev2(int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags, __u64 submit_flags); 94 87 #endif 95 88 extern int cfa_fsync(int fd, __u64 submit_flags); 96 extern int cfa_epoll_ctl(int epfd, int op, int fd, struct epoll_event * event, __u64 submit_flags);89 extern int cfa_epoll_ctl(int epfd, int op, int fd, struct epoll_event * event, __u64 submit_flags); 97 90 extern int cfa_sync_file_range(int fd, off64_t offset, off64_t nbytes, unsigned int flags, __u64 submit_flags); 98 extern ssize_t cfa_sendmsg(int sockfd, const struct msghdr * msg, int flags, __u64 submit_flags);99 extern ssize_t cfa_recvmsg(int sockfd, struct msghdr * msg, int flags, __u64 submit_flags);100 extern ssize_t cfa_send(int sockfd, const void * buf, size_t len, int flags, __u64 submit_flags);101 extern ssize_t cfa_recv(int sockfd, void * buf, size_t len, int flags, __u64 submit_flags);102 extern int cfa_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags, __u64 submit_flags);103 extern int cfa_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen, __u64 submit_flags);91 extern ssize_t cfa_sendmsg(int sockfd, const struct msghdr * msg, int flags, __u64 submit_flags); 92 extern ssize_t cfa_recvmsg(int sockfd, struct msghdr * msg, int flags, __u64 submit_flags); 93 extern ssize_t cfa_send(int sockfd, const void * buf, size_t len, int flags, __u64 submit_flags); 94 extern ssize_t cfa_recv(int sockfd, void * buf, size_t len, int flags, __u64 submit_flags); 95 extern int cfa_accept4(int sockfd, __SOCKADDR_ARG addr, socklen_t * restrict addrlen, int flags, __u64 submit_flags); 96 extern int cfa_connect(int sockfd, __CONST_SOCKADDR_ARG addr, socklen_t addrlen, __u64 submit_flags); 104 97 extern int cfa_fallocate(int fd, int mode, off_t offset, off_t len, __u64 submit_flags); 105 98 extern int cfa_posix_fadvise(int fd, off_t offset, off_t len, int advice, __u64 submit_flags); 106 extern int cfa_madvise(void * addr, size_t length, int advice, __u64 submit_flags);107 extern int cfa_openat(int dirfd, const char * pathname, int flags, mode_t mode, __u64 submit_flags);99 extern int cfa_madvise(void * addr, size_t length, int advice, __u64 submit_flags); 100 extern int cfa_openat(int dirfd, const char * pathname, int flags, mode_t mode, __u64 submit_flags); 108 101 #if defined(CFA_HAVE_OPENAT2) 109 
extern int cfa_openat2(int dirfd, const char * pathname, struct open_how * how, size_t size, __u64 submit_flags);102 extern int cfa_openat2(int dirfd, const char * pathname, struct open_how * how, size_t size, __u64 submit_flags); 110 103 #endif 111 104 extern int cfa_close(int fd, __u64 submit_flags); 112 105 #if defined(CFA_HAVE_STATX) 113 extern int cfa_statx(int dirfd, const char * pathname, int flags, unsigned int mask, struct statx *statxbuf, __u64 submit_flags);106 extern int cfa_statx(int dirfd, const char * pathname, int flags, unsigned int mask, struct statx * statxbuf, __u64 submit_flags); 114 107 #endif 115 108 extern ssize_t cfa_read(int fd, void * buf, size_t count, __u64 submit_flags); 116 109 extern ssize_t cfa_write(int fd, void * buf, size_t count, __u64 submit_flags); 117 extern ssize_t cfa_splice(int fd_in, __off64_t * off_in, int fd_out, __off64_t *off_out, size_t len, unsigned int flags, __u64 submit_flags);110 extern ssize_t cfa_splice(int fd_in, __off64_t * off_in, int fd_out, __off64_t * off_out, size_t len, unsigned int flags, __u64 submit_flags); 118 111 extern ssize_t cfa_tee(int fd_in, int fd_out, size_t len, unsigned int flags, __u64 submit_flags); 119 112 … … 121 114 // asynchronous calls 122 115 #if defined(CFA_HAVE_PREADV2) 123 extern void async_preadv2(io_future_t & future, int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);116 extern void async_preadv2(io_future_t & future, int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags, __u64 submit_flags); 124 117 #endif 125 118 #if defined(CFA_HAVE_PWRITEV2) 126 extern void async_pwritev2(io_future_t & future, int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);119 extern void async_pwritev2(io_future_t & future, int fd, const struct iovec * iov, int iovcnt, off_t offset, int flags, __u64 submit_flags); 127 120 #endif 128 121 extern void async_fsync(io_future_t & future, int fd, __u64 submit_flags); 129 extern void async_epoll_ctl(io_future_t & future, int epfd, int op, int fd, struct epoll_event * event, __u64 submit_flags);122 extern void async_epoll_ctl(io_future_t & future, int epfd, int op, int fd, struct epoll_event * event, __u64 submit_flags); 130 123 extern void async_sync_file_range(io_future_t & future, int fd, off64_t offset, off64_t nbytes, unsigned int flags, __u64 submit_flags); 131 extern void async_sendmsg(io_future_t & future, int sockfd, const struct msghdr * msg, int flags, __u64 submit_flags);132 extern void async_recvmsg(io_future_t & future, int sockfd, struct msghdr * msg, int flags, __u64 submit_flags);133 extern void async_send(io_future_t & future, int sockfd, const void * buf, size_t len, int flags, __u64 submit_flags);134 extern void async_recv(io_future_t & future, int sockfd, void * buf, size_t len, int flags, __u64 submit_flags);135 extern void async_accept4(io_future_t & future, int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags, __u64 submit_flags);136 extern void async_connect(io_future_t & future, int sockfd, const struct sockaddr *addr, socklen_t addrlen, __u64 submit_flags);124 extern void async_sendmsg(io_future_t & future, int sockfd, const struct msghdr * msg, int flags, __u64 submit_flags); 125 extern void async_recvmsg(io_future_t & future, int sockfd, struct msghdr * msg, int flags, __u64 submit_flags); 126 extern void async_send(io_future_t & future, int sockfd, const void * buf, size_t len, int flags, __u64 submit_flags); 127 extern void async_recv(io_future_t 
& future, int sockfd, void * buf, size_t len, int flags, __u64 submit_flags); 128 extern void async_accept4(io_future_t & future, int sockfd, __SOCKADDR_ARG addr, socklen_t * restrict addrlen, int flags, __u64 submit_flags); 129 extern void async_connect(io_future_t & future, int sockfd, __CONST_SOCKADDR_ARG addr, socklen_t addrlen, __u64 submit_flags); 137 130 extern void async_fallocate(io_future_t & future, int fd, int mode, off_t offset, off_t len, __u64 submit_flags); 138 131 extern void async_posix_fadvise(io_future_t & future, int fd, off_t offset, off_t len, int advice, __u64 submit_flags); 139 extern void async_madvise(io_future_t & future, void * addr, size_t length, int advice, __u64 submit_flags);140 extern void async_openat(io_future_t & future, int dirfd, const char * pathname, int flags, mode_t mode, __u64 submit_flags);132 extern void async_madvise(io_future_t & future, void * addr, size_t length, int advice, __u64 submit_flags); 133 extern void async_openat(io_future_t & future, int dirfd, const char * pathname, int flags, mode_t mode, __u64 submit_flags); 141 134 #if defined(CFA_HAVE_OPENAT2) 142 extern void async_openat2(io_future_t & future, int dirfd, const char * pathname, struct open_how * how, size_t size, __u64 submit_flags);135 extern void async_openat2(io_future_t & future, int dirfd, const char * pathname, struct open_how * how, size_t size, __u64 submit_flags); 143 136 #endif 144 137 extern void async_close(io_future_t & future, int fd, __u64 submit_flags); 145 138 #if defined(CFA_HAVE_STATX) 146 extern void async_statx(io_future_t & future, int dirfd, const char * pathname, int flags, unsigned int mask, struct statx *statxbuf, __u64 submit_flags);139 extern void async_statx(io_future_t & future, int dirfd, const char * pathname, int flags, unsigned int mask, struct statx * statxbuf, __u64 submit_flags); 147 140 #endif 148 141 void async_read(io_future_t & future, int fd, void * buf, size_t count, __u64 submit_flags); 149 142 extern void async_write(io_future_t & future, int fd, void * buf, size_t count, __u64 submit_flags); 150 extern void async_splice(io_future_t & future, int fd_in, __off64_t * off_in, int fd_out, __off64_t *off_out, size_t len, unsigned int flags, __u64 submit_flags);143 extern void async_splice(io_future_t & future, int fd_in, __off64_t * off_in, int fd_out, __off64_t * off_out, size_t len, unsigned int flags, __u64 submit_flags); 151 144 extern void async_tee(io_future_t & future, int fd_in, int fd_out, size_t len, unsigned int flags, __u64 submit_flags); 152 145 -
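The synchronous wrappers above keep the shape of the underlying POSIX call and add a trailing submit_flags argument. A hypothetical usage sketch follows, assuming these declarations are visible through the corresponding header and that a submit_flags value of 0 requests the default submission behaviour (both assumptions of this example, not statements about the library):

    // copy one buffer's worth of data using the io_uring-backed wrappers;
    // the wrappers mirror read(2)/write(2) with an extra submit_flags argument
    ssize_t copy_once( int src, int dst, char * buf, size_t n ) {
        ssize_t r = cfa_read( src, buf, n, 0 );         // read(2) shape + submit_flags
        if ( r <= 0 ) return r;
        return cfa_write( dst, buf, (size_t)r, 0 );     // write(2) shape + submit_flags
    }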
libcfa/src/concurrency/kernel.cfa
r34b4268 r24d6572 10 10 // Created On : Tue Jan 17 12:27:26 2017 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Wed Nov 30 18:14:08 202213 // Update Count : 7 612 // Last Modified On : Mon Jan 9 08:42:05 2023 13 // Update Count : 77 14 14 // 15 15 16 16 #define __cforall_thread__ 17 #define _GNU_SOURCE18 17 19 18 // #define __CFA_DEBUG_PRINT_RUNTIME_CORE__ … … 258 257 __cfadbg_print_safe(runtime_core, "Kernel : core %p stopping\n", this); 259 258 } 259 260 __cfa_io_flush( this ); 261 __cfa_io_drain( this ); 260 262 261 263 post( this->terminated ); -
libcfa/src/concurrency/kernel/cluster.cfa
r34b4268 r24d6572 15 15 16 16 #define __cforall_thread__ 17 #define _GNU_SOURCE18 17 19 18 #include "bits/defs.hfa" … … 69 68 return max_cores_l; 70 69 } 71 72 #if defined(CFA_HAVE_LINUX_LIBRSEQ)73 // No forward declaration needed74 #define __kernel_rseq_register rseq_register_current_thread75 #define __kernel_rseq_unregister rseq_unregister_current_thread76 #elif defined(CFA_HAVE_LINUX_RSEQ_H)77 static void __kernel_raw_rseq_register (void);78 static void __kernel_raw_rseq_unregister(void);79 80 #define __kernel_rseq_register __kernel_raw_rseq_register81 #define __kernel_rseq_unregister __kernel_raw_rseq_unregister82 #else83 // No forward declaration needed84 // No initialization needed85 static inline void noop(void) {}86 87 #define __kernel_rseq_register noop88 #define __kernel_rseq_unregister noop89 #endif90 70 91 71 //======================================================================= … … 111 91 // Lock-Free registering/unregistering of threads 112 92 unsigned register_proc_id( void ) with(__scheduler_lock.lock) { 113 __kernel_rseq_register();114 115 93 bool * handle = (bool *)&kernelTLS().sched_lock; 116 94 … … 162 140 163 141 __atomic_store_n(cell, 0p, __ATOMIC_RELEASE); 164 165 __kernel_rseq_unregister();166 142 } 167 143 … … 505 481 /* paranoid */ verify( mock_head(this) == this.l.prev ); 506 482 } 507 508 #if defined(CFA_HAVE_LINUX_LIBRSEQ)509 // No definition needed510 #elif defined(CFA_HAVE_LINUX_RSEQ_H)511 512 #if defined( __x86_64 ) || defined( __i386 )513 #define RSEQ_SIG 0x53053053514 #elif defined( __ARM_ARCH )515 #ifdef __ARMEB__516 #define RSEQ_SIG 0xf3def5e7 /* udf #24035 ; 0x5de3 (ARMv6+) */517 #else518 #define RSEQ_SIG 0xe7f5def3 /* udf #24035 ; 0x5de3 */519 #endif520 #endif521 522 extern void __disable_interrupts_hard();523 extern void __enable_interrupts_hard();524 525 static void __kernel_raw_rseq_register (void) {526 /* paranoid */ verify( __cfaabi_rseq.cpu_id == RSEQ_CPU_ID_UNINITIALIZED );527 528 // int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), 0, (sigset_t *)0p, _NSIG / 8);529 int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), 0, RSEQ_SIG);530 if(ret != 0) {531 int e = errno;532 switch(e) {533 case EINVAL: abort("KERNEL ERROR: rseq register invalid argument");534 case ENOSYS: abort("KERNEL ERROR: rseq register no supported");535 case EFAULT: abort("KERNEL ERROR: rseq register with invalid argument");536 case EBUSY : abort("KERNEL ERROR: rseq register already registered");537 case EPERM : abort("KERNEL ERROR: rseq register sig argument on unregistration does not match the signature received on registration");538 default: abort("KERNEL ERROR: rseq register unexpected return %d", e);539 }540 }541 }542 543 static void __kernel_raw_rseq_unregister(void) {544 /* paranoid */ verify( __cfaabi_rseq.cpu_id >= 0 );545 546 // int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, (sigset_t *)0p, _NSIG / 8);547 int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, RSEQ_SIG);548 if(ret != 0) {549 int e = errno;550 switch(e) {551 case EINVAL: abort("KERNEL ERROR: rseq unregister invalid argument");552 case ENOSYS: abort("KERNEL ERROR: rseq unregister no supported");553 case EFAULT: abort("KERNEL ERROR: rseq unregister with invalid argument");554 case EBUSY : abort("KERNEL ERROR: rseq unregister already registered");555 case EPERM : abort("KERNEL ERROR: rseq unregister sig argument on unregistration does not match the signature received on registration");556 default: 
abort("KERNEL ERROR: rseq unregisteunexpected return %d", e);557 }558 }559 }560 #else561 // No definition needed562 #endif -
libcfa/src/concurrency/kernel/cluster.hfa
r34b4268 r24d6572 40 40 41 41 // convert to log2 scale but using double 42 static inline __readyQ_avg_t __to_readyQ_avg(unsigned long long intsc) { if(unlikely(0 == intsc)) return 0.0; else return log2( intsc); }42 static inline __readyQ_avg_t __to_readyQ_avg(unsigned long long intsc) { if(unlikely(0 == intsc)) return 0.0; else return log2((__readyQ_avg_t)intsc); } 43 43 44 44 #define warn_large_before warnf( !strict || old_avg < 35.0, "Suspiciously large previous average: %'lf, %'" PRId64 "ms \n", old_avg, program()`ms ) … … 146 146 } 147 147 148 static struct {149 constunsigned readyq;150 constunsigned io;148 const static struct { 149 unsigned readyq; 150 unsigned io; 151 151 } __shard_factor = { 2, 1 }; 152 152 -
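__to_readyQ_avg above feeds tsc intervals into a moving average in log2 space, so a single enormous interval cannot swamp the average, and it special-cases 0 to avoid log2(0). A small illustrative C sketch of that effect follows; the 0.9/0.1 weights are made up for the example and are not the runtime's actual constants.

    #include <math.h>
    #include <stdio.h>

    // convert a raw tsc interval to log2 scale, guarding the zero case as the CFA helper does
    double to_log_avg( unsigned long long intsc ) {
        if ( intsc == 0 ) return 0.0;                   // avoid log2(0) == -inf
        return log2( (double)intsc );
    }

    int main( void ) {
        unsigned long long samples[] = { 1200, 900, 1000000, 1100 };   // one huge outlier
        double avg = 0.0;
        for ( int i = 0; i < 4; i += 1 )
            avg = 0.9 * avg + 0.1 * to_log_avg( samples[i] );          // illustrative weights
        printf( "log2-scale average: %g\n", avg );                     // the outlier only nudges the average
        return 0;
    }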
libcfa/src/concurrency/kernel/private.hfa
r34b4268 r24d6572 10 10 // Created On : Mon Feb 13 12:27:26 2017 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Wed Aug 12 08:21:33 202013 // Update Count : 912 // Last Modified On : Thu Mar 2 16:04:46 2023 13 // Update Count : 11 14 14 // 15 15 … … 29 29 30 30 extern "C" { 31 #if defined(CFA_HAVE_LINUX_LIBRSEQ)32 #include <rseq/rseq.h>33 #elif defined(CFA_HAVE_LINUX_RSEQ_H)34 #include <linux/rseq.h>35 #else36 #ifndef _GNU_SOURCE37 #error kernel/private requires gnu_source38 #endif39 31 #include <sched.h> 40 #endif41 32 } 42 33 … … 110 101 // Hardware 111 102 112 #if defined(CFA_HAVE_LINUX_LIBRSEQ)113 // No data needed114 #elif defined(CFA_HAVE_LINUX_RSEQ_H)115 extern "Cforall" {116 extern __attribute__((aligned(64))) __thread volatile struct rseq __cfaabi_rseq;117 }118 #else119 // No data needed120 #endif121 122 103 static inline int __kernel_getcpu() { 123 104 /* paranoid */ verify( ! __preemption_enabled() ); 124 #if defined(CFA_HAVE_LINUX_LIBRSEQ)125 return rseq_current_cpu();126 #elif defined(CFA_HAVE_LINUX_RSEQ_H)127 int r = __cfaabi_rseq.cpu_id;128 /* paranoid */ verify( r >= 0 );129 return r;130 #else131 105 return sched_getcpu(); 132 #endif133 106 } 134 107 -
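With the rseq paths gone, __kernel_getcpu above reduces to sched_getcpu(). For reference, a minimal stand-alone use of that call (glibc's wrapper around getcpu(2)); the returned CPU number is only a hint, since the thread may migrate immediately afterwards.

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main( void ) {
        int cpu = sched_getcpu();                       // which CPU this thread is currently running on
        if ( cpu < 0 ) { perror( "sched_getcpu" ); return 1; }
        printf( "running on cpu %d\n", cpu );
        return 0;
    }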
libcfa/src/concurrency/kernel/startup.cfa
r34b4268 r24d6572 15 15 16 16 #define __cforall_thread__ 17 #define _GNU_SOURCE18 17 19 18 // #define __CFA_DEBUG_PRINT_RUNTIME_CORE__ 20 19 21 20 // C Includes 22 #include <errno.h> // errno21 #include <errno.h> // errno 23 22 #include <signal.h> 24 #include <string.h> // strerror25 #include <unistd.h> // sysconf26 23 #include <string.h> // strerror 24 #include <unistd.h> 25 #include <limits.h> // PTHREAD_STACK_MIN 27 26 extern "C" { 28 #include <limits.h> // PTHREAD_STACK_MIN 29 #include <unistd.h> // syscall 30 #include <sys/eventfd.h> // eventfd 31 #include <sys/mman.h> // mprotect 32 #include <sys/resource.h> // getrlimit 27 #include <sys/eventfd.h> // eventfd 28 #include <sys/mman.h> // mprotect 29 #include <sys/resource.h> // getrlimit 33 30 } 34 31 … … 36 33 #include "kernel/private.hfa" 37 34 #include "iofwd.hfa" 38 #include "startup.hfa" // STARTUP_PRIORITY_XXX35 #include "startup.hfa" // STARTUP_PRIORITY_XXX 39 36 #include "limits.hfa" 40 37 #include "math.hfa" … … 150 147 __scheduler_RWLock_t __scheduler_lock @= { 0 }; 151 148 152 #if defined(CFA_HAVE_LINUX_LIBRSEQ)153 // No data needed154 #elif defined(CFA_HAVE_LINUX_RSEQ_H)155 extern "Cforall" {156 __attribute__((aligned(64))) __thread volatile struct rseq __cfaabi_rseq @= {157 .cpu_id : RSEQ_CPU_ID_UNINITIALIZED,158 };159 }160 #else161 // No data needed162 #endif163 164 149 //----------------------------------------------------------------------------- 165 150 // Struct to steal stack -
libcfa/src/concurrency/locks.cfa
r34b4268 r24d6572 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // locks. hfa -- LIBCFATHREAD7 // locks.cfa -- LIBCFATHREAD 8 8 // Runtime locks that used with the runtime thread system. 9 9 // … … 16 16 17 17 #define __cforall_thread__ 18 #define _GNU_SOURCE19 18 20 19 #include "locks.hfa" … … 80 79 // lock is held by some other thread 81 80 if ( owner != 0p && owner != thrd ) { 82 insert_last( blocked_threads, *thrd ); 81 select_node node; 82 insert_last( blocked_threads, node ); 83 83 wait_count++; 84 84 unlock( lock ); 85 85 park( ); 86 } 87 // multi acquisition lock is held by current thread 88 else if ( owner == thrd && multi_acquisition ) { 86 return; 87 } else if ( owner == thrd && multi_acquisition ) { // multi acquisition lock is held by current thread 89 88 recursion_count++; 90 unlock( lock ); 91 } 92 // lock isn't held 93 else { 89 } else { // lock isn't held 94 90 owner = thrd; 95 91 recursion_count = 1; 96 unlock( lock );97 } 92 } 93 unlock( lock ); 98 94 } 99 95 … … 118 114 } 119 115 120 static void pop_and_set_new_owner( blocking_lock & this ) with( this ) { 121 thread$ * t = &try_pop_front( blocked_threads ); 122 owner = t; 123 recursion_count = ( t ? 1 : 0 ); 124 if ( t ) wait_count--; 125 unpark( t ); 116 static inline void pop_node( blocking_lock & this ) with( this ) { 117 __handle_waituntil_OR( blocked_threads ); 118 select_node * node = &try_pop_front( blocked_threads ); 119 if ( node ) { 120 wait_count--; 121 owner = node->blocked_thread; 122 recursion_count = 1; 123 // if ( !node->clause_status || __make_select_node_available( *node ) ) unpark( node->blocked_thread ); 124 wake_one( blocked_threads, *node ); 125 } else { 126 owner = 0p; 127 recursion_count = 0; 128 } 126 129 } 127 130 … … 135 138 recursion_count--; 136 139 if ( recursion_count == 0 ) { 137 pop_ and_set_new_owner( this );140 pop_node( this ); 138 141 } 139 142 unlock( lock ); … … 148 151 // lock held 149 152 if ( owner != 0p ) { 150 insert_last( blocked_threads, * t);153 insert_last( blocked_threads, *(select_node *)t->link_node ); 151 154 wait_count++; 152 unlock( lock );153 155 } 154 156 // lock not held … … 157 159 recursion_count = 1; 158 160 unpark( t ); 159 unlock( lock );160 } 161 } 162 163 size_t on_wait( blocking_lock & this ) with( this ) {161 } 162 unlock( lock ); 163 } 164 165 size_t on_wait( blocking_lock & this, __cfa_pre_park pp_fn, void * pp_datum ) with( this ) { 164 166 lock( lock __cfaabi_dbg_ctx2 ); 165 167 /* paranoid */ verifyf( owner != 0p, "Attempt to release lock %p that isn't held", &this ); … … 168 170 size_t ret = recursion_count; 169 171 170 pop_and_set_new_owner( this ); 172 pop_node( this ); 173 174 select_node node; 175 active_thread()->link_node = (void *)&node; 171 176 unlock( lock ); 177 178 pre_park_then_park( pp_fn, pp_datum ); 179 172 180 return ret; 173 181 } … … 176 184 recursion_count = recursion; 177 185 } 186 187 // waituntil() support 188 bool register_select( blocking_lock & this, select_node & node ) with(this) { 189 lock( lock __cfaabi_dbg_ctx2 ); 190 thread$ * thrd = active_thread(); 191 192 // single acquisition lock is held by current thread 193 /* paranoid */ verifyf( owner != thrd || multi_acquisition, "Single acquisition lock holder (%p) attempted to reacquire the lock %p resulting in a deadlock.", owner, &this ); 194 195 if ( !node.park_counter && ( (owner == thrd && multi_acquisition) || owner == 0p ) ) { // OR special case 196 if ( !__make_select_node_available( node ) ) { // we didn't win the race so give up on registering 197 
unlock( lock ); 198 return false; 199 } 200 } 201 202 // lock is held by some other thread 203 if ( owner != 0p && owner != thrd ) { 204 insert_last( blocked_threads, node ); 205 wait_count++; 206 unlock( lock ); 207 return false; 208 } else if ( owner == thrd && multi_acquisition ) { // multi acquisition lock is held by current thread 209 recursion_count++; 210 } else { // lock isn't held 211 owner = thrd; 212 recursion_count = 1; 213 } 214 215 if ( node.park_counter ) __make_select_node_available( node ); 216 unlock( lock ); 217 return true; 218 } 219 220 bool unregister_select( blocking_lock & this, select_node & node ) with(this) { 221 lock( lock __cfaabi_dbg_ctx2 ); 222 if ( node`isListed ) { 223 remove( node ); 224 wait_count--; 225 unlock( lock ); 226 return false; 227 } 228 229 if ( owner == active_thread() ) { 230 /* paranoid */ verifyf( recursion_count == 1 || multi_acquisition, "Thread %p attempted to unlock owner lock %p in waituntil unregister, which is not recursive but has a recursive count of %zu", active_thread(), &this, recursion_count ); 231 // if recursion count is zero release lock and set new owner if one is waiting 232 recursion_count--; 233 if ( recursion_count == 0 ) { 234 pop_node( this ); 235 } 236 } 237 unlock( lock ); 238 return false; 239 } 240 241 void on_selected( blocking_lock & this, select_node & node ) {} 178 242 179 243 //----------------------------------------------------------------------------- … … 312 376 int counter( condition_variable(L) & this ) with(this) { return count; } 313 377 314 static size_t queue_and_get_recursion( condition_variable(L) & this, info_thread(L) * i ) with(this) {378 static void enqueue_thread( condition_variable(L) & this, info_thread(L) * i ) with(this) { 315 379 // add info_thread to waiting queue 316 380 insert_last( blocked_threads, *i ); 317 381 count++; 318 size_t recursion_count = 0; 319 if (i->lock) { 320 // if lock was passed get recursion count to reset to after waking thread 321 recursion_count = on_wait( *i->lock ); 322 } 323 return recursion_count; 324 } 382 } 383 384 static size_t block_and_get_recursion( info_thread(L) & i, __cfa_pre_park pp_fn, void * pp_datum ) { 385 size_t recursion_count = 0; 386 if ( i.lock ) // if lock was passed get recursion count to reset to after waking thread 387 recursion_count = on_wait( *i.lock, pp_fn, pp_datum ); // this call blocks 388 else 389 pre_park_then_park( pp_fn, pp_datum ); 390 return recursion_count; 391 } 392 static size_t block_and_get_recursion( info_thread(L) & i ) { return block_and_get_recursion( i, pre_park_noop, 0p ); } 325 393 326 394 // helper for wait()'s' with no timeout 327 395 static void queue_info_thread( condition_variable(L) & this, info_thread(L) & i ) with(this) { 328 396 lock( lock __cfaabi_dbg_ctx2 ); 329 size_t recursion_count = queue_and_get_recursion(this, &i);397 enqueue_thread( this, &i ); 330 398 unlock( lock ); 331 399 332 400 // blocks here 333 park();401 size_t recursion_count = block_and_get_recursion( i ); 334 402 335 403 // resets recursion count here after waking 336 if ( i.lock) on_wakeup(*i.lock, recursion_count);404 if ( i.lock ) on_wakeup( *i.lock, recursion_count ); 337 405 } 338 406 … … 341 409 queue_info_thread( this, i ); 342 410 411 static void cond_alarm_register( void * node_ptr ) { register_self( (alarm_node_t *)node_ptr ); } 412 343 413 // helper for wait()'s' with a timeout 344 414 static void queue_info_thread_timeout( condition_variable(L) & this, info_thread(L) & info, Duration t, Alarm_Callback callback ) 
with(this) { 345 415 lock( lock __cfaabi_dbg_ctx2 ); 346 size_t recursion_count = queue_and_get_recursion(this, &info);416 enqueue_thread( this, &info ); 347 417 alarm_node_wrap(L) node_wrap = { t, 0`s, callback, &this, &info }; 348 418 unlock( lock ); 349 419 350 // registers alarm outside cond lock to avoid deadlock 351 register_self( &node_wrap.alarm_node ); 352 353 // blocks here 354 park(); 420 // blocks here and registers alarm node before blocking after releasing locks to avoid deadlock 421 size_t recursion_count = block_and_get_recursion( info, cond_alarm_register, (void *)(&node_wrap.alarm_node) ); 422 // park(); 355 423 356 424 // unregisters alarm so it doesn't go off if this happens first … … 358 426 359 427 // resets recursion count here after waking 360 if ( info.lock) on_wakeup(*info.lock, recursion_count);428 if ( info.lock ) on_wakeup( *info.lock, recursion_count ); 361 429 } 362 430 … … 418 486 info_thread( L ) i = { active_thread(), info, &l }; 419 487 insert_last( blocked_threads, i ); 420 size_t recursion_count = on_wait( *i.lock );421 park( );488 size_t recursion_count = on_wait( *i.lock, pre_park_noop, 0p ); // blocks here 489 // park( ); 422 490 on_wakeup(*i.lock, recursion_count); 423 491 } … … 460 528 bool empty ( pthread_cond_var(L) & this ) with(this) { return blocked_threads`isEmpty; } 461 529 462 static size_t queue_and_get_recursion( pthread_cond_var(L) & this, info_thread(L) * i ) with(this) {463 // add info_thread to waiting queue464 insert_last( blocked_threads, *i );465 size_t recursion_count = 0;466 recursion_count = on_wait( *i->lock );467 return recursion_count;468 }469 470 530 static void queue_info_thread_timeout( pthread_cond_var(L) & this, info_thread(L) & info, Duration t, Alarm_Callback callback ) with(this) { 471 531 lock( lock __cfaabi_dbg_ctx2 ); 472 size_t recursion_count = queue_and_get_recursion(this, &info);532 insert_last( blocked_threads, info ); 473 533 pthread_alarm_node_wrap(L) node_wrap = { t, 0`s, callback, &this, &info }; 474 534 unlock( lock ); 475 535 476 // registers alarm outside cond lock to avoid deadlock 477 register_self( &node_wrap.alarm_node ); 478 479 // blocks here 480 park(); 481 482 // unregisters alarm so it doesn't go off if this happens first 536 // blocks here and registers alarm node before blocking after releasing locks to avoid deadlock 537 size_t recursion_count = block_and_get_recursion( info, cond_alarm_register, (void *)(&node_wrap.alarm_node) ); 538 539 // unregisters alarm so it doesn't go off if signal happens first 483 540 unregister_self( &node_wrap.alarm_node ); 484 541 485 542 // resets recursion count here after waking 486 if ( info.lock) on_wakeup(*info.lock, recursion_count);543 if ( info.lock ) on_wakeup( *info.lock, recursion_count ); 487 544 } 488 545 … … 494 551 lock( lock __cfaabi_dbg_ctx2 ); 495 552 info_thread( L ) i = { active_thread(), info, &l }; 496 size_t recursion_count = queue_and_get_recursion(this, &i); 497 unlock( lock ); 498 park( ); 499 on_wakeup(*i.lock, recursion_count); 553 insert_last( blocked_threads, i ); 554 unlock( lock ); 555 556 // blocks here 557 size_t recursion_count = block_and_get_recursion( i ); 558 559 on_wakeup( *i.lock, recursion_count ); 500 560 } 501 561 … … 585 645 return thrd != 0p; 586 646 } 647 -
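The timed waits above now hand an alarm-registration hook into on_wait()/block_and_get_recursion(), so the alarm is armed only after every lock has been released and just before the thread parks. A plain-C sketch of that hook-before-block shape follows; the names are illustrative and park() is left as a comment because it is runtime-specific.

    #include <stdio.h>

    typedef void (*pre_block_fn)( void * );

    // stand-in for cond_alarm_register: arm a timeout that may later wake the waiter
    void arm_timeout( void * datum ) {
        printf( "arming timeout for %s\n", (const char *)datum );
    }

    // the caller has already enqueued itself and released the condition/owner locks
    void block_with_hook( pre_block_fn hook, void * datum ) {
        hook( datum );          // runs with no locks held, so the alarm handler cannot deadlock with the waiter
        /* park(); */           // block the user-level thread until signalled or timed out
    }

    int main( void ) {
        block_with_hook( arm_timeout, (void *)"waiter-0" );
        return 0;
    }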
libcfa/src/concurrency/locks.hfa
r34b4268 r24d6572 30 30 #include "time.hfa" 31 31 32 #include "select.hfa" 33 32 34 #include <fstream.hfa> 33 34 35 35 36 // futex headers … … 38 39 #include <unistd.h> 39 40 40 // undef to make a number of the locks not reacquire upon waking from a condlock 41 #define REACQ 1 41 typedef void (*__cfa_pre_park)( void * ); 42 43 static inline void pre_park_noop( void * ) {} 44 45 //----------------------------------------------------------------------------- 46 // is_blocking_lock 47 forall( L & | sized(L) ) 48 trait is_blocking_lock { 49 // For synchronization locks to use when acquiring 50 void on_notify( L &, struct thread$ * ); 51 52 // For synchronization locks to use when releasing 53 size_t on_wait( L &, __cfa_pre_park pp_fn, void * pp_datum ); 54 55 // to set recursion count after getting signalled; 56 void on_wakeup( L &, size_t recursion ); 57 }; 58 59 static inline void pre_park_then_park( __cfa_pre_park pp_fn, void * pp_datum ) { 60 pp_fn( pp_datum ); 61 park(); 62 } 63 64 // macros for default routine impls for is_blocking_lock trait that do not wait-morph 65 66 #define DEFAULT_ON_NOTIFY( lock_type ) \ 67 static inline void on_notify( lock_type & this, thread$ * t ){ unpark(t); } 68 69 #define DEFAULT_ON_WAIT( lock_type ) \ 70 static inline size_t on_wait( lock_type & this, __cfa_pre_park pp_fn, void * pp_datum ) { \ 71 unlock( this ); \ 72 pre_park_then_park( pp_fn, pp_datum ); \ 73 return 0; \ 74 } 75 76 // on_wakeup impl if lock should be reacquired after waking up 77 #define DEFAULT_ON_WAKEUP_REACQ( lock_type ) \ 78 static inline void on_wakeup( lock_type & this, size_t recursion ) { lock( this ); } 79 80 // on_wakeup impl if lock will not be reacquired after waking up 81 #define DEFAULT_ON_WAKEUP_NO_REACQ( lock_type ) \ 82 static inline void on_wakeup( lock_type & this, size_t recursion ) {} 83 84 42 85 43 86 //----------------------------------------------------------------------------- … … 66 109 static inline bool try_lock ( single_acquisition_lock & this ) { return try_lock( (blocking_lock &)this ); } 67 110 static inline void unlock ( single_acquisition_lock & this ) { unlock ( (blocking_lock &)this ); } 68 static inline size_t on_wait ( single_acquisition_lock & this ) { return on_wait ( (blocking_lock &)this); }111 static inline size_t on_wait ( single_acquisition_lock & this, __cfa_pre_park pp_fn, void * pp_datum ) { return on_wait ( (blocking_lock &)this, pp_fn, pp_datum ); } 69 112 static inline void on_wakeup( single_acquisition_lock & this, size_t v ) { on_wakeup ( (blocking_lock &)this, v ); } 70 113 static inline void on_notify( single_acquisition_lock & this, struct thread$ * t ) { on_notify( (blocking_lock &)this, t ); } 114 static inline bool register_select( single_acquisition_lock & this, select_node & node ) { return register_select( (blocking_lock &)this, node ); } 115 static inline bool unregister_select( single_acquisition_lock & this, select_node & node ) { return unregister_select( (blocking_lock &)this, node ); } 116 static inline void on_selected( single_acquisition_lock & this, select_node & node ) { on_selected( (blocking_lock &)this, node ); } 71 117 72 118 //---------- … … 80 126 static inline bool try_lock ( owner_lock & this ) { return try_lock( (blocking_lock &)this ); } 81 127 static inline void unlock ( owner_lock & this ) { unlock ( (blocking_lock &)this ); } 82 static inline size_t on_wait ( owner_lock & this ) { return on_wait ( (blocking_lock &)this); }128 static inline size_t on_wait ( owner_lock & this, __cfa_pre_park pp_fn, void * 
pp_datum ) { return on_wait ( (blocking_lock &)this, pp_fn, pp_datum ); } 83 129 static inline void on_wakeup( owner_lock & this, size_t v ) { on_wakeup ( (blocking_lock &)this, v ); } 84 130 static inline void on_notify( owner_lock & this, struct thread$ * t ) { on_notify( (blocking_lock &)this, t ); } 131 static inline bool register_select( owner_lock & this, select_node & node ) { return register_select( (blocking_lock &)this, node ); } 132 static inline bool unregister_select( owner_lock & this, select_node & node ) { return unregister_select( (blocking_lock &)this, node ); } 133 static inline void on_selected( owner_lock & this, select_node & node ) { on_selected( (blocking_lock &)this, node ); } 85 134 86 135 //----------------------------------------------------------------------------- … … 127 176 static inline void ?{}(mcs_spin_node & this) { this.next = 0p; this.locked = true; } 128 177 129 static inline mcs_spin_node * volatile & ?`next ( mcs_spin_node * node ) {130 return node->next;131 }132 133 178 struct mcs_spin_lock { 134 179 mcs_spin_queue queue; … … 136 181 137 182 static inline void lock(mcs_spin_lock & l, mcs_spin_node & n) { 183 n.locked = true; 138 184 mcs_spin_node * prev = __atomic_exchange_n(&l.queue.tail, &n, __ATOMIC_SEQ_CST); 139 n.locked = true; 140 if(prev == 0p) return; 185 if( prev == 0p ) return; 141 186 prev->next = &n; 142 while( __atomic_load_n(&n.locked, __ATOMIC_RELAXED)) Pause();187 while( __atomic_load_n(&n.locked, __ATOMIC_RELAXED) ) Pause(); 143 188 } 144 189 … … 146 191 mcs_spin_node * n_ptr = &n; 147 192 if (__atomic_compare_exchange_n(&l.queue.tail, &n_ptr, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) return; 148 while (__atomic_load_n(&n.next, __ATOMIC_RELAXED) == 0p) {}193 while (__atomic_load_n(&n.next, __ATOMIC_RELAXED) == 0p) Pause(); 149 194 n.next->locked = false; 150 195 } … … 153 198 // futex_mutex 154 199 155 // - No cond var support156 200 // - Kernel thd blocking alternative to the spinlock 157 201 // - No ownership (will deadlock on reacq) 202 // - no reacq on wakeup 158 203 struct futex_mutex { 159 204 // lock state any state other than UNLOCKED is locked … … 169 214 } 170 215 171 static inline void 172 173 static inline bool internal_try_lock( futex_mutex & this, int & compare_val) with(this) {216 static inline void ?{}( futex_mutex & this ) with(this) { val = 0; } 217 218 static inline bool internal_try_lock( futex_mutex & this, int & compare_val) with(this) { 174 219 return __atomic_compare_exchange_n((int*)&val, (int*)&compare_val, 1, false, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE); 175 220 } 176 221 177 static inline int internal_exchange( futex_mutex & this) with(this) {222 static inline int internal_exchange( futex_mutex & this ) with(this) { 178 223 return __atomic_exchange_n((int*)&val, 2, __ATOMIC_ACQUIRE); 179 224 } 180 225 181 226 // if this is called recursively IT WILL DEADLOCK!!!!! 
182 static inline void lock( futex_mutex & this) with(this) {227 static inline void lock( futex_mutex & this ) with(this) { 183 228 int state; 184 229 185 186 // // linear backoff omitted for now 187 // for( int spin = 4; spin < 1024; spin += spin) { 188 // state = 0; 189 // // if unlocked, lock and return 190 // if (internal_try_lock(this, state)) return; 191 // if (2 == state) break; 192 // for (int i = 0; i < spin; i++) Pause(); 193 // } 194 195 // no contention try to acquire 196 if (internal_try_lock(this, state)) return; 230 for( int spin = 4; spin < 1024; spin += spin) { 231 state = 0; 232 // if unlocked, lock and return 233 if (internal_try_lock(this, state)) return; 234 if (2 == state) break; 235 for (int i = 0; i < spin; i++) Pause(); 236 } 197 237 198 238 // if not in contended state, set to be in contended state … … 207 247 208 248 static inline void unlock(futex_mutex & this) with(this) { 209 // if uncontended do atomic eunlock and then return210 if (__atomic_fetch_sub(&val, 1, __ATOMIC_RELEASE) == 1) return; // TODO: try acq/rel 249 // if uncontended do atomic unlock and then return 250 if (__atomic_exchange_n(&val, 0, __ATOMIC_RELEASE) == 1) return; 211 251 212 252 // otherwise threads are blocked so we must wake one 213 __atomic_store_n((int *)&val, 0, __ATOMIC_RELEASE);214 253 futex((int *)&val, FUTEX_WAKE, 1); 215 254 } 216 255 217 static inline void on_notify( futex_mutex & f, thread$ * t){ unpark(t); } 218 static inline size_t on_wait( futex_mutex & f ) {unlock(f); return 0;} 219 220 // to set recursion count after getting signalled; 221 static inline void on_wakeup( futex_mutex & f, size_t recursion ) {} 222 223 //----------------------------------------------------------------------------- 224 // CLH Spinlock 225 // - No recursive acquisition 226 // - Needs to be released by owner 227 228 struct clh_lock { 229 volatile bool * volatile tail; 230 }; 231 232 static inline void ?{}( clh_lock & this ) { this.tail = malloc(); *this.tail = true; } 233 static inline void ^?{}( clh_lock & this ) { free(this.tail); } 234 235 static inline void lock(clh_lock & l) { 236 thread$ * curr_thd = active_thread(); 237 *(curr_thd->clh_node) = false; 238 volatile bool * prev = __atomic_exchange_n((bool **)(&l.tail), (bool *)(curr_thd->clh_node), __ATOMIC_SEQ_CST); 239 while(!__atomic_load_n(prev, __ATOMIC_ACQUIRE)) Pause(); 240 curr_thd->clh_prev = prev; 241 } 242 243 static inline void unlock(clh_lock & l) { 244 thread$ * curr_thd = active_thread(); 245 __atomic_store_n(curr_thd->clh_node, true, __ATOMIC_RELEASE); 246 curr_thd->clh_node = curr_thd->clh_prev; 247 } 248 249 static inline void on_notify(clh_lock & this, struct thread$ * t ) { unpark(t); } 250 static inline size_t on_wait(clh_lock & this) { unlock(this); return 0; } 251 static inline void on_wakeup(clh_lock & this, size_t recursion ) { 252 #ifdef REACQ 253 lock(this); 254 #endif 255 } 256 257 258 //----------------------------------------------------------------------------- 259 // Linear backoff Spinlock 260 struct linear_backoff_then_block_lock { 256 DEFAULT_ON_NOTIFY( futex_mutex ) 257 DEFAULT_ON_WAIT( futex_mutex ) 258 DEFAULT_ON_WAKEUP_NO_REACQ( futex_mutex ) 259 260 //----------------------------------------------------------------------------- 261 // go_mutex 262 263 // - Kernel thd blocking alternative to the spinlock 264 // - No ownership (will deadlock on reacq) 265 // - Golang's flavour of mutex 266 // - Impl taken from Golang: src/runtime/lock_futex.go 267 struct go_mutex { 268 // lock state any state other than 
UNLOCKED is locked 269 // enum LockState { UNLOCKED = 0, LOCKED = 1, SLEEPING = 2 }; 270 271 // stores a lock state 272 int val; 273 }; 274 static inline void ?{}( go_mutex & this ) with(this) { val = 0; } 275 // static inline void ?{}( go_mutex & this, go_mutex this2 ) = void; // these don't compile correctly at the moment so they should be omitted 276 // static inline void ?=?( go_mutex & this, go_mutex this2 ) = void; 277 278 static inline bool internal_try_lock(go_mutex & this, int & compare_val, int new_val ) with(this) { 279 return __atomic_compare_exchange_n((int*)&val, (int*)&compare_val, new_val, false, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE); 280 } 281 282 static inline int internal_exchange(go_mutex & this, int swap ) with(this) { 283 return __atomic_exchange_n((int*)&val, swap, __ATOMIC_ACQUIRE); 284 } 285 286 // if this is called recursively IT WILL DEADLOCK!!!!! 287 static inline void lock( go_mutex & this ) with( this ) { 288 int state, init_state; 289 290 // speculative grab 291 state = internal_exchange(this, 1); 292 if ( !state ) return; // state == 0 293 init_state = state; 294 for (;;) { 295 for( int i = 0; i < 4; i++ ) { 296 while( !val ) { // lock unlocked 297 state = 0; 298 if ( internal_try_lock( this, state, init_state ) ) return; 299 } 300 for (int i = 0; i < 30; i++) Pause(); 301 } 302 303 while( !val ) { // lock unlocked 304 state = 0; 305 if ( internal_try_lock( this, state, init_state ) ) return; 306 } 307 sched_yield(); 308 309 // if not in contended state, set to be in contended state 310 state = internal_exchange( this, 2 ); 311 if ( !state ) return; // state == 0 312 init_state = 2; 313 futex( (int*)&val, FUTEX_WAIT, 2 ); // if val is not 2 this returns with EWOULDBLOCK 314 } 315 } 316 317 static inline void unlock( go_mutex & this ) with(this) { 318 // if uncontended do atomic unlock and then return 319 if ( __atomic_exchange_n(&val, 0, __ATOMIC_RELEASE) == 1 ) return; 320 321 // otherwise threads are blocked so we must wake one 322 futex( (int *)&val, FUTEX_WAKE, 1 ); 323 } 324 325 DEFAULT_ON_NOTIFY( go_mutex ) 326 DEFAULT_ON_WAIT( go_mutex ) 327 DEFAULT_ON_WAKEUP_NO_REACQ( go_mutex ) 328 329 //----------------------------------------------------------------------------- 330 // Exponential backoff then block lock 331 struct exp_backoff_then_block_lock { 261 332 // Spin lock used for mutual exclusion 262 333 __spinlock_t spinlock; … … 269 340 }; 270 341 271 static inline void ?{}( linear_backoff_then_block_lock & this ) {342 static inline void ?{}( exp_backoff_then_block_lock & this ) { 272 343 this.spinlock{}; 273 344 this.blocked_threads{}; 274 345 this.lock_value = 0; 275 346 } 276 static inline void ^?{}( linear_backoff_then_block_lock & this ) {} 277 // static inline void ?{}( linear_backoff_then_block_lock & this, linear_backoff_then_block_lock this2 ) = void; 278 // static inline void ?=?( linear_backoff_then_block_lock & this, linear_backoff_then_block_lock this2 ) = void; 279 280 static inline bool internal_try_lock(linear_backoff_then_block_lock & this, size_t & compare_val) with(this) { 281 if (__atomic_compare_exchange_n(&lock_value, &compare_val, 1, false, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) { 282 return true; 283 } 284 return false; 285 } 286 287 static inline bool try_lock(linear_backoff_then_block_lock & this) { size_t compare_val = 0; return internal_try_lock(this, compare_val); } 288 289 static inline bool try_lock_contention(linear_backoff_then_block_lock & this) with(this) { 290 if (__atomic_exchange_n(&lock_value, 2, __ATOMIC_ACQUIRE) == 
0) { 291 return true; 292 } 293 return false; 294 } 295 296 static inline bool block(linear_backoff_then_block_lock & this) with(this) { 297 lock( spinlock __cfaabi_dbg_ctx2 ); // TODO change to lockfree queue (MPSC) 298 if (lock_value != 2) { 299 unlock( spinlock ); 300 return true; 301 } 302 insert_last( blocked_threads, *active_thread() ); 303 unlock( spinlock ); 347 static inline void ?{}( exp_backoff_then_block_lock & this, exp_backoff_then_block_lock this2 ) = void; 348 static inline void ?=?( exp_backoff_then_block_lock & this, exp_backoff_then_block_lock this2 ) = void; 349 350 static inline void ^?{}( exp_backoff_then_block_lock & this ){} 351 352 static inline bool internal_try_lock( exp_backoff_then_block_lock & this, size_t & compare_val ) with(this) { 353 return __atomic_compare_exchange_n(&lock_value, &compare_val, 1, false, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED); 354 } 355 356 static inline bool try_lock( exp_backoff_then_block_lock & this ) { size_t compare_val = 0; return internal_try_lock( this, compare_val ); } 357 358 static inline bool try_lock_contention( exp_backoff_then_block_lock & this ) with(this) { 359 return !__atomic_exchange_n( &lock_value, 2, __ATOMIC_ACQUIRE ); 360 } 361 362 static inline bool block( exp_backoff_then_block_lock & this ) with(this) { 363 lock( spinlock __cfaabi_dbg_ctx2 ); 364 if (__atomic_load_n( &lock_value, __ATOMIC_SEQ_CST) != 2) { 365 unlock( spinlock ); 366 return true; 367 } 368 insert_last( blocked_threads, *active_thread() ); 369 unlock( spinlock ); 304 370 park( ); 305 371 return true; 306 372 } 307 373 308 static inline void lock( linear_backoff_then_block_lock & this) with(this) {374 static inline void lock( exp_backoff_then_block_lock & this ) with(this) { 309 375 size_t compare_val = 0; 310 376 int spin = 4; 377 311 378 // linear backoff 312 379 for( ;; ) { … … 324 391 } 325 392 326 static inline void unlock( linear_backoff_then_block_lock & this) with(this) {393 static inline void unlock( exp_backoff_then_block_lock & this ) with(this) { 327 394 if (__atomic_exchange_n(&lock_value, 0, __ATOMIC_RELEASE) == 1) return; 328 lock( spinlock __cfaabi_dbg_ctx2 ); 329 thread$ * t = &try_pop_front( blocked_threads ); 330 unlock( spinlock ); 331 unpark( t ); 332 } 333 334 static inline void on_notify(linear_backoff_then_block_lock & this, struct thread$ * t ) { unpark(t); } 335 static inline size_t on_wait(linear_backoff_then_block_lock & this) { unlock(this); return 0; } 336 static inline void on_wakeup(linear_backoff_then_block_lock & this, size_t recursion ) { 337 #ifdef REACQ 338 lock(this); 339 #endif 340 } 395 lock( spinlock __cfaabi_dbg_ctx2 ); 396 thread$ * t = &try_pop_front( blocked_threads ); 397 unlock( spinlock ); 398 unpark( t ); 399 } 400 401 DEFAULT_ON_NOTIFY( exp_backoff_then_block_lock ) 402 DEFAULT_ON_WAIT( exp_backoff_then_block_lock ) 403 DEFAULT_ON_WAKEUP_REACQ( exp_backoff_then_block_lock ) 341 404 342 405 //----------------------------------------------------------------------------- … … 368 431 369 432 // if this is called recursively IT WILL DEADLOCK!!!!! 
370 static inline void lock( fast_block_lock & this) with(this) {433 static inline void lock( fast_block_lock & this ) with(this) { 371 434 lock( lock __cfaabi_dbg_ctx2 ); 372 435 if ( held ) { … … 380 443 } 381 444 382 static inline void unlock( fast_block_lock & this) with(this) {445 static inline void unlock( fast_block_lock & this ) with(this) { 383 446 lock( lock __cfaabi_dbg_ctx2 ); 384 447 /* paranoid */ verifyf( held != false, "Attempt to release lock %p that isn't held", &this ); … … 389 452 } 390 453 391 static inline void on_notify(fast_block_lock & this, struct thread$ * t ) with(this) { 392 #ifdef REACQ 393 lock( lock __cfaabi_dbg_ctx2 ); 394 insert_last( blocked_threads, *t ); 395 unlock( lock ); 396 #else 397 unpark(t); 398 #endif 399 } 400 static inline size_t on_wait(fast_block_lock & this) { unlock(this); return 0; } 401 static inline void on_wakeup(fast_block_lock & this, size_t recursion ) { } 454 static inline void on_notify( fast_block_lock & this, struct thread$ * t ) with(this) { 455 lock( lock __cfaabi_dbg_ctx2 ); 456 insert_last( blocked_threads, *t ); 457 unlock( lock ); 458 } 459 DEFAULT_ON_WAIT( fast_block_lock ) 460 DEFAULT_ON_WAKEUP_NO_REACQ( fast_block_lock ) 402 461 403 462 //----------------------------------------------------------------------------- … … 410 469 struct simple_owner_lock { 411 470 // List of blocked threads 412 dlist( thread$) blocked_threads;471 dlist( select_node ) blocked_threads; 413 472 414 473 // Spin lock used for mutual exclusion … … 431 490 static inline void ?=?( simple_owner_lock & this, simple_owner_lock this2 ) = void; 432 491 433 static inline void lock( simple_owner_lock & this) with(this) {434 if ( owner == active_thread()) {492 static inline void lock( simple_owner_lock & this ) with(this) { 493 if ( owner == active_thread() ) { 435 494 recursion_count++; 436 495 return; … … 438 497 lock( lock __cfaabi_dbg_ctx2 ); 439 498 440 if (owner != 0p) { 441 insert_last( blocked_threads, *active_thread() ); 499 if ( owner != 0p ) { 500 select_node node; 501 insert_last( blocked_threads, node ); 442 502 unlock( lock ); 443 503 park( ); … … 449 509 } 450 510 451 // TODO: fix duplicate def issue and bring this back 452 // void pop_and_set_new_owner( simple_owner_lock & this ) with( this ) { 453 // thread$ * t = &try_pop_front( blocked_threads ); 454 // owner = t; 455 // recursion_count = ( t ? 1 : 0 ); 456 // unpark( t ); 457 // } 458 459 static inline void unlock(simple_owner_lock & this) with(this) { 511 static inline void pop_node( simple_owner_lock & this ) with(this) { 512 __handle_waituntil_OR( blocked_threads ); 513 select_node * node = &try_pop_front( blocked_threads ); 514 if ( node ) { 515 owner = node->blocked_thread; 516 recursion_count = 1; 517 // if ( !node->clause_status || __make_select_node_available( *node ) ) unpark( node->blocked_thread ); 518 wake_one( blocked_threads, *node ); 519 } else { 520 owner = 0p; 521 recursion_count = 0; 522 } 523 } 524 525 static inline void unlock( simple_owner_lock & this ) with(this) { 460 526 lock( lock __cfaabi_dbg_ctx2 ); 461 527 /* paranoid */ verifyf( owner != 0p, "Attempt to release lock %p that isn't held", &this ); … … 464 530 recursion_count--; 465 531 if ( recursion_count == 0 ) { 466 // pop_and_set_new_owner( this ); 467 thread$ * t = &try_pop_front( blocked_threads ); 468 owner = t; 469 recursion_count = ( t ? 
1 : 0 ); 470 unpark( t ); 532 pop_node( this ); 471 533 } 472 534 unlock( lock ); 473 535 } 474 536 475 static inline void on_notify( simple_owner_lock & this, structthread$ * t ) with(this) {537 static inline void on_notify( simple_owner_lock & this, thread$ * t ) with(this) { 476 538 lock( lock __cfaabi_dbg_ctx2 ); 477 539 // lock held 478 540 if ( owner != 0p ) { 479 insert_last( blocked_threads, * t);541 insert_last( blocked_threads, *(select_node *)t->link_node ); 480 542 } 481 543 // lock not held … … 488 550 } 489 551 490 static inline size_t on_wait( simple_owner_lock & this) with(this) {552 static inline size_t on_wait( simple_owner_lock & this, __cfa_pre_park pp_fn, void * pp_datum ) with(this) { 491 553 lock( lock __cfaabi_dbg_ctx2 ); 492 554 /* paranoid */ verifyf( owner != 0p, "Attempt to release lock %p that isn't held", &this ); … … 495 557 size_t ret = recursion_count; 496 558 497 // pop_and_set_new_owner( this ); 498 499 thread$ * t = &try_pop_front( blocked_threads ); 500 owner = t; 501 recursion_count = ( t ? 1 : 0 ); 502 unpark( t ); 503 559 pop_node( this ); 560 561 select_node node; 562 active_thread()->link_node = (void *)&node; 504 563 unlock( lock ); 564 565 pre_park_then_park( pp_fn, pp_datum ); 566 505 567 return ret; 506 568 } 507 569 508 static inline void on_wakeup(simple_owner_lock & this, size_t recursion ) with(this) { recursion_count = recursion; } 570 static inline void on_wakeup( simple_owner_lock & this, size_t recursion ) with(this) { recursion_count = recursion; } 571 572 // waituntil() support 573 static inline bool register_select( simple_owner_lock & this, select_node & node ) with(this) { 574 lock( lock __cfaabi_dbg_ctx2 ); 575 576 // check if we can complete operation. If so race to establish winner in special OR case 577 if ( !node.park_counter && ( owner == active_thread() || owner == 0p ) ) { 578 if ( !__make_select_node_available( node ) ) { // we didn't win the race so give up on registering 579 unlock( lock ); 580 return false; 581 } 582 } 583 584 if ( owner == active_thread() ) { 585 recursion_count++; 586 if ( node.park_counter ) __make_select_node_available( node ); 587 unlock( lock ); 588 return true; 589 } 590 591 if ( owner != 0p ) { 592 insert_last( blocked_threads, node ); 593 unlock( lock ); 594 return false; 595 } 596 597 owner = active_thread(); 598 recursion_count = 1; 599 600 if ( node.park_counter ) __make_select_node_available( node ); 601 unlock( lock ); 602 return true; 603 } 604 605 static inline bool unregister_select( simple_owner_lock & this, select_node & node ) with(this) { 606 lock( lock __cfaabi_dbg_ctx2 ); 607 if ( node`isListed ) { 608 remove( node ); 609 unlock( lock ); 610 return false; 611 } 612 613 if ( owner == active_thread() ) { 614 recursion_count--; 615 if ( recursion_count == 0 ) { 616 pop_node( this ); 617 } 618 } 619 unlock( lock ); 620 return false; 621 } 622 623 static inline void on_selected( simple_owner_lock & this, select_node & node ) {} 624 509 625 510 626 //----------------------------------------------------------------------------- … … 521 637 // flag showing if lock is held 522 638 volatile bool held; 523 524 #ifdef __CFA_DEBUG__525 // for deadlock detection526 struct thread$ * owner;527 #endif528 639 }; 529 640 … … 536 647 static inline void ?=?( spin_queue_lock & this, spin_queue_lock this2 ) = void; 537 648 538 // if this is called recursively IT WILL DEADLOCK! !!!!539 static inline void lock( spin_queue_lock & this) with(this) {649 // if this is called recursively IT WILL DEADLOCK! 
650 static inline void lock( spin_queue_lock & this ) with(this) { 540 651 mcs_spin_node node; 541 652 lock( lock, node ); … … 545 656 } 546 657 547 static inline void unlock( spin_queue_lock & this) with(this) {658 static inline void unlock( spin_queue_lock & this ) with(this) { 548 659 __atomic_store_n(&held, false, __ATOMIC_RELEASE); 549 660 } 550 661 551 static inline void on_notify(spin_queue_lock & this, struct thread$ * t ) { 552 unpark(t); 553 } 554 static inline size_t on_wait(spin_queue_lock & this) { unlock(this); return 0; } 555 static inline void on_wakeup(spin_queue_lock & this, size_t recursion ) { 556 #ifdef REACQ 557 lock(this); 558 #endif 559 } 560 662 DEFAULT_ON_NOTIFY( spin_queue_lock ) 663 DEFAULT_ON_WAIT( spin_queue_lock ) 664 DEFAULT_ON_WAKEUP_REACQ( spin_queue_lock ) 561 665 562 666 //----------------------------------------------------------------------------- … … 584 688 585 689 // if this is called recursively IT WILL DEADLOCK!!!!! 586 static inline void lock( mcs_block_spin_lock & this) with(this) {690 static inline void lock( mcs_block_spin_lock & this ) with(this) { 587 691 mcs_node node; 588 692 lock( lock, node ); … … 596 700 } 597 701 598 static inline void on_notify(mcs_block_spin_lock & this, struct thread$ * t ) { unpark(t); } 599 static inline size_t on_wait(mcs_block_spin_lock & this) { unlock(this); return 0; } 600 static inline void on_wakeup(mcs_block_spin_lock & this, size_t recursion ) { 601 #ifdef REACQ 602 lock(this); 603 #endif 604 } 702 DEFAULT_ON_NOTIFY( mcs_block_spin_lock ) 703 DEFAULT_ON_WAIT( mcs_block_spin_lock ) 704 DEFAULT_ON_WAKEUP_REACQ( mcs_block_spin_lock ) 605 705 606 706 //----------------------------------------------------------------------------- … … 628 728 629 729 // if this is called recursively IT WILL DEADLOCK!!!!! 
630 static inline void lock( block_spin_lock & this) with(this) {730 static inline void lock( block_spin_lock & this ) with(this) { 631 731 lock( lock ); 632 732 while(__atomic_load_n(&held, __ATOMIC_SEQ_CST)) Pause(); … … 635 735 } 636 736 637 static inline void unlock( block_spin_lock & this) with(this) {737 static inline void unlock( block_spin_lock & this ) with(this) { 638 738 __atomic_store_n(&held, false, __ATOMIC_RELEASE); 639 739 } 640 740 641 static inline void on_notify(block_spin_lock & this, struct thread$ * t ) with(this.lock) { 642 #ifdef REACQ 741 static inline void on_notify( block_spin_lock & this, struct thread$ * t ) with(this.lock) { 643 742 // first we acquire internal fast_block_lock 644 743 lock( lock __cfaabi_dbg_ctx2 ); … … 652 751 unlock( lock ); 653 752 654 #endif655 753 unpark(t); 656 657 } 658 static inline size_t on_wait(block_spin_lock & this) { unlock(this); return 0; } 659 static inline void on_wakeup(block_spin_lock & this, size_t recursion ) with(this) { 660 #ifdef REACQ 754 } 755 DEFAULT_ON_WAIT( block_spin_lock ) 756 static inline void on_wakeup( block_spin_lock & this, size_t recursion ) with(this) { 661 757 // now we acquire the entire block_spin_lock upon waking up 662 758 while(__atomic_load_n(&held, __ATOMIC_SEQ_CST)) Pause(); 663 759 __atomic_store_n(&held, true, __ATOMIC_RELEASE); 664 760 unlock( lock ); // Now we release the internal fast_spin_lock 665 #endif 666 } 667 668 //----------------------------------------------------------------------------- 669 // is_blocking_lock 670 trait is_blocking_lock(L & | sized(L)) { 671 // For synchronization locks to use when acquiring 672 void on_notify( L &, struct thread$ * ); 673 674 // For synchronization locks to use when releasing 675 size_t on_wait( L & ); 676 677 // to set recursion count after getting signalled; 678 void on_wakeup( L &, size_t recursion ); 679 }; 761 } 680 762 681 763 //----------------------------------------------------------------------------- … … 685 767 forall(L & | is_blocking_lock(L)) { 686 768 struct info_thread; 687 688 // // for use by sequence689 // info_thread(L) *& Back( info_thread(L) * this );690 // info_thread(L) *& Next( info_thread(L) * this );691 769 } 692 770 -
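The simple_owner_lock changes above replace the thread$ wait list with select_node entries and add register_select/unregister_select so the lock can take part in a waituntil, but the basic lock()/unlock() interface is unchanged. A rough usage sketch follows; the Worker thread, the counter, and the header paths are illustrative assumptions, not part of this changeset.

#include <locks.hfa>               // assumed install name for the locks shown above
#include <thread.hfa>

simple_owner_lock l;               // owner lock: recursive acquire by the owner is allowed
int counter = 0;

thread Worker {};
void main( Worker & ) {
    for ( int i = 0; i < 1000; i += 1 ) {
        lock( l );                 // blocks on the select_node list if another thread owns l
        counter += 1;
        unlock( l );               // pop_node hands the lock to the next waiter, if any
    }
}

int main() {
    {
        Worker w[4];               // CFA threads start at declaration ...
    }                              // ... and join at block exit, so counter == 4000 afterwards
}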
libcfa/src/concurrency/monitor.cfa
r34b4268 r24d6572 10 10 // Created On : Thd Feb 23 12:27:26 2017 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Wed Dec 4 07:55:14 201913 // Update Count : 1 012 // Last Modified On : Sun Feb 19 17:00:59 2023 13 // Update Count : 12 14 14 // 15 15 16 16 #define __cforall_thread__ 17 #define _GNU_SOURCE18 17 19 18 #include "monitor.hfa" -
libcfa/src/concurrency/monitor.hfa
r34b4268 r24d6572 10 10 // Created On : Thd Feb 23 12:27:26 2017 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Wed Dec 4 07:55:32 201913 // Update Count : 1 112 // Last Modified On : Thu Feb 2 11:29:21 2023 13 // Update Count : 12 14 14 // 15 15 … … 22 22 #include "stdlib.hfa" 23 23 24 trait is_monitor(T &) { 24 forall( T & ) 25 trait is_monitor { 25 26 monitor$ * get_monitor( T & ); 26 27 void ^?{}( T & mutex ); -
libcfa/src/concurrency/mutex.cfa
r34b4268 r24d6572 12 12 // Created On : Fri May 25 01:37:11 2018 13 13 // Last Modified By : Peter A. Buhr 14 // Last Modified On : Wed Dec 4 09:16:39 201915 // Update Count : 114 // Last Modified On : Sun Feb 19 17:01:36 2023 15 // Update Count : 3 16 16 // 17 17 18 18 #define __cforall_thread__ 19 #define _GNU_SOURCE20 19 21 20 #include "mutex.hfa" -
libcfa/src/concurrency/mutex.hfa
r34b4268 r24d6572 12 12 // Created On : Fri May 25 01:24:09 2018 13 13 // Last Modified By : Peter A. Buhr 14 // Last Modified On : Wed Dec 4 09:16:53 201915 // Update Count : 114 // Last Modified On : Thu Feb 2 11:46:08 2023 15 // Update Count : 2 16 16 // 17 17 … … 70 70 void unlock(recursive_mutex_lock & this) __attribute__((deprecated("use concurrency/locks.hfa instead"))); 71 71 72 trait is_lock(L & | sized(L)) { 72 forall( L & | sized(L) ) 73 trait is_lock { 73 74 void lock (L &); 74 75 void unlock(L &); -
libcfa/src/concurrency/mutex_stmt.hfa
r34b4268 r24d6572 1 #pragma once 2 1 3 #include "bits/algorithm.hfa" 2 4 #include "bits/defs.hfa" … … 4 6 //----------------------------------------------------------------------------- 5 7 // is_lock 6 trait is_lock(L & | sized(L)) { 8 forall(L & | sized(L)) 9 trait is_lock { 7 10 // For acquiring a lock 8 11 void lock( L & ); … … 11 14 void unlock( L & ); 12 15 }; 13 14 16 15 17 struct __mutex_stmt_lock_guard { … … 24 26 // Sort locks based on address 25 27 __libcfa_small_sort(this.lockarr, count); 26 27 // acquire locks in order28 // for ( size_t i = 0; i < count; i++ ) {29 // lock(*this.lockarr[i]);30 // }31 }32 33 static inline void ^?{}( __mutex_stmt_lock_guard & this ) with(this) {34 // for ( size_t i = count; i > 0; i-- ) {35 // unlock(*lockarr[i - 1]);36 // }37 28 } 38 29 39 30 forall(L & | is_lock(L)) { 40 41 struct scoped_lock { 42 L * internal_lock; 43 }; 44 45 static inline void ?{}( scoped_lock(L) & this, L & internal_lock ) { 46 this.internal_lock = &internal_lock; 47 lock(internal_lock); 48 } 49 50 static inline void ^?{}( scoped_lock(L) & this ) with(this) { 51 unlock(*internal_lock); 52 } 53 54 static inline void * __get_mutexstmt_lock_ptr( L & this ) { 55 return &this; 56 } 57 58 static inline L __get_mutexstmt_lock_type( L & this ); 59 60 static inline L __get_mutexstmt_lock_type( L * this ); 31 static inline void * __get_mutexstmt_lock_ptr( L & this ) { return &this; } 32 static inline L __get_mutexstmt_lock_type( L & this ) {} 33 static inline L __get_mutexstmt_lock_type( L * this ) {} 61 34 } -
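For context on the guard above: __mutex_stmt_lock_guard sorts the requested locks by address before acquiring them, which is what lets the CFA mutex statement take several locks without the programmer imposing a fixed ordering. A small sketch of that use, assuming the mutex-statement syntax and simple_owner_lock from this library; the deposit/withdraw example itself is made up.

#include <locks.hfa>
#include <mutex_stmt.hfa>          // assumed header names

simple_owner_lock a, b;            // any type satisfying is_lock works
int balance = 0;

void deposit()  { mutex( a, b ) balance += 1; }
void withdraw() { mutex( b, a ) balance -= 1; }   // opposite textual order, same acquisition
                                                  // order after the address sort, so no deadlock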
libcfa/src/concurrency/preemption.cfa
r34b4268 r24d6572 10 10 // Created On : Mon Jun 5 14:20:42 2017 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Thu Feb 17 11:18:57 202213 // Update Count : 5912 // Last Modified On : Mon Jan 9 08:42:59 2023 13 // Update Count : 60 14 14 // 15 15 16 16 #define __cforall_thread__ 17 #define _GNU_SOURCE18 17 19 18 // #define __CFA_DEBUG_PRINT_PREEMPTION__ … … 118 117 __cfadbg_print_buffer_decl( preemption, " KERNEL: preemption tick %lu\n", currtime.tn); 119 118 Duration period = node->period; 120 if( period == 0 ) {119 if( period == 0 ) { 121 120 node->set = false; // Node is one-shot, just mark it as not pending 122 121 } -
libcfa/src/concurrency/pthread.cfa
r34b4268 r24d6572 15 15 16 16 #define __cforall_thread__ 17 #define _GNU_SOURCE18 17 19 18 #include <signal.h> … … 35 34 struct pthread_values{ 36 35 inline Seqable; 37 void * value;36 void * value; 38 37 bool in_use; 39 38 }; … … 51 50 struct pthread_keys { 52 51 bool in_use; 53 void (* destructor)( void * );52 void (* destructor)( void * ); 54 53 Sequence(pthread_values) threads; 55 54 }; 56 55 57 static void ?{}(pthread_keys& k) {56 static void ?{}(pthread_keys& k) { 58 57 k.threads{}; 59 58 } … … 62 61 static pthread_keys cfa_pthread_keys_storage[PTHREAD_KEYS_MAX] __attribute__((aligned (16))); 63 62 64 static void init_pthread_storage() {65 for ( int i = 0; i < PTHREAD_KEYS_MAX; i++){63 static void init_pthread_storage() { 64 for ( int i = 0; i < PTHREAD_KEYS_MAX; i++ ) { 66 65 cfa_pthread_keys_storage[i]{}; 67 66 } … … 96 95 97 96 /* condvar helper routines */ 98 static void init(pthread_cond_t * pcond){97 static void init(pthread_cond_t * pcond) { 99 98 static_assert(sizeof(pthread_cond_t) >= sizeof(cfa2pthr_cond_var_t),"sizeof(pthread_t) < sizeof(cfa2pthr_cond_var_t)"); 100 cfa2pthr_cond_var_t * _cond = (cfa2pthr_cond_var_t*)pcond;99 cfa2pthr_cond_var_t * _cond = (cfa2pthr_cond_var_t *)pcond; 101 100 ?{}(*_cond); 102 101 } 103 102 104 static cfa2pthr_cond_var_t * get(pthread_cond_t* pcond){103 static cfa2pthr_cond_var_t * get(pthread_cond_t * pcond) { 105 104 static_assert(sizeof(pthread_cond_t) >= sizeof(cfa2pthr_cond_var_t),"sizeof(pthread_t) < sizeof(cfa2pthr_cond_var_t)"); 106 return (cfa2pthr_cond_var_t *)pcond;107 } 108 109 static void destroy(pthread_cond_t * cond){105 return (cfa2pthr_cond_var_t *)pcond; 106 } 107 108 static void destroy(pthread_cond_t * cond) { 110 109 static_assert(sizeof(pthread_cond_t) >= sizeof(cfa2pthr_cond_var_t),"sizeof(pthread_t) < sizeof(cfa2pthr_cond_var_t)"); 111 110 ^?{}(*get(cond)); … … 116 115 117 116 /* mutex helper routines */ 118 static void mutex_check(pthread_mutex_t * t){117 static void mutex_check(pthread_mutex_t * t) { 119 118 // Use double check to improve performance. 
120 119 // Check is safe on x86; volatile prevents compiler reordering 121 volatile pthread_mutex_t * const mutex_ = t;120 volatile pthread_mutex_t * const mutex_ = t; 122 121 123 122 // SKULLDUGGERY: not a portable way to access the kind field, /usr/include/x86_64-linux-gnu/bits/pthreadtypes.h … … 136 135 137 136 138 static void init(pthread_mutex_t * plock){137 static void init(pthread_mutex_t * plock) { 139 138 static_assert(sizeof(pthread_mutex_t) >= sizeof(simple_owner_lock),"sizeof(pthread_mutex_t) < sizeof(simple_owner_lock)"); 140 simple_owner_lock * _lock = (simple_owner_lock*)plock;139 simple_owner_lock * _lock = (simple_owner_lock *)plock; 141 140 ?{}(*_lock); 142 141 } 143 142 144 static simple_owner_lock * get(pthread_mutex_t* plock){143 static simple_owner_lock * get(pthread_mutex_t * plock) { 145 144 static_assert(sizeof(pthread_mutex_t) >= sizeof(simple_owner_lock),"sizeof(pthread_mutex_t) < sizeof(simple_owner_lock)"); 146 return (simple_owner_lock *)plock;147 } 148 149 static void destroy(pthread_mutex_t * plock){145 return (simple_owner_lock *)plock; 146 } 147 148 static void destroy(pthread_mutex_t * plock) { 150 149 static_assert(sizeof(pthread_mutex_t) >= sizeof(simple_owner_lock),"sizeof(pthread_mutex_t) < sizeof(simple_owner_lock)"); 151 150 ^?{}(*get(plock)); … … 153 152 154 153 //######################### Attr helpers ######################### 155 struct cfaPthread_attr_t {// thread attributes154 typedef struct cfaPthread_attr_t { // thread attributes 156 155 int contentionscope; 157 156 int detachstate; 158 157 size_t stacksize; 159 void * stackaddr;158 void * stackaddr; 160 159 int policy; 161 160 int inheritsched; 162 161 struct sched_param param; 163 } typedefcfaPthread_attr_t;164 165 static const cfaPthread_attr_t default_attrs {162 } cfaPthread_attr_t; 163 164 static const cfaPthread_attr_t default_attrs { 166 165 0, 167 166 0, 168 (size_t)65000,169 (void *)NULL,167 65_000, 168 NULL, 170 169 0, 171 170 0, … … 173 172 }; 174 173 175 static cfaPthread_attr_t * get(const pthread_attr_t* attr){176 static_assert(sizeof(pthread_attr_t) >= sizeof(cfaPthread_attr_t), "sizeof(pthread_attr_t) < sizeof(cfaPthread_attr_t)");177 return (cfaPthread_attr_t *)attr;174 static cfaPthread_attr_t * get(const pthread_attr_t * attr) { 175 static_assert(sizeof(pthread_attr_t) >= sizeof(cfaPthread_attr_t), "sizeof(pthread_attr_t) < sizeof(cfaPthread_attr_t)"); 176 return (cfaPthread_attr_t *)attr; 178 177 } 179 178 … … 190 189 191 190 // pthreads return value 192 void * joinval;191 void * joinval; 193 192 194 193 // pthread attributes 195 194 pthread_attr_t pthread_attr; 196 195 197 void *(* start_routine)(void *);198 void * start_arg;196 void *(* start_routine)(void *); 197 void * start_arg; 199 198 200 199 // thread local data 201 pthread_values * pthreadData;200 pthread_values * pthreadData; 202 201 203 202 // flag used for tryjoin … … 207 206 /* thread part routines */ 208 207 // cfaPthread entry point 209 void main(cfaPthread & _thread) with(_thread){210 joinval = 208 void main(cfaPthread & _thread) with(_thread) { 209 joinval = start_routine(start_arg); 211 210 isTerminated = true; 212 211 } 213 212 214 static cfaPthread * lookup( pthread_t p ){215 static_assert(sizeof(pthread_t) >= sizeof(cfaPthread *),"sizeof(pthread_t) < sizeof(cfaPthread*)");216 return (cfaPthread *)p;217 } 218 219 static void pthread_deletespecific_( pthread_values * values ) { // see uMachContext::invokeTask220 pthread_values * value;221 pthread_keys * key;213 static cfaPthread * lookup( pthread_t p ) { 
214 static_assert(sizeof(pthread_t) >= sizeof(cfaPthread *),"sizeof(pthread_t) < sizeof(cfaPthread *)"); 215 return (cfaPthread *)p; 216 } 217 218 static void pthread_deletespecific_( pthread_values * values ) { // see uMachContext::invokeTask 219 pthread_values * value; 220 pthread_keys * key; 222 221 bool destcalled = true; 223 if (values != NULL) {222 if (values != NULL) { 224 223 for ( int attempts = 0; attempts < PTHREAD_DESTRUCTOR_ITERATIONS && destcalled ; attempts += 1 ) { 225 224 destcalled = false; 226 225 lock(key_lock); 227 for ( int i = 0; i < PTHREAD_KEYS_MAX; i++){226 for ( int i = 0; i < PTHREAD_KEYS_MAX; i++ ) { 228 227 // for each valid key 229 if ( values[i].in_use) {228 if ( values[i].in_use) { 230 229 value = &values[i]; 231 230 key = &cfa_pthread_keys[i]; … … 234 233 // if a key value has a non-NULL destructor pointer, and the thread has a non-NULL value associated with that key, 235 234 // the value of the key is set to NULL, and then the function pointed to is called with the previously associated value as its sole argument. 236 if (value->value != NULL && key->destructor != NULL) {235 if (value->value != NULL && key->destructor != NULL) { 237 236 unlock(key_lock); 238 237 key->destructor(value->value); // run destructor … … 249 248 } 250 249 251 static void ^?{}(cfaPthread & mutex t) {250 static void ^?{}(cfaPthread & mutex t) { 252 251 // delete pthread local storage 253 252 pthread_values * values = t.pthreadData; … … 255 254 } 256 255 257 static void ?{}(cfaPthread & t, pthread_t* _thread, const pthread_attr_t * _attr,void *(*start_routine)(void *), void * arg) {258 static_assert(sizeof(pthread_t) >= sizeof(cfaPthread *), "pthread_t too small to hold a pointer: sizeof(pthread_t) < sizeof(cfaPthread*)");256 static void ?{}(cfaPthread & t, pthread_t * _thread, const pthread_attr_t * _attr,void *(* start_routine)(void *), void * arg) { 257 static_assert(sizeof(pthread_t) >= sizeof(cfaPthread *), "pthread_t too small to hold a pointer: sizeof(pthread_t) < sizeof(cfaPthread *)"); 259 258 260 259 // set up user thread stackSize … … 278 277 //######################### Pthread Attrs ######################### 279 278 280 int pthread_attr_init(pthread_attr_t * attr) libcfa_public __THROW {281 cfaPthread_attr_t * _attr = get(attr);279 int pthread_attr_init(pthread_attr_t * attr) libcfa_public __THROW { 280 cfaPthread_attr_t * _attr = get(attr); 282 281 ?{}(*_attr, default_attrs); 283 282 return 0; 284 283 } 285 int pthread_attr_destroy(pthread_attr_t * attr) libcfa_public __THROW {284 int pthread_attr_destroy(pthread_attr_t * attr) libcfa_public __THROW { 286 285 ^?{}(*get(attr)); 287 286 return 0; 288 287 } 289 288 290 int pthread_attr_setscope( pthread_attr_t * attr, int contentionscope ) libcfa_public __THROW {289 int pthread_attr_setscope( pthread_attr_t * attr, int contentionscope ) libcfa_public __THROW { 291 290 get( attr )->contentionscope = contentionscope; 292 291 return 0; 293 292 } // pthread_attr_setscope 294 293 295 int pthread_attr_getscope( const pthread_attr_t * attr, int *contentionscope ) libcfa_public __THROW {294 int pthread_attr_getscope( const pthread_attr_t * attr, int * contentionscope ) libcfa_public __THROW { 296 295 *contentionscope = get( attr )->contentionscope; 297 296 return 0; 298 297 } // pthread_attr_getscope 299 298 300 int pthread_attr_setdetachstate( pthread_attr_t * attr, int detachstate ) libcfa_public __THROW {299 int pthread_attr_setdetachstate( pthread_attr_t * attr, int detachstate ) libcfa_public __THROW { 301 300 get( attr 
)->detachstate = detachstate; 302 301 return 0; 303 302 } // pthread_attr_setdetachstate 304 303 305 int pthread_attr_getdetachstate( const pthread_attr_t * attr, int *detachstate ) libcfa_public __THROW {304 int pthread_attr_getdetachstate( const pthread_attr_t * attr, int * detachstate ) libcfa_public __THROW { 306 305 *detachstate = get( attr )->detachstate; 307 306 return 0; 308 307 } // pthread_attr_getdetachstate 309 308 310 int pthread_attr_setstacksize( pthread_attr_t * attr, size_t stacksize ) libcfa_public __THROW {309 int pthread_attr_setstacksize( pthread_attr_t * attr, size_t stacksize ) libcfa_public __THROW { 311 310 get( attr )->stacksize = stacksize; 312 311 return 0; 313 312 } // pthread_attr_setstacksize 314 313 315 int pthread_attr_getstacksize( const pthread_attr_t * attr, size_t *stacksize ) libcfa_public __THROW {314 int pthread_attr_getstacksize( const pthread_attr_t * attr, size_t * stacksize ) libcfa_public __THROW { 316 315 *stacksize = get( attr )->stacksize; 317 316 return 0; … … 326 325 } // pthread_attr_setguardsize 327 326 328 int pthread_attr_setstackaddr( pthread_attr_t * attr, void *stackaddr ) libcfa_public __THROW {327 int pthread_attr_setstackaddr( pthread_attr_t * attr, void * stackaddr ) libcfa_public __THROW { 329 328 get( attr )->stackaddr = stackaddr; 330 329 return 0; 331 330 } // pthread_attr_setstackaddr 332 331 333 int pthread_attr_getstackaddr( const pthread_attr_t * attr, void **stackaddr ) libcfa_public __THROW {332 int pthread_attr_getstackaddr( const pthread_attr_t * attr, void ** stackaddr ) libcfa_public __THROW { 334 333 *stackaddr = get( attr )->stackaddr; 335 334 return 0; 336 335 } // pthread_attr_getstackaddr 337 336 338 int pthread_attr_setstack( pthread_attr_t * attr, void *stackaddr, size_t stacksize ) libcfa_public __THROW {337 int pthread_attr_setstack( pthread_attr_t * attr, void * stackaddr, size_t stacksize ) libcfa_public __THROW { 339 338 get( attr )->stackaddr = stackaddr; 340 339 get( attr )->stacksize = stacksize; … … 342 341 } // pthread_attr_setstack 343 342 344 int pthread_attr_getstack( const pthread_attr_t * attr, void **stackaddr, size_t *stacksize ) libcfa_public __THROW {343 int pthread_attr_getstack( const pthread_attr_t * attr, void ** stackaddr, size_t * stacksize ) libcfa_public __THROW { 345 344 *stackaddr = get( attr )->stackaddr; 346 345 *stacksize = get( attr )->stacksize; … … 351 350 // already running thread threadID. It shall be called on unitialized attr 352 351 // and destroyed with pthread_attr_destroy when no longer needed. 
353 int pthread_getattr_np( pthread_t threadID, pthread_attr_t * attr ) libcfa_public __THROW { // GNU extension352 int pthread_getattr_np( pthread_t threadID, pthread_attr_t * attr ) libcfa_public __THROW { // GNU extension 354 353 check_nonnull(attr); 355 354 … … 363 362 //######################### Threads ######################### 364 363 365 int pthread_create(pthread_t * _thread, const pthread_attr_t * attr, void *(* start_routine)(void *), void * arg) libcfa_public __THROW {366 cfaPthread * t = alloc();364 int pthread_create(pthread_t * _thread, const pthread_attr_t * attr, void *(* start_routine)(void *), void * arg) libcfa_public __THROW { 365 cfaPthread * t = alloc(); 367 366 (*t){_thread, attr, start_routine, arg}; 368 367 return 0; 369 368 } 370 369 371 372 int pthread_join(pthread_t _thread, void **value_ptr) libcfa_public __THROW { 370 int pthread_join(pthread_t _thread, void ** value_ptr) libcfa_public __THROW { 373 371 // if thread is invalid 374 372 if (_thread == NULL) return EINVAL; … … 376 374 377 375 // get user thr pointer 378 cfaPthread * p = lookup(_thread);376 cfaPthread * p = lookup(_thread); 379 377 try { 380 378 join(*p); … … 389 387 } 390 388 391 int pthread_tryjoin_np(pthread_t _thread, void ** value_ptr) libcfa_public __THROW {389 int pthread_tryjoin_np(pthread_t _thread, void ** value_ptr) libcfa_public __THROW { 392 390 // if thread is invalid 393 391 if (_thread == NULL) return EINVAL; 394 392 if (_thread == pthread_self()) return EDEADLK; 395 393 396 cfaPthread * p = lookup(_thread);394 cfaPthread * p = lookup(_thread); 397 395 398 396 // thread not finished ? … … 412 410 void pthread_exit(void * status) libcfa_public __THROW { 413 411 pthread_t pid = pthread_self(); 414 cfaPthread * _thread = (cfaPthread*)pid;412 cfaPthread * _thread = (cfaPthread *)pid; 415 413 _thread->joinval = status; // set return value 416 414 _thread->isTerminated = 1; // set terminated flag … … 426 424 //######################### Mutex ######################### 427 425 428 int pthread_mutex_init(pthread_mutex_t *_mutex, const pthread_mutexattr_t * attr) libcfa_public __THROW {426 int pthread_mutex_init(pthread_mutex_t *_mutex, const pthread_mutexattr_t * attr) libcfa_public __THROW { 429 427 check_nonnull(_mutex); 430 428 init(_mutex); … … 435 433 int pthread_mutex_destroy(pthread_mutex_t *_mutex) libcfa_public __THROW { 436 434 check_nonnull(_mutex); 437 simple_owner_lock * _lock = get(_mutex);438 if (_lock->owner != NULL) {435 simple_owner_lock * _lock = get(_mutex); 436 if (_lock->owner != NULL) { 439 437 return EBUSY; 440 438 } … … 446 444 check_nonnull(_mutex); 447 445 mutex_check(_mutex); 448 simple_owner_lock * _lock = get(_mutex);446 simple_owner_lock * _lock = get(_mutex); 449 447 lock(*_lock); 450 448 return 0; … … 453 451 int pthread_mutex_unlock(pthread_mutex_t *_mutex) libcfa_public __THROW { 454 452 check_nonnull(_mutex); 455 simple_owner_lock * _lock = get(_mutex);456 if (_lock->owner != active_thread()) {453 simple_owner_lock * _lock = get(_mutex); 454 if (_lock->owner != active_thread()) { 457 455 return EPERM; 458 456 } // current thread does not hold the mutex … … 463 461 int pthread_mutex_trylock(pthread_mutex_t *_mutex) libcfa_public __THROW { 464 462 check_nonnull(_mutex); 465 simple_owner_lock * _lock = get(_mutex);466 if (_lock->owner != active_thread() && _lock->owner != NULL) {463 simple_owner_lock * _lock = get(_mutex); 464 if (_lock->owner != active_thread() && _lock->owner != NULL) { 467 465 return EBUSY; 468 466 } // if mutex is owned … … 474 472 475 
473 /* conditional variable routines */ 476 int pthread_cond_init(pthread_cond_t * cond, const pthread_condattr_t *attr) libcfa_public __THROW {474 int pthread_cond_init(pthread_cond_t * cond, const pthread_condattr_t * attr) libcfa_public __THROW { 477 475 check_nonnull(cond); 478 476 init(cond); … … 480 478 } //pthread_cond_init 481 479 482 int pthread_cond_wait(pthread_cond_t * cond, pthread_mutex_t *_mutex) libcfa_public __THROW {480 int pthread_cond_wait(pthread_cond_t * cond, pthread_mutex_t *_mutex) libcfa_public __THROW { 483 481 check_nonnull(_mutex); 484 482 check_nonnull(cond); … … 494 492 } // pthread_cond_timedwait 495 493 496 int pthread_cond_signal(pthread_cond_t * cond) libcfa_public __THROW {494 int pthread_cond_signal(pthread_cond_t * cond) libcfa_public __THROW { 497 495 check_nonnull(cond); 498 496 return notify_one(*get(cond)); 499 497 } // pthread_cond_signal 500 498 501 int pthread_cond_broadcast(pthread_cond_t * cond) libcfa_public __THROW {499 int pthread_cond_broadcast(pthread_cond_t * cond) libcfa_public __THROW { 502 500 check_nonnull(cond); 503 501 return notify_all(*get(cond)); 504 502 } // pthread_cond_broadcast 505 503 506 int pthread_cond_destroy(pthread_cond_t * cond) libcfa_public __THROW {504 int pthread_cond_destroy(pthread_cond_t * cond) libcfa_public __THROW { 507 505 check_nonnull(cond); 508 506 destroy(cond); … … 514 512 //######################### Local storage ######################### 515 513 516 int pthread_once(pthread_once_t * once_control, void (*init_routine)(void)) libcfa_public __THROW {514 int pthread_once(pthread_once_t * once_control, void (* init_routine)(void)) libcfa_public __THROW { 517 515 static_assert(sizeof(pthread_once_t) >= sizeof(int),"sizeof(pthread_once_t) < sizeof(int)"); 518 516 check_nonnull(once_control); … … 527 525 } // pthread_once 528 526 529 int pthread_key_create( pthread_key_t * key, void (*destructor)( void * ) ) libcfa_public __THROW {527 int pthread_key_create( pthread_key_t * key, void (* destructor)( void * ) ) libcfa_public __THROW { 530 528 lock(key_lock); 531 529 for ( int i = 0; i < PTHREAD_KEYS_MAX; i += 1 ) { … … 562 560 } // pthread_key_delete 563 561 564 int pthread_setspecific( pthread_key_t key, const void * value ) libcfa_public __THROW {562 int pthread_setspecific( pthread_key_t key, const void * value ) libcfa_public __THROW { 565 563 // get current thread 566 cfaPthread * t = lookup(pthread_self());564 cfaPthread * t = lookup(pthread_self()); 567 565 // if current thread's pthreadData is NULL; initialize it 568 pthread_values * values;569 if (t->pthreadData == NULL) {566 pthread_values * values; 567 if (t->pthreadData == NULL) { 570 568 values = anew( PTHREAD_KEYS_MAX); 571 569 t->pthreadData = values; 572 for ( int i = 0;i < PTHREAD_KEYS_MAX; i++){570 for ( int i = 0;i < PTHREAD_KEYS_MAX; i++ ) { 573 571 t->pthreadData[i].in_use = false; 574 572 } // for … … 593 591 } //pthread_setspecific 594 592 595 void * pthread_getspecific(pthread_key_t key) libcfa_public __THROW {593 void * pthread_getspecific(pthread_key_t key) libcfa_public __THROW { 596 594 if (key >= PTHREAD_KEYS_MAX || ! cfa_pthread_keys[key].in_use) return NULL; 597 595 598 596 // get current thread 599 cfaPthread * t = lookup(pthread_self());597 cfaPthread * t = lookup(pthread_self()); 600 598 if (t->pthreadData == NULL) return NULL; 601 599 lock(key_lock); 602 pthread_values & entry = ((pthread_values *)t->pthreadData)[key];600 pthread_values & entry = ((pthread_values *)t->pthreadData)[key]; 603 601 if ( ! 
entry.in_use ) { 604 602 unlock( key_lock ); 605 603 return NULL; 606 604 } // if 607 void * value = entry.value;605 void * value = entry.value; 608 606 unlock(key_lock); 609 607 … … 875 873 //######################### Parallelism ######################### 876 874 877 int pthread_setaffinity_np( pthread_t /* __th */, size_t /* __cpusetsize */, __const cpu_set_t * /* __cpuset */ ) libcfa_public __THROW {878 abort( "pthread_setaffinity_np" );879 } // pthread_setaffinity_np880 881 int pthread_getaffinity_np( pthread_t /* __th */, size_t /* __cpusetsize */, cpu_set_t * /* __cpuset */ ) libcfa_public __THROW {882 abort( "pthread_getaffinity_np" );883 } // pthread_getaffinity_np884 885 int pthread_attr_setaffinity_np( pthread_attr_t * /* __attr */, size_t /* __cpusetsize */, __const cpu_set_t * /* __cpuset */ ) libcfa_public __THROW {886 abort( "pthread_attr_setaffinity_np" );887 } // pthread_attr_setaffinity_np888 889 int pthread_attr_getaffinity_np( __const pthread_attr_t * /* __attr */, size_t /* __cpusetsize */, cpu_set_t * /* __cpuset */ ) libcfa_public __THROW {890 abort( "pthread_attr_getaffinity_np" );891 } // pthread_attr_getaffinity_np875 // int pthread_setaffinity_np( pthread_t /* __th */, size_t /* __cpusetsize */, __const cpu_set_t * /* __cpuset */ ) libcfa_public __THROW { 876 // abort( "pthread_setaffinity_np" ); 877 // } // pthread_setaffinity_np 878 879 // int pthread_getaffinity_np( pthread_t /* __th */, size_t /* __cpusetsize */, cpu_set_t * /* __cpuset */ ) libcfa_public __THROW { 880 // abort( "pthread_getaffinity_np" ); 881 // } // pthread_getaffinity_np 882 883 // int pthread_attr_setaffinity_np( pthread_attr_t * /* __attr */, size_t /* __cpusetsize */, __const cpu_set_t * /* __cpuset */ ) libcfa_public __THROW { 884 // abort( "pthread_attr_setaffinity_np" ); 885 // } // pthread_attr_setaffinity_np 886 887 // int pthread_attr_getaffinity_np( __const pthread_attr_t * /* __attr */, size_t /* __cpusetsize */, cpu_set_t * /* __cpuset */ ) libcfa_public __THROW { 888 // abort( "pthread_attr_getaffinity_np" ); 889 // } // pthread_attr_getaffinity_np 892 890 893 891 //######################### Cancellation ######################### … … 906 904 } // pthread_cancel 907 905 908 int pthread_setcancelstate( int state, int * oldstate ) libcfa_public __THROW {906 int pthread_setcancelstate( int state, int * oldstate ) libcfa_public __THROW { 909 907 abort("pthread_setcancelstate not implemented"); 910 908 return 0; 911 909 } // pthread_setcancelstate 912 910 913 int pthread_setcanceltype( int type, int * oldtype ) libcfa_public __THROW {911 int pthread_setcanceltype( int type, int * oldtype ) libcfa_public __THROW { 914 912 abort("pthread_setcanceltype not implemented"); 915 913 return 0; … … 918 916 919 917 #pragma GCC diagnostic pop 920 -
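The shim above implements the standard POSIX entry points on top of CFA threads, simple_owner_lock, and the per-thread key table, so an ordinary pthread client runs on it unchanged. The sketch below uses only standard pthread calls (nothing specific to this changeset) and exercises create/join plus key destructors.

#include <pthread.h>
#include <stdio.h>

static pthread_key_t key;

static void cleaner( void * value ) {            // invoked via pthread_deletespecific_ at thread exit
    printf( "destructor ran for %p\n", value );
}

static void * worker( void * arg ) {
    pthread_setspecific( key, arg );             // stored in the thread's pthreadData array
    return arg;                                  // becomes joinval in cfaPthread
}

int main( void ) {
    pthread_t t;
    void * result;
    pthread_key_create( &key, cleaner );
    pthread_create( &t, NULL, worker, (void *)0x42 );
    pthread_join( t, &result );                  // waits on the cfaPthread, then reads joinval
    printf( "joined: %p\n", result );
    pthread_key_delete( key );
    return 0;
}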
libcfa/src/concurrency/ready_queue.cfa
r34b4268 r24d6572 15 15 16 16 #define __cforall_thread__ 17 #define _GNU_SOURCE18 17 19 18 // #define __CFA_DEBUG_PRINT_READY_QUEUE__ -
libcfa/src/concurrency/thread.cfa
r34b4268 r24d6572 10 10 // Created On : Tue Jan 17 12:27:26 2017 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Sun Dec 11 20:56:54 202213 // Update Count : 10 212 // Last Modified On : Mon Jan 9 08:42:33 2023 13 // Update Count : 103 14 14 // 15 15 16 16 #define __cforall_thread__ 17 #define _GNU_SOURCE18 17 19 18 #include "thread.hfa" … … 54 53 preferred = ready_queue_new_preferred(); 55 54 last_proc = 0p; 55 link_node = 0p; 56 56 PRNG_SET_SEED( random_state, __global_random_mask ? __global_random_prime : __global_random_prime ^ rdtscl() ); 57 57 #if defined( __CFA_WITH_VERIFY__ ) … … 60 60 #endif 61 61 62 clh_node = malloc( );63 *clh_node = false;64 65 62 doregister(curr_cluster, this); 66 63 monitors{ &self_mon_p, 1, (fptr_t)0 }; … … 71 68 canary = 0xDEADDEADDEADDEADp; 72 69 #endif 73 free(clh_node);74 70 unregister(curr_cluster, this); 75 71 ^self_cor{}; -
libcfa/src/concurrency/thread.hfa
r34b4268 r24d6572 10 10 // Created On : Tue Jan 17 12:27:26 2017 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : T ue Nov 22 22:18:34 202213 // Update Count : 3 512 // Last Modified On : Thu Feb 2 11:27:59 2023 13 // Update Count : 37 14 14 // 15 15 … … 27 27 //----------------------------------------------------------------------------- 28 28 // thread trait 29 trait is_thread(T &) { 29 forall( T & ) 30 trait is_thread { 30 31 void ^?{}(T& mutex this); 31 32 void main(T& this); -
libcfa/src/containers/array.hfa
r34b4268 r24d6572 9 9 10 10 11 // 12 // Single-dim array sruct (with explicit packing and atom) 13 // 14 11 // 12 // The `array` macro is the public interface. 13 // It computes the type of a dense (trivially strided) array. 14 // All user-declared objects are dense arrays. 15 // 16 // The `arpk` (ARray with PacKing info explicit) type is, generally, a slice with _any_ striding. 17 // This type is meant for internal use. 18 // CFA programmers should not instantiate it directly, nor access its field. 19 // CFA programmers should call ?[?] on it. 20 // Yet user-given `array(stuff)` expands to `arpk(stuff')`. 21 // The comments here explain the resulting internals. 22 // 23 // Just as a plain-C "multidimesional" array is really array-of-array-of-..., 24 // so does arpk generally show up as arpk-of-arpk-of... 25 // 26 // In the example of `array(float, 3, 4, 5) a;`, 27 // `typeof(a)` is an `arpk` instantiation. 28 // These comments explain _its_ arguments, i.e. those of the topmost `arpk` level. 29 // 30 // [N] : the number of elements in `a`; 3 in the example 31 // S : carries the stride size (distance in bytes between &myA[0] and &myA[1]), in sizeof(S); 32 // same as Timmed when striding is trivial, same as Timmed in the example 33 // Timmed : (T-immediate) the inner type; conceptually, `typeof(a)` is "arpk of Timmed"; 34 // array(float, 4, 5) in the example 35 // Tbase : (T-base) the deepest element type that is not arpk; float in the example 36 // 15 37 forall( [N], S & | sized(S), Timmed &, Tbase & ) { 38 39 // 40 // Single-dim array sruct (with explicit packing and atom) 41 // 16 42 struct arpk { 17 43 S strides[N]; -
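Following the comments above, what a caller sees is just the array macro and ?[?]; the arpk layers stay internal. A minimal sketch using the same float 3x4x5 example from the comments (the header name and fill pattern are illustrative):

#include <array.hfa>                      // assumed user-facing name for containers/array.hfa

int main() {
    array( float, 3, 4, 5 ) a;            // dense array: arpk of array(float,4,5), Tbase = float
    for ( int i = 0; i < 3; i += 1 )
        for ( int j = 0; j < 4; j += 1 )
            for ( int k = 0; k < 5; k += 1 )
                a[i][j][k] = i * 100 + j * 10 + k;   // ?[?] is the intended access path
}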
libcfa/src/containers/list.hfa
r34b4268 r24d6572 9 9 // Author : Michael Brooks 10 10 // Created On : Wed Apr 22 18:00:00 2020 11 // Last Modified By : Michael Brooks12 // Last Modified On : Wed Apr 22 18:00:00 202013 // Update Count : 111 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Thu Feb 2 11:32:26 2023 13 // Update Count : 2 14 14 // 15 15 … … 23 23 }; 24 24 25 trait embedded( tOuter &, tMid &, tInner & ) { 25 forall( tOuter &, tMid &, tInner & ) 26 trait embedded { 26 27 tytagref( tMid, tInner ) ?`inner( tOuter & ); 27 28 }; … … 31 32 static inline tytagref(void, T) ?`inner ( T & this ) { tytagref( void, T ) ret = {this}; return ret; } 32 33 33 // use this on every case of plan-9 inheritance, to make embedded a closure of plan-9 inheritance 34 #define P9_EMBEDDED( derived, immedBase ) \ 35 forall( Tbase &, TdiscardPath & | { tytagref( TdiscardPath, Tbase ) ?`inner( immedBase & ); } ) \ 36 static inline tytagref(immedBase, Tbase) ?`inner( derived & this ) { \ 34 35 // 36 // P9_EMBEDDED: Use on every case of plan-9 inheritance, to make "implements embedded" be a closure of plan-9 inheritance. 37 // 38 // struct foo { 39 // int a, b, c; 40 // inline (bar); 41 // }; 42 // P9_EMBEDDED( foo, bar ) 43 // 44 45 // usual version, for structs that are top-level declarations 46 #define P9_EMBEDDED( derived, immedBase ) P9_EMBEDDED_DECL_( derived, immedBase, static ) P9_EMBEDDED_BDY_( immedBase ) 47 48 // special version, for structs that are declared in functions 49 #define P9_EMBEDDED_INFUNC( derived, immedBase ) P9_EMBEDDED_DECL_( derived, immedBase, ) P9_EMBEDDED_BDY_( immedBase ) 50 51 // forward declarations of both the above; generally not needed 52 // may help you control where the P9_EMBEEDED cruft goes, in case "right after the stuct" isn't where you want it 53 #define P9_EMBEDDED_FWD( derived, immedBase ) P9_EMBEDDED_DECL_( derived, immedBase, static ) ; 54 #define P9_EMBEDDED_FWD_INFUNC( derived, immedBase ) auto P9_EMBEDDED_DECL_( derived, immedBase, ) ; 55 56 // private helpers 57 #define P9_EMBEDDED_DECL_( derived, immedBase, STORAGE ) \ 58 forall( Tbase &, TdiscardPath & | { tytagref( TdiscardPath, Tbase ) ?`inner( immedBase & ); } ) \ 59 STORAGE inline tytagref(immedBase, Tbase) ?`inner( derived & this ) 60 61 #define P9_EMBEDDED_BDY_( immedBase ) { \ 37 62 immedBase & ib = this; \ 38 63 Tbase & b = ib`inner; \ -
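The P9_EMBEDDED comment above already shows the declaration pattern; the sketch below combines it with the dlist operations this changeset uses elsewhere (insert_last, try_pop_front in locks.hfa). Treat it as a loose illustration: the task type, its dlink member, and the header path are assumptions, and the exact dlist parameterization may differ from what is shown.

#include <containers/list.hfa>            // assumed path

struct task {
    int id;
    inline dlink(task);                   // intrusive links, plan-9 embedded
};
P9_EMBEDDED( task, dlink(task) )          // close `embedded` over the inheritance, as the comment directs

int main() {
    dlist( task ) q;
    task a, b;
    a.id = 1; b.id = 2;
    insert_last( q, a );
    insert_last( q, b );
    task & first = try_pop_front( q );    // same idiom the locks use on blocked_threads
}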
libcfa/src/containers/vector.hfa
r34b4268 r24d6572 10 10 // Created On : Tue Jul 5 18:00:07 2016 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Wed Jun 17 11:02:46 202013 // Update Count : 412 // Last Modified On : Thu Feb 2 11:41:24 2023 13 // Update Count : 5 14 14 // 15 15 … … 50 50 //------------------------------------------------------------------------------ 51 51 //Declaration 52 trait allocator_c(T, allocator_t)53 {52 forall( T, allocator_t ) 53 trait allocator_c { 54 54 void realloc_storage(allocator_t*, size_t); 55 55 T* data(allocator_t*); -
libcfa/src/containers/vector2.hfa
r34b4268 r24d6572 9 9 // Author : Michael Brooks 10 10 // Created On : Thu Jun 23 22:00:00 2021 11 // Last Modified By : Michael Brooks 12 // Last Modified On : Thu Jun 23 22:00:00 2021 13 // Update Count : 1 14 // 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Tue Mar 14 08:40:53 2023 13 // Update Count : 2 14 // 15 16 #pragma once 15 17 16 18 #include <stdlib.hfa> -
libcfa/src/exception.h
r34b4268 r24d6572 9 9 // Author : Andrew Beach 10 10 // Created On : Mon Jun 26 15:11:00 2017 11 // Last Modified By : Andrew Beach12 // Last Modified On : Th r Apr 8 15:20:00 202113 // Update Count : 1 211 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Thu Feb 2 11:20:19 2023 13 // Update Count : 13 14 14 // 15 15 … … 101 101 // implemented in the .c file either so they all have to be inline. 102 102 103 trait is_exception(exceptT &, virtualT &) { 103 forall( exceptT &, virtualT & ) 104 trait is_exception { 104 105 /* The first field must be a pointer to a virtual table. 105 106 * That virtual table must be a decendent of the base exception virtual table. … … 109 110 }; 110 111 111 trait is_termination_exception(exceptT &, virtualT & | is_exception(exceptT, virtualT)) { 112 forall( exceptT &, virtualT & | is_exception(exceptT, virtualT) ) 113 trait is_termination_exception { 112 114 void defaultTerminationHandler(exceptT &); 113 115 }; 114 116 115 trait is_resumption_exception(exceptT &, virtualT & | is_exception(exceptT, virtualT)) { 117 forall( exceptT &, virtualT & | is_exception(exceptT, virtualT) ) 118 trait is_resumption_exception { 116 119 void defaultResumptionHandler(exceptT &); 117 120 }; -
libcfa/src/heap.cfa
r34b4268 r24d6572 10 10 // Created On : Tue Dec 19 21:58:35 2017 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Sun Oct 30 20:56:20202213 // Update Count : 1 58412 // Last Modified On : Fri Dec 30 08:37:37 2022 13 // Update Count : 1605 14 14 // 15 15 … … 17 17 #include <string.h> // memset, memcpy 18 18 #include <limits.h> // ULONG_MAX 19 #include <stdlib.h> // EXIT_FAILURE20 19 #include <errno.h> // errno, ENOMEM, EINVAL 21 #include <unistd.h> // STDERR_FILENO, sbrk, sysconf 22 #include <malloc.h> // memalign, malloc_usable_size 20 #include <unistd.h> // STDERR_FILENO, sbrk, sysconf, write 23 21 #include <sys/mman.h> // mmap, munmap 24 22 extern "C" { … … 26 24 } // extern "C" 27 25 26 #include "heap.hfa" 28 27 #include "bits/align.hfa" // libAlign 29 28 #include "bits/defs.hfa" // likely, unlikely … … 140 139 #endif 141 140 142 typedef volatile uintptr_t SpinLock_t CALIGN; // aligned addressable word-size141 typedef volatile uintptr_t SpinLock_t; 143 142 144 143 static inline __attribute__((always_inline)) void lock( volatile SpinLock_t & slock ) { … … 147 146 148 147 for ( unsigned int i = 1;; i += 1 ) { 149 if ( slock == 0 && __atomic_test_and_set( &slock, __ATOMIC_ SEQ_CST) == 0 ) break; // Fence148 if ( slock == 0 && __atomic_test_and_set( &slock, __ATOMIC_ACQUIRE ) == 0 ) break; // Fence 150 149 for ( volatile unsigned int s = 0; s < spin; s += 1 ) Pause(); // exponential spin 151 150 spin += spin; // powers of 2 … … 156 155 157 156 static inline __attribute__((always_inline)) void unlock( volatile SpinLock_t & slock ) { 158 __atomic_clear( &slock, __ATOMIC_ SEQ_CST); // Fence157 __atomic_clear( &slock, __ATOMIC_RELEASE ); // Fence 159 158 } // spin_unlock 160 159 … … 261 260 static_assert( libAlign() >= sizeof( Storage ), "minimum alignment < sizeof( Storage )" ); 262 261 263 struct __attribute__(( aligned (8) ))FreeHeader {264 size_t blockSize __attribute__(( aligned(8) ));// size of allocations on this list262 struct CALIGN FreeHeader { 263 size_t blockSize CALIGN; // size of allocations on this list 265 264 #ifdef OWNERSHIP 266 265 #ifdef RETURNSPIN … … 284 283 285 284 #ifdef __CFA_DEBUG__ 286 int64_t allocUnfreed; // running total of allocations minus frees; can be negative285 ptrdiff_t allocUnfreed; // running total of allocations minus frees; can be negative 287 286 #endif // __CFA_DEBUG__ 288 287 … … 369 368 // Thread-local storage is allocated lazily when the storage is accessed. 370 369 static __thread size_t PAD1 CALIGN TLSMODEL __attribute__(( unused )); // protect false sharing 371 static __thread Heap * volatileheapManager CALIGN TLSMODEL;370 static __thread Heap * heapManager CALIGN TLSMODEL; 372 371 static __thread size_t PAD2 CALIGN TLSMODEL __attribute__(( unused )); // protect further false sharing 373 372 … … 443 442 // 12K ~= 120K byte superblock. Where 128-heap superblock handles a medium sized multi-processor server. 444 443 size_t remaining = heapManagersStorageEnd - heapManagersStorage; // remaining free heaps in superblock 445 if ( ! heapManagersStorage || remaining != 0 ) {444 if ( ! heapManagersStorage || remaining == 0 ) { 446 445 // Each block of heaps is a multiple of the number of cores on the computer. 447 446 int HeapDim = get_nprocs(); // get_nprocs_conf does not work … … 562 561 // allocUnfreed is set to 0 when a heap is created and it accumulates any unfreed storage during its multiple thread 563 562 // usages. At the end, add up each heap allocUnfreed value across all heaps to get the total unfreed storage. 
564 int64_t allocUnfreed = 0;563 ptrdiff_t allocUnfreed = 0; 565 564 for ( Heap * heap = heapMaster.heapManagersList; heap; heap = heap->nextHeapManager ) { 566 565 allocUnfreed += heap->allocUnfreed; … … 572 571 char helpText[512]; 573 572 __cfaabi_bits_print_buffer( STDERR_FILENO, helpText, sizeof(helpText), 574 "CFA warning (UNIX pid:%ld) : program terminating with % ju(0x%jx) bytes of storage allocated but not freed.\n"573 "CFA warning (UNIX pid:%ld) : program terminating with %td(%#tx) bytes of storage allocated but not freed.\n" 575 574 "Possible cause is unfreed storage allocated by the program or system/library routines called from the program.\n", 576 575 (long int)getpid(), allocUnfreed, allocUnfreed ); // always print the UNIX pid … … 806 805 807 806 ptrdiff_t rem = heapRemaining - size; 808 if ( unlikely( rem < 0 ) ) { 807 if ( unlikely( rem < 0 ) ) { // negative ? 809 808 // If the size requested is bigger than the current remaining storage, increase the size of the heap. 810 809 … … 842 841 ptrdiff_t rem = heapReserve - size; 843 842 844 if ( unlikely( rem < 0 ) ) { // negative 843 if ( unlikely( rem < 0 ) ) { // negative ? 845 844 // If the size requested is bigger than the current remaining reserve, use the current reserve to populate 846 845 // smaller freeLists, and increase the reserve. … … 848 847 rem = heapReserve; // positive 849 848 850 if ( rem >= bucketSizes[0] ) {// minimal size ? otherwise ignore849 if ( (unsigned int)rem >= bucketSizes[0] ) { // minimal size ? otherwise ignore 851 850 size_t bucket; 852 851 #ifdef FASTLOOKUP … … 857 856 Heap.FreeHeader * freeHead = &(freeLists[bucket]); 858 857 859 // The remaining storage ma ny not be bucket size, whereas all other allocations are. Round down to previous858 // The remaining storage may not be bucket size, whereas all other allocations are. Round down to previous 860 859 // bucket size in this case. 861 860 if ( unlikely( freeHead->blockSize > (size_t)rem ) ) freeHead -= 1; … … 950 949 block = freeHead->freeList; // remove node from stack 951 950 if ( unlikely( block == 0p ) ) { // no free block ? 952 // Freelist for this size is empty, so check return list (OWNERSHIP), carve it out of the heap,if there951 // Freelist for this size is empty, so check return list (OWNERSHIP), or carve it out of the heap if there 953 952 // is enough left, or get some more heap storage and carve it off. 954 953 #ifdef OWNERSHIP … … 1115 1114 while ( ! __atomic_compare_exchange_n( &freeHead->returnList, &header->kind.real.next, (Heap.Storage *)header, 1116 1115 false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST ) ); 1116 1117 #ifdef __STATISTICS__ 1118 stats.return_pushes += 1; 1119 stats.return_storage_request += rsize; 1120 stats.return_storage_alloc += size; 1121 #endif // __STATISTICS__ 1117 1122 #endif // RETURNSPIN 1118 1123 } // if … … 1125 1130 freeHead->freeList = (Heap.Storage *)header; 1126 1131 #endif // ! OWNERSHIP 1127 1128 #ifdef __U_STATISTICS__1129 stats.return_pushes += 1;1130 stats.return_storage_request += rsize;1131 stats.return_storage_alloc += size;1132 #endif // __U_STATISTICS__1133 1132 1134 1133 // OK TO BE PREEMPTED HERE AS heapManager IS NO LONGER ACCESSED. 
… … 1180 1179 1181 1180 #ifdef __STATISTICS__ 1182 static void incCalls( intptr_t statName ) libcfa_nopreempt {1181 static void incCalls( size_t statName ) libcfa_nopreempt { 1183 1182 heapManager->stats.counters[statName].calls += 1; 1184 1183 } // incCalls 1185 1184 1186 static void incZeroCalls( intptr_t statName ) libcfa_nopreempt {1185 static void incZeroCalls( size_t statName ) libcfa_nopreempt { 1187 1186 heapManager->stats.counters[statName].calls_0 += 1; 1188 1187 } // incZeroCalls … … 1456 1455 // 0p, no operation is performed. 1457 1456 void free( void * addr ) libcfa_public { 1458 // verify( heapManager );1459 1460 1457 if ( unlikely( addr == 0p ) ) { // special case 1461 1458 #ifdef __STATISTICS__ -
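One detail worth pulling out of the noise above: the heap's spin lock moved from __ATOMIC_SEQ_CST to acquire/release ordering, which is the minimum a lock needs. Stripped of the heap-specific typedefs and Pause(), the pattern is the usual GCC-builtin test-and-set lock with exponential backoff; the cap value below is an illustrative addition, not from the changeset.

#include <stdbool.h>

typedef volatile unsigned long spinlock_t;

static inline void spin_lock( spinlock_t * l ) {
    unsigned int spin = 4;
    for ( ;; ) {
        // test, then test-and-set with acquire ordering (the fence the diff switches to)
        if ( *l == 0 && __atomic_test_and_set( (void *)l, __ATOMIC_ACQUIRE ) == 0 ) break;
        for ( volatile unsigned int s = 0; s < spin; s += 1 ) {}   // busy wait
        spin += spin;                                              // powers of 2
        if ( spin > 64 * 1024 ) spin = 64 * 1024;                  // illustrative cap
    }
}

static inline void spin_unlock( spinlock_t * l ) {
    __atomic_clear( (bool *)l, __ATOMIC_RELEASE );                 // release fence on unlock
}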
libcfa/src/interpose.cfa
r34b4268 r24d6572 10 10 // Created On : Wed Mar 29 16:10:31 2017 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Fri Mar 13 17:35:37 2020 13 // Update Count : 178 14 // 15 16 #include <stdarg.h> // va_start, va_end 12 // Last Modified On : Mon Mar 27 21:09:03 2023 13 // Update Count : 196 14 // 15 17 16 #include <stdio.h> 18 #include <string.h> // strlen19 17 #include <unistd.h> // _exit, getpid 20 #define __USE_GNU21 #include <signal.h>22 #undef __USE_GNU23 18 extern "C" { 24 19 #include <dlfcn.h> // dlopen, dlsym … … 26 21 } 27 22 28 #include "bits/debug.hfa"29 23 #include "bits/defs.hfa" 30 24 #include "bits/signal.hfa" // sigHandler_? … … 42 36 43 37 typedef void (* generic_fptr_t)(void); 38 44 39 static generic_fptr_t do_interpose_symbol( void * library, const char symbol[], const char version[] ) { 45 const char * error;46 47 40 union { generic_fptr_t fptr; void * ptr; } originalFunc; 48 41 49 42 #if defined( _GNU_SOURCE ) 50 51 52 53 54 }43 if ( version ) { 44 originalFunc.ptr = dlvsym( library, symbol, version ); 45 } else { 46 originalFunc.ptr = dlsym( library, symbol ); 47 } // if 55 48 #else 56 49 originalFunc.ptr = dlsym( library, symbol ); 57 50 #endif // _GNU_SOURCE 58 51 59 error = dlerror();60 if ( error ) abort( "interpose_symbol : internal error, %s\n", error);61 52 if ( ! originalFunc.ptr ) { // == nullptr 53 abort( "interpose_symbol : internal error, %s\n", dlerror() ); 54 } // if 62 55 return originalFunc.fptr; 63 56 } 64 57 65 58 static generic_fptr_t interpose_symbol( const char symbol[], const char version[] ) { 66 const char * error; 67 68 static void * library; 69 static void * pthread_library; 70 if ( ! library ) { 71 #if defined( RTLD_NEXT ) 72 library = RTLD_NEXT; 73 #else 74 // missing RTLD_NEXT => must hard-code library name, assuming libstdc++ 75 library = dlopen( "libc.so.6", RTLD_LAZY ); 76 error = dlerror(); 77 if ( error ) { 78 abort( "interpose_symbol : failed to open libc, %s\n", error ); 79 } 80 #endif 59 void * library; 60 61 #if defined( RTLD_NEXT ) 62 library = RTLD_NEXT; 63 #else 64 // missing RTLD_NEXT => must hard-code library name, assuming libstdc++ 65 library = dlopen( "libc.so.6", RTLD_LAZY ); 66 if ( ! library ) { // == nullptr 67 abort( "interpose_symbol : failed to open libc, %s\n", dlerror() ); 81 68 } // if 82 if ( ! pthread_library ) { 83 #if defined( RTLD_NEXT ) 84 pthread_library = RTLD_NEXT; 85 #else 86 // missing RTLD_NEXT => must hard-code library name, assuming libstdc++ 87 pthread_library = dlopen( "libpthread.so", RTLD_LAZY ); 88 error = dlerror(); 89 if ( error ) { 90 abort( "interpose_symbol : failed to open libpthread, %s\n", error ); 91 } 92 #endif 93 } // if 94 95 return do_interpose_symbol(library, symbol, version); 69 #endif // RTLD_NEXT 70 71 return do_interpose_symbol( library, symbol, version ); 96 72 } 97 73 … … 123 99 preload_libgcc(); 124 100 125 #pragma GCC diagnostic push126 #pragma GCC diagnostic ignored "-Wdiscarded-qualifiers"101 #pragma GCC diagnostic push 102 #pragma GCC diagnostic ignored "-Wdiscarded-qualifiers" 127 103 INTERPOSE_LIBC( abort, version ); 128 104 INTERPOSE_LIBC( exit , version ); 129 #pragma GCC diagnostic pop105 #pragma GCC diagnostic pop 130 106 131 107 if(__cfathreadabi_interpose_startup) __cfathreadabi_interpose_startup( do_interpose_symbol ); 108 109 // SKULLDUGGERY: In Ubuntu 22.04, someone augmented signal.h to allow SIGSTKSZ to be "sysconf(_SC_SIGSTKSZ)" in 110 // sigstksz.h, as well as 8192 in sigstack.h. 
HOWEVER, they forgot to provide a mechanism to tell signal.h to 111 // use sigstack.h rather than sigstksz.h. (I'm not happy.) By undefining _GNU_SOURCE before signal.h and 112 // redefining it afterwards, you can get 8192, but then nothing works correctly inside of signal.h without 113 // _GNU_SOURCE defined. So what is needed is a way to get signal.h to use sigstack.h WITH _GNU_SOURCE defined. 114 // Basically something is wrong with features.h and its use in signal.h. 115 116 #undef SIGSTKSZ 117 #define SIGSTKSZ 8192 132 118 133 119 // As a precaution (and necessity), errors that result in termination are delivered on a separate stack because … … 295 281 va_start( args, fmt ); 296 282 __abort( false, fmt, args ); 297 283 // CONTROL NEVER REACHES HERE! 298 284 va_end( args ); 299 285 } 300 286 301 287 void abort( bool signalAbort, const char fmt[], ... ) { 302 303 304 305 306 288 va_list args; 289 va_start( args, fmt ); 290 __abort( signalAbort, fmt, args ); 291 // CONTROL NEVER REACHES HERE! 292 va_end( args ); 307 293 } 308 294 -
libcfa/src/interpose_thread.cfa
r34b4268 r24d6572 14 14 // 15 15 16 #include <stdarg.h> // va_start, va_end 17 #include <stdio.h> 18 #include <string.h> // strlen 16 // BUG in 32-bit gcc with interpose: fixed in >= gcc-9.5, gcc-10.4, gcc-12.2 17 #ifdef __i386__ // 32-bit architecture 18 #undef _GNU_SOURCE 19 #endif // __i386__ 20 19 21 #include <signal.h> 20 22 #include <pthread.h> 23 #include <signal.h> 21 24 extern "C" { 22 25 #include <dlfcn.h> // dlopen, dlsym 23 #include <execinfo.h> // backtrace, messages24 26 } 25 27 26 #include "bits/debug.hfa"27 28 #include "bits/defs.hfa" 28 #include <assert.h>29 29 30 30 //============================================================================================= … … 34 34 typedef void (* generic_fptr_t)(void); 35 35 36 generic_fptr_t interpose_symbol(36 generic_fptr_t libcfa_public interpose_symbol( 37 37 generic_fptr_t (*do_interpose_symbol)( void * library, const char symbol[], const char version[] ), 38 38 const char symbol[], 39 39 const char version[] 40 ) libcfa_public{41 const char * error;40 ) { 41 void * library; 42 42 43 static void * library; 44 if ( ! library ) { 45 #if defined( RTLD_NEXT ) 46 library = RTLD_NEXT; 47 #else 48 // missing RTLD_NEXT => must hard-code library name, assuming libstdc++ 49 library = dlopen( "libpthread.so", RTLD_LAZY ); 50 error = dlerror(); 51 if ( error ) { 52 abort( "interpose_symbol : failed to open libpthread, %s\n", error ); 53 } 54 #endif 43 #if defined( RTLD_NEXT ) 44 library = RTLD_NEXT; 45 #else 46 // missing RTLD_NEXT => must hard-code library name, assuming libstdc++ 47 library = dlopen( "libpthread.so", RTLD_LAZY ); 48 if ( ! library ) { // == nullptr 49 abort( "interpose_symbol : failed to open libpthread, %s\n", dlerror() ); 55 50 } // if 51 #endif // RTLD_NEXT 56 52 57 return do_interpose_symbol( library, symbol, version);53 return do_interpose_symbol( library, symbol, version ); 58 54 } 59 55 … … 83 79 #pragma GCC diagnostic push 84 80 #pragma GCC diagnostic ignored "-Wdiscarded-qualifiers" 85 INTERPOSE( pthread_create 86 INTERPOSE( pthread_join 87 INTERPOSE( pthread_self 88 INTERPOSE( pthread_attr_init 89 INTERPOSE( pthread_attr_destroy 90 INTERPOSE( pthread_attr_setstack 91 INTERPOSE( pthread_attr_getstacksize 92 INTERPOSE( pthread_sigmask 93 INTERPOSE( pthread_sigqueue 94 INTERPOSE( pthread_once 81 INTERPOSE( pthread_create, version ); 82 INTERPOSE( pthread_join, version ); 83 INTERPOSE( pthread_self, version ); 84 INTERPOSE( pthread_attr_init, version ); 85 INTERPOSE( pthread_attr_destroy, version ); 86 INTERPOSE( pthread_attr_setstack, version ); 87 INTERPOSE( pthread_attr_getstacksize, version ); 88 INTERPOSE( pthread_sigmask, version ); 89 INTERPOSE( pthread_sigqueue, version ); 90 INTERPOSE( pthread_once, version ); 95 91 #pragma GCC diagnostic pop 96 92 } -
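Both interpose files above boil down to the same mechanic: look up the next definition of a symbol with dlsym/dlvsym (via RTLD_NEXT where available) and call through it. Below is a self-contained sketch of that mechanic, independent of the CFA plumbing, wrapping write(2); build it as a shared object and LD_PRELOAD it.

#define _GNU_SOURCE                       // RTLD_NEXT
#include <dlfcn.h>
#include <unistd.h>

ssize_t write( int fd, const void * buf, size_t count ) {
    static ssize_t (* real_write)( int, const void *, size_t ) = 0;
    if ( ! real_write ) {
        real_write = (ssize_t (*)( int, const void *, size_t ))dlsym( RTLD_NEXT, "write" );
        if ( ! real_write ) _exit( 1 );   // cannot report via stdio here: that would recurse into write
    }
    // observe or adjust the call here, then forward it
    return real_write( fd, buf, count );
}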
libcfa/src/iostream.cfa
r34b4268 r24d6572 10 10 // Created On : Wed May 27 17:56:53 2015 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Sat Aug 27 15:04:15 202213 // Update Count : 13 5812 // Last Modified On : Mon Jan 9 09:27:58 2023 13 // Update Count : 1361 14 14 // 15 15 … … 667 667 } /* if */ \ 668 668 if ( ! f.flags.nobsdp || (exp10 < SUFFIXES_START) || (exp10 > SUFFIXES_END) ) { \ 669 len2 = snprintf( &buf[len], size - len, "e%d", exp10); \669 len2 = snprintf( &buf[len], size - len, "e%d", (int)exp10 /* ambiguity with function exp10 */ ); \ 670 670 } else { \ 671 671 len2 = snprintf( &buf[len], size - len, "%s", suffixes[(exp10 - SUFFIXES_START) / 3] ); \ -
libcfa/src/iostream.hfa
r34b4268 r24d6572 10 10 // Created On : Wed May 27 17:56:53 2015 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Sun Oct 10 10:02:07 202113 // Update Count : 4 0712 // Last Modified On : Thu Feb 2 11:25:39 2023 13 // Update Count : 410 14 14 // 15 15 … … 22 22 23 23 24 trait basic_ostream( ostype & ) { 24 forall( ostype & ) 25 trait basic_ostream { 25 26 // private 26 27 bool sepPrt$( ostype & ); // get separator state (on/off) … … 51 52 }; // basic_ostream 52 53 53 trait ostream( ostype & | basic_ostream( ostype ) ) { 54 forall( ostype & | basic_ostream( ostype ) ) 55 trait ostream { 54 56 bool fail( ostype & ); // operation failed? 55 57 void clear( ostype & ); … … 60 62 }; // ostream 61 63 62 // trait writeable( T ) { 64 // forall( T ) 65 // trait writeable { 63 66 // forall( ostype & | ostream( ostype ) ) ostype & ?|?( ostype &, T ); 64 67 // }; // writeable 65 68 66 trait writeable( T, ostype & | ostream( ostype ) ) { 69 forall( T, ostype & | ostream( ostype ) ) 70 trait writeable { 67 71 ostype & ?|?( ostype &, T ); 68 72 }; // writeable … … 290 294 291 295 292 trait basic_istream( istype & ) { 296 forall( istype & ) 297 trait basic_istream { 293 298 // private 294 299 bool getANL$( istype & ); // get scan newline (on/off) … … 302 307 }; // basic_istream 303 308 304 trait istream( istype & | basic_istream( istype ) ) { 309 forall( istype & | basic_istream( istype ) ) 310 trait istream { 305 311 bool fail( istype & ); 306 312 void clear( istype & ); … … 310 316 }; // istream 311 317 312 trait readable( T ) { 318 forall( T ) 319 trait readable { 313 320 forall( istype & | istream( istype ) ) istype & ?|?( istype &, T ); 314 321 }; // readable -
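With the writeable trait above, making a user type printable is a matter of supplying one polymorphic ?|? operator. A sketch under the usual CFA I/O conventions; the point type is invented, and sout/nl are assumed to come from fstream.hfa as elsewhere in the library.

#include <fstream.hfa>                    // sout, nl (assumed)

struct point { double x, y; };

forall( ostype & | ostream( ostype ) )    // satisfies writeable( point, ostype )
ostype & ?|?( ostype & os, point p ) {
    return os | '(' | p.x | ',' | p.y | ')';
}

int main() {
    point p = { 1.0, 2.0 };
    sout | "p =" | p | nl;
}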
libcfa/src/iterator.hfa
r34b4268 r24d6572 10 10 // Created On : Wed May 27 17:56:53 2015 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Fri Jul 7 08:37:25 201713 // Update Count : 1 012 // Last Modified On : Thu Feb 2 11:21:50 2023 13 // Update Count : 11 14 14 // 15 15 … … 17 17 18 18 // An iterator can be used to traverse a data structure. 19 trait iterator( iterator_type, elt_type ) { 19 forall( iterator_type, elt_type ) 20 trait iterator { 20 21 // point to the next element 21 22 // iterator_type ?++( iterator_type & ); … … 31 32 }; 32 33 33 trait iterator_for( iterator_type, collection_type, elt_type | iterator( iterator_type, elt_type ) ) { 34 forall( iterator_type, collection_type, elt_type | iterator( iterator_type, elt_type ) ) 35 trait iterator_for { 34 36 // [ iterator_type begin, iterator_type end ] get_iterators( collection_type ); 35 37 iterator_type begin( collection_type ); -
libcfa/src/limits.cfa
r34b4268 r24d6572 10 10 // Created On : Wed Apr 6 18:06:52 2016 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Thu Mar 1 16:22:51 201813 // Update Count : 7412 // Last Modified On : Fri Feb 17 12:25:39 2023 13 // Update Count : 87 14 14 // 15 15 16 16 #include <limits.h> 17 17 #include <float.h> 18 #define __USE_GNU // get M_* constants19 18 #include <math.h> 20 19 #include <complex.h> -
libcfa/src/math.trait.hfa
r34b4268 r24d6572 10 10 // Created On : Fri Jul 16 15:40:52 2021 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : T ue Jul 20 17:47:19 202113 // Update Count : 1912 // Last Modified On : Thu Feb 2 11:36:56 2023 13 // Update Count : 20 14 14 // 15 15 16 16 #pragma once 17 17 18 trait Not( U ) { 18 forall( U ) 19 trait Not { 19 20 void ?{}( U &, zero_t ); 20 21 int !?( U ); 21 22 }; // Not 22 23 23 trait Equality( T | Not( T ) ) { 24 forall( T | Not( T ) ) 25 trait Equality { 24 26 int ?==?( T, T ); 25 27 int ?!=?( T, T ); 26 28 }; // Equality 27 29 28 trait Relational( U | Equality( U ) ) { 30 forall( U | Equality( U ) ) 31 trait Relational { 29 32 int ?<?( U, U ); 30 33 int ?<=?( U, U ); … … 33 36 }; // Relational 34 37 35 trait Signed( T ) { 38 forall ( T ) 39 trait Signed { 36 40 T +?( T ); 37 41 T -?( T ); … … 39 43 }; // Signed 40 44 41 trait Additive( U | Signed( U ) ) { 45 forall( U | Signed( U ) ) 46 trait Additive { 42 47 U ?+?( U, U ); 43 48 U ?-?( U, U ); … … 46 51 }; // Additive 47 52 48 trait Incdec( T | Additive( T ) ) { 53 forall( T | Additive( T ) ) 54 trait Incdec { 49 55 void ?{}( T &, one_t ); 50 56 // T ?++( T & ); … … 54 60 }; // Incdec 55 61 56 trait Multiplicative( U | Incdec( U ) ) { 62 forall( U | Incdec( U ) ) 63 trait Multiplicative { 57 64 U ?*?( U, U ); 58 65 U ?/?( U, U ); … … 61 68 }; // Multiplicative 62 69 63 trait Arithmetic( T | Relational( T ) | Multiplicative( T ) ) { 70 forall( T | Relational( T ) | Multiplicative( T ) ) 71 trait Arithmetic { 64 72 }; // Arithmetic 65 73 -
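The traits above still form the same ladder after the syntax change (Not -> Equality -> Relational on the comparison side, Signed -> Additive -> Incdec -> Multiplicative on the arithmetic side, tied together by Arithmetic). Client code asserts the combined trait the same way the rest of this library does; sum3 and its call are illustrative only.

#include <math.trait.hfa>                 // assumed to be includable directly

forall( T | Arithmetic( T ) )             // pulls in ?+? via Additive, comparisons via Relational
T sum3( T a, T b, T c ) {
    return a + b + c;
}

int main() {
    int s = sum3( 1, 2, 3 );              // any type satisfying Arithmetic, e.g. the integral types
    return s == 6 ? 0 : 1;
}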
libcfa/src/stdlib.cfa
r34b4268 r24d6572 10 10 // Created On : Thu Jan 28 17:10:29 2016 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Fri Dec 9 15:11:30 202213 // Update Count : 63 112 // Last Modified On : Thu Feb 16 16:31:34 2023 13 // Update Count : 633 14 14 // 15 15 … … 20 20 //--------------------------------------- 21 21 22 #define _XOPEN_SOURCE 600 // posix_memalign, *rand4823 22 #include <string.h> // memcpy, memset 24 23 //#include <math.h> // fabsf, fabs, fabsl -
libcfa/src/stdlib.hfa
r34b4268 r24d6572 10 10 // Created On : Thu Jan 28 17:12:35 2016 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Sun Dec 11 18:25:53 202213 // Update Count : 76 512 // Last Modified On : Thu Feb 2 11:30:04 2023 13 // Update Count : 766 14 14 // 15 15 … … 367 367 368 368 char random( void ) { return (unsigned long int)random(); } 369 char random( char u ) { return random( (unsigned long int)u ); } // [0,u)369 char random( char u ) { return (unsigned long int)random( (unsigned long int)u ); } // [0,u) 370 370 char random( char l, char u ) { return random( (unsigned long int)l, (unsigned long int)u ); } // [l,u) 371 371 int random( void ) { return (long int)random(); } 372 int random( int u ) { return random( (long int)u ); } // [0,u]372 int random( int u ) { return (long int)random( (long int)u ); } // [0,u] 373 373 int random( int l, int u ) { return random( (long int)l, (long int)u ); } // [l,u) 374 374 unsigned int random( void ) { return (unsigned long int)random(); } 375 unsigned int random( unsigned int u ) { return random( (unsigned long int)u ); } // [0,u]375 unsigned int random( unsigned int u ) { return (unsigned long int)random( (unsigned long int)u ); } // [0,u] 376 376 unsigned int random( unsigned int l, unsigned int u ) { return random( (unsigned long int)l, (unsigned long int)u ); } // [l,u) 377 377 } // distribution … … 404 404 // calls( sprng ); 405 405 406 trait basic_prng( PRNG &, R ) { 406 forall( PRNG &, R ) 407 trait basic_prng { 407 408 void set_seed( PRNG & prng, R seed ); // set seed 408 409 R get_seed( PRNG & prng ); // get seed -
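The distribution block above gives fixed-size wrappers over the 64-bit generator, with the intervals noted in the bracket comments. Typical use looks like the sketch below; it assumes the free-function set_seed( size_t ) form of seeding, and the dice example is made up.

#include <stdlib.hfa>
#include <stdio.h>

int main() {
    set_seed( 1009 );                     // seed the per-thread PRNG (assumed free-function form)
    int die = random( 1, 7 );             // [l,u) per the comment above, so 1..6
    unsigned int pct = random( 0u, 100u );// [0,100)
    char ch = random( 'a', 'z' );         // [l,u) over chars
    printf( "%d %u %c\n", die, pct, ch );
}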
libcfa/src/vec/vec.hfa
r34b4268 r24d6572 18 18 #include <math.hfa> 19 19 20 trait fromint(T) { 20 forall(T) 21 trait fromint { 21 22 void ?{}(T&, int); 22 23 }; 23 trait zeroinit(T) { 24 forall(T) 25 trait zeroinit { 24 26 void ?{}(T&, zero_t); 25 27 }; 26 trait zero_assign(T) { 28 forall(T) 29 trait zero_assign { 27 30 T ?=?(T&, zero_t); 28 31 }; 29 trait subtract(T) { 32 forall(T) 33 trait subtract { 30 34 T ?-?(T, T); 31 35 }; 32 trait negate(T) { 36 forall(T) 37 trait negate { 33 38 T -?(T); 34 39 }; 35 trait add(T) { 40 forall(T) 41 trait add { 36 42 T ?+?(T, T); 37 43 }; 38 trait multiply(T) { 44 forall(T) 45 trait multiply { 39 46 T ?*?(T, T); 40 47 }; 41 trait divide(T) { 48 forall(T) 49 trait divide { 42 50 T ?/?(T, T); 43 51 }; 44 trait lessthan(T) { 52 forall(T) 53 trait lessthan { 45 54 int ?<?(T, T); 46 55 }; 47 trait equality(T) { 56 forall(T) 57 trait equality { 48 58 int ?==?(T, T); 49 59 }; 50 trait sqrt(T) { 60 forall(T) 61 trait sqrt { 51 62 T sqrt(T); 52 63 };