Diff [6a8208cbc93288ea3ca61f2a04619465c12cf317:59f3f616c737817ce7af766fa1bcf77a79d695d9] for / – Cforall

benchmark/benchcltr.hfa

-              r6a8208cb
+              r59f3f61
         for() {
                 sleep(100`ms);
                 end = getTimeNsec();
+                end = timeHiRes();
                 Duration delta = end - start;
                 /*if(is_tty)*/ {
 …
+}
 #else
 uint64_t getTimeNsec() {
+uint64_t timeHiRes() {
         timespec curr;
         clock_gettime( CLOCK_REALTIME, &curr );
 …
         for(;;) {
                 usleep(100000);
                 end = getTimeNsec();
+                end = timeHiRes();
                 uint64_t delta = end - start;
                 /*if(is_tty)*/ {

benchmark/io/http/protocol.cfa

r6a8208cb	r59f3f61
228	228
229	229	char buff[100];
230		Time now = ~~getTimeNsec~~();
	230	Time now = timeHiRes();
231	231	strftime( buff, 100, "%a, %d %b %Y %H:%M:%S %Z", now );
232	232	sout \| "Updated date to '" \| buff \| "'";

benchmark/io/readv-posix.c

-              r6a8208cb
+              r59f3f61
                                 printf("Starting\n");
                                 bool is_tty = isatty(STDOUT_FILENO);
                                 start = getTimeNsec();
+                                start = timeHiRes();
                                 run = true;
 …
                                 run = false;
                                 end = getTimeNsec();
+                                end = timeHiRes();
                                 printf("\nDone\n");

benchmark/io/readv.cfa

-              r6a8208cb
+              r59f3f61
                                 printf("Starting\n");
                                 bool is_tty = isatty(STDOUT_FILENO);
                                 start = getTimeNsec();
+                                start = timeHiRes();
                                 run = true;
 …
                                 run = false;
                                 end = getTimeNsec();
+                                end = timeHiRes();
                                 printf("\nDone\n");
+                        }

benchmark/readyQ/cycle.cc

-              r6a8208cb
+              r59f3f61
                         bool is_tty = isatty(STDOUT_FILENO);
                         start = getTimeNsec();
+                        start = timeHiRes();
                         for(int i = 0; i < nthreads; i++) {
 …
                         stop = true;
                         end = getTimeNsec();
+                        end = timeHiRes();
                         printf("\nDone\n");

benchmark/readyQ/cycle.cfa

-              r6a8208cb
+              r59f3f61
                         bool is_tty = isatty(STDOUT_FILENO);
                         start = getTimeNsec();
+                        start = timeHiRes();
                         for(i; nthreads) {
 …
                         stop = true;
                         end = getTimeNsec();
+                        end = timeHiRes();
                         printf("\nDone\n");

benchmark/readyQ/cycle.cpp

-              r6a8208cb
+              r59f3f61
                         bool is_tty = isatty(STDOUT_FILENO);
                         start = getTimeNsec();
+                        start = timeHiRes();
                         for(int i = 0; i < nthreads; i++) {
 …
                         stop = true;
                         end = getTimeNsec();
+                        end = timeHiRes();
                         printf("\nDone\n");

benchmark/readyQ/locality.cc

-              r6a8208cb
+              r59f3f61
                         bool is_tty = isatty(STDOUT_FILENO);
                         start = getTimeNsec();
+                        start = timeHiRes();
                         for(size_t i = 0; i < nthreads; i++) {
 …
                         stop = true;
                         end = getTimeNsec();
+                        end = timeHiRes();
                         printf("\nDone\n");

benchmark/readyQ/locality.cfa

-              r6a8208cb
+              r59f3f61
                         bool is_tty = isatty(STDOUT_FILENO);
                         start = getTimeNsec();
+                        start = timeHiRes();
                         for(i; nthreads) {
 …
                         stop = true;
                         end = getTimeNsec();
+                        end = timeHiRes();
                         printf("\nDone\n");

benchmark/readyQ/locality.cpp

-              r6a8208cb
+              r59f3f61
                         bool is_tty = isatty(STDOUT_FILENO);
                         start = getTimeNsec();
+                        start = timeHiRes();
                         for(size_t i = 0; i < nthreads; i++) {
 …
                         stop = true;
                         end = getTimeNsec();
+                        end = timeHiRes();
                         printf("\nDone\n");

benchmark/readyQ/rq_bench.hfa

r6a8208cb	r59f3f61
73	73	for() {
74	74	sleep(100`ms);
75		Time end = ~~getTimeNsec~~();
	75	Time end = timeHiRes();
76	76	Duration delta = end - start;
77	77	if(is_tty) {

benchmark/readyQ/rq_bench.hpp

-              r6a8208cb
+              r59f3f61
+        }
 uint64_t getTimeNsec() {
+uint64_t timeHiRes() {
         timespec curr;
         clock_gettime( CLOCK_REALTIME, &curr );
 …
         for(;;) {
                 Sleeper::usleep(100000);
                 uint64_t end = getTimeNsec();
+                uint64_t end = timeHiRes();
                 uint64_t delta = end - start;
                 if(is_tty) {

benchmark/readyQ/yield.cfa

-              r6a8208cb
+              r59f3f61
                                 bool is_tty = isatty(STDOUT_FILENO);
                                 start = getTimeNsec();
+                                start = timeHiRes();
                                 run = true;
 …
                                 run = false;
                                 end = getTimeNsec();
+                                end = timeHiRes();
                                 printf("\nDone\n");
+                        }

doc/theses/thierry_delisle_PhD/code/readyQ_proto/links.hpp

r6a8208cb	r59f3f61
117	117	}
118	118
119		long long ts() const {
	119	unsigned long long ts() const {
120	120	return before._links.ts;
121	121	}

doc/theses/thierry_delisle_PhD/code/readyQ_proto/links2.hpp

-              r6a8208cb
+              r59f3f61
 template<typename node_t>
 class mpsc_queue : private mcs_queue<node_t> {
         node_t * volatile head;
+        node_t * volatile _head;
 public:
         mpsc_queue(): mcs_queue<node_t>(), head(nullptr) {}
+        mpsc_queue(): mcs_queue<node_t>(), _head(nullptr) {}
         inline bool empty() const { return mcs_queue<node_t>::empty(); }
+        node_t * head() const { return _head; }
         // Added a new element to the queue
 …
         inline node_t * push(node_t * elem) {
                 node_t * prev = mcs_queue<node_t>::push(elem);
                 if (!prev) head = elem;
+                if (!prev) _head = elem;
                 return prev;
+        }
 …
         // NOT Multi-Thread Safe
         inline node_t * pop(node_t *& next) {
                 node_t * elem = head;
+                node_t * elem = _head;
                 // If head is empty just return
                 if (!elem) return nullptr;
 …
                 // If there is already someone in the list, then it's easy
                 if (elem->_links.next) {
                         head = next = elem->_links.next;
+                        _head = next = elem->_links.next;
                         // force memory sync
                         __atomic_thread_fence(__ATOMIC_SEQ_CST);
 …
                         // at the CAS in advance and therefore can write to head
                         // after that point, it could overwrite the write in push
                         head = nullptr;
+                        _head = nullptr;
                         next = mcs_queue<node_t>::advance(elem);
 …
                         // it is the only way we can guarantee we are not overwriting
                         // a write made in push
                         if (next) head = next;
+                        if (next) _head = next;
+                }

doc/theses/thierry_delisle_PhD/code/readyQ_proto/utils.hpp

r6a8208cb	r59f3f61
11	11	#include <sys/sysinfo.h>
12	12
13		#include <x86intrin.h>
	13	// #include <x86intrin.h>
14	14
15	15	// class Random {

doc/theses/thierry_delisle_PhD/code/readyQ_proto/work_stealing.hpp

-              r6a8208cb
+              r59f3f61
 #include "snzi.hpp"
 #include <x86intrin.h>
+// #include <x86intrin.h>
 using namespace std;
 …
 template<typename node_t>
 struct __attribute__((aligned(128))) localQ_t {
+        mpsc_queue<node_t> queue = {};
+        spinlock_t lock = {};
+        bool needs_help = true;
+        #ifdef NO_MPSC
+                intrusive_queue_t<node_t> list;
+                inline auto ts() { return list.ts(); }
+                inline auto lock() { return list.lock.lock(); }
+                inline auto try_lock() { return list.lock.try_lock(); }
+                inline auto unlock() { return list.lock.unlock(); }
+                inline auto push( node_t * node ) { return list.push( node ); }
+                inline auto pop() { return list.pop(); }
+        #else
+                mpsc_queue<node_t> queue = {};
+                spinlock_t _lock = {};
+                inline auto ts() { auto h = queue.head(); return h ? h->_links.ts : 0ull; }
+                inline auto lock() { return _lock.lock(); }
+                inline auto try_lock() { return _lock.try_lock(); }
+                inline auto unlock() { return _lock.unlock(); }
+                inline auto push( node_t * node ) { return queue.push( node ); }
+                inline auto pop() { return queue.pop(); }
+        #endif
 };
 …
         work_stealing(unsigned _numThreads, unsigned)
                 : numThreads(_numThreads * nqueues)
+                , lists(new intrusive_queue_t<node_t>[numThreads])
+                , lists(new localQ_t<node_t>[numThreads])
+                // , lists(new intrusive_queue_t<node_t>[numThreads])
                 , times(new timestamp_t[numThreads])
                 // , snzi( std::log2( numThreads / 2 ), 2 )
 …
         __attribute__((noinline, hot)) void push(node_t * node) {
                 // node->_links.ts = rdtscl();
                 node->_links.ts = 1;
+                node->_links.ts = rdtscl();
+                // node->_links.ts = 1;
                 auto & list = *({
                         unsigned i;
+                        do {
+                        #ifdef NO_MPSC
+                                do {
+                        #endif
                                 tls.stats.push.attempt++;
                                 // unsigned r = tls.rng1.next();
 …
                                         i = tls.my_queue + (r % nqueues);
+                                }
+                        } while(!lists[i].lock.try_lock());
+                        #ifdef NO_MPSC
+                                } while(!lists[i].try_lock());
+                        #endif
                         &lists[i];
                 });
                 list.push( node );
+                list.lock.unlock();
+                #ifdef NO_MPSC
+                        list.unlock();
+                #endif
                 // tls.rng2.set_raw_state( tls.rng1.get_raw_state());
                 // count++;
 …
         __attribute__((noinline, hot)) node_t * pop() {
-                if( tls.myfriend == outside ) {
-                        auto r  = tls.rng1.next();
-                        tls.myfriend = r % numThreads;
-                        times[tls.myfriend].val = 0;
+                }
-                else if(times[tls.myfriend].val == 0) {
-                        node_t * n = try_pop(tls.myfriend, tls.stats.pop.help);
-                        tls.stats.help++;
-                        tls.myfriend = outside;
-                        if(n) return n;
+                }
                 if(tls.my_queue != outside) {
+                        // if( tls.myfriend == outside ) {
+                        //      auto r  = tls.rng1.next();
+                        //      tls.myfriend = r % numThreads;
+                        //      // assert(lists[(tls.it % nqueues) + tls.my_queue].ts() >= lists[((tls.it + 1) % nqueues) + tls.my_queue].ts());
+                        //      tls.mytime = std::min(lists[(tls.it % nqueues) + tls.my_queue].ts(), lists[((tls.it + 1) % nqueues) + tls.my_queue].ts());
+                        //      // times[tls.myfriend].val = 0;
+                        //      // lists[tls.myfriend].val = 0;
+                        // }
+                        // // else if(times[tls.myfriend].val == 0) {
+                        // // else if(lists[tls.myfriend].val == 0) {
+                        // else if(times[tls.myfriend].val < tls.mytime) {
+                        // // else if(times[tls.myfriend].val < lists[(tls.it % nqueues) + tls.my_queue].ts()) {
+                        //      node_t * n = try_pop(tls.myfriend, tls.stats.pop.help);
+                        //      tls.stats.help++;
+                        //      tls.myfriend = outside;
+                        //      if(n) return n;
+                        // }
+                        // if( tls.myfriend == outside ) {
+                        //      auto r  = tls.rng1.next();
+                        //      tls.myfriend = r % numThreads;
+                        //      tls.mytime = lists[((tls.it + 1) % nqueues) + tls.my_queue].ts();
+                        // }
+                        // else {
+                        //      if(times[tls.myfriend].val + 1000 < tls.mytime) {
+                        //              node_t * n = try_pop(tls.myfriend, tls.stats.pop.help);
+                        //              tls.stats.help++;
+                        //              if(n) return n;
+                        //      }
+                        //      tls.myfriend = outside;
+                        // }
                         node_t * n = local();
                         if(n) return n;
 …
 private:
         inline node_t * local() {
-                // unsigned i = (tls.rng2.prev() % 4) + tls.my_queue;
                 unsigned i = (--tls.it % nqueues) + tls.my_queue;
+                node_t * n = try_pop(i, tls.stats.pop.local);
+                if(n) return n;
+                i = (--tls.it % nqueues) + tls.my_queue;
                 return try_pop(i, tls.stats.pop.local);
+        }
 …
                 // If we can't get the lock, move on
+                if( !list.lock.try_lock() ) { stat.elock++; return nullptr; }
+                if( !list.try_lock() ) { stat.elock++; return nullptr; }
                 // If list is empty, unlock and retry
                 if( list.ts() == 0 ) {
                         list.lock.unlock();
+                        list.unlock();
                         stat.eempty++;
                         return nullptr;
 …
                 auto node = list.pop();
                 list.lock.unlock();
+                list.unlock();
                 stat.success++;
+                times[i].val = 1; //node.first->_links.ts;
+                // count--;
+                // _mm_stream_si64((long long int*)&times[i].val, node.first->_links.ts);
+                return node.first;
+                #ifdef NO_MPSC
+                        // times[i].val = 1;
+                        times[i].val = node.first->_links.ts;
+                        // lists[i].val = node.first->_links.ts;
+                        return node.first;
+                #else
+                        times[i].val = node->_links.ts;
+                        return node;
+                #endif
+        }
 …
                 unsigned   my_queue = calc_preferred();
                 unsigned   myfriend = outside;
+                unsigned long long int mytime = 0;
                 #if defined(READ)
                         unsigned it = 0;
 …
 private:
         const unsigned numThreads;
+        std::unique_ptr<intrusive_queue_t<node_t> []> lists;
+        std::unique_ptr<localQ_t<node_t> []> lists;
+        // std::unique_ptr<intrusive_queue_t<node_t> []> lists;
         std::unique_ptr<timestamp_t []> times;
         __attribute__((aligned(128))) std::atomic_size_t count;

example/io/batch-readv.c

-              r6a8208cb
+              r59f3f61
+}
 uint64_t getTimeNsec() {
+uint64_t timeHiRes() {
         timespec curr;
         clock_gettime( CLOCK_REALTIME, &curr );
 …
         printf("Running for %f second, reading %d bytes in batches of %d\n", duration, buflen, batch);
         uint64_t start = getTimeNsec();
         uint64_t end   = getTimeNsec();
         uint64_t prev  = getTimeNsec();
+        uint64_t start = timeHiRes();
+        uint64_t end   = timeHiRes();
+        uint64_t prev  = timeHiRes();
         for(;;) {
                 submit_and_drain(&iov, batch);
                 end = getTimeNsec();
+                end = timeHiRes();
                 uint64_t delta = end - start;
                 if( to_fseconds(end - prev) > 0.1 ) {

libcfa/src/bits/weakso_locks.cfa

r6a8208cb	r59f3f61
25	25	void unlock( blocking_lock & ) {}
26	26	void on_notify( blocking_lock &, struct $thread * ) {}
27		size_t on_wait( blocking_lock & ) {}
	27	size_t on_wait( blocking_lock & ) { return 0; }
28	28	void on_wakeup( blocking_lock &, size_t ) {}
29	29	size_t wait_count( blocking_lock & ) { return 0; }

libcfa/src/clock.hfa

-              r6a8208cb
+              r59f3f61
 // Created On       : Thu Apr 12 14:36:06 2018
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Mon Jan  6 12:49:58 2020
 // Update Count     : 9
+// Last Modified On : Sun Apr 18 08:12:16 2021
+// Update Count     : 28
 //
 …
 //######################### Clock #########################
+struct Clock {                                                                                  // private
+        Duration offset;                                                                        // for virtual clock: contains offset from real-time
+struct Clock {                                                                                  // virtual clock
+        // private
+        Duration offset;                                                                        // offset from computer real-time
 };
 static inline {
         void resetClock( Clock & clk, Duration adj ) with( clk ) {
+        void reset( Clock & clk, Duration adj ) with( clk ) { // change offset
                 offset = adj + __timezone`s;                                    // timezone (global) is (UTC - local time) in seconds
         } // resetClock
+        } // reset
+        void ?{}( Clock & clk, Duration adj ) { resetClock( clk, adj ); }
+        void ?{}( Clock & clk ) { reset( clk, (Duration){ 0 } ); } // create no offset
+        void ?{}( Clock & clk, Duration adj ) { reset( clk, adj ); } // create with offset
+        Duration getResNsec() {
+        // System-wide clock that measures real (i.e., wall-clock) time. This clock is affected by discontinuous jumps in
+        // the system time. For example, manual changes of the clock, incremental adjustments performed by adjtime(3)
+        // and NTP, and daylight saving (fall back).
+        Duration resolutionHi() {                                                       // clock resolution in nanoseconds (fine)
                 struct timespec res;
                 clock_getres( CLOCK_REALTIME, &res );
                 return ((int64_t)res.tv_sec * TIMEGRAN + res.tv_nsec)`ns;
         } // getRes
+        } // resolutionHi
         Duration getRes() {
+        Duration resolution() {                                                         // clock resolution without nanoseconds (coarse)
                 struct timespec res;
                 clock_getres( CLOCK_REALTIME_COARSE, &res );
                 return ((int64_t)res.tv_sec * TIMEGRAN + res.tv_nsec)`ns;
         } // getRes
+        } // resolution
         Time getTimeNsec() {                                                            // with nanoseconds
+        Time timeHiRes() {                                                                      // real time with nanoseconds
                 timespec curr;
                 clock_gettime( CLOCK_REALTIME, &curr );
                 return (Time){ curr };
         } // getTimeNsec
+        } // timeHiRes
         Time getTime() {                                                                        // without nanoseconds
+        Time time() {                                                                           // real time without nanoseconds
                 timespec curr;
                 clock_gettime( CLOCK_REALTIME_COARSE, &curr );
                 curr.tv_nsec = 0;
                 return (Time){ curr };
         } // getTime
+        } // time
         Time getTime( Clock & clk ) with( clk ) {
                 return getTime() + offset;
         } // getTime
+        Time time( Clock & clk ) with( clk ) {                          // real time for given clock
+                return time() + offset;
+        } // time
         Time ?()( Clock & clk ) with( clk ) {                           // alternative syntax
                 return getTime() + offset;
         } // getTime
+                return time() + offset;
+        } // ?()
         timeval getTime( Clock & clk ) {
+        timeval time( Clock & clk ) {                                           // convert to C time format
                 return (timeval){ clk() };
         } // getTime
+        } // time
         tm getTime( Clock & clk ) with( clk ) {
+        tm time( Clock & clk ) with( clk ) {
                 tm ret;
                 localtime_r( getTime( clk ).tv_sec, &ret );
+                localtime_r( time( clk ).tv_sec, &ret );
                 return ret;
         } // getTime
+        } // time
+        Time getCPUTime() {
+        // CFA processor CPU-time watch that ticks when the processor (kernel thread) is running. This watch is affected by
+        // discontinuous jumps when the OS is not running the kernel thread. A duration is returned because the value is
+        // relative and cannot be converted to real-time (wall-clock) time.
+        Duration processor() {                                                          // non-monotonic duration of kernel thread
                 timespec ts;
                 clock_gettime( CLOCK_THREAD_CPUTIME_ID, &ts );
+                return (Time){ ts };
+    } // getCPUTime
+                return (Duration){ ts };
+        } // processor
+        // Program CPU-time watch measures CPU time consumed by all processors (kernel threads) in the UNIX process.  This
+        // watch is affected by discontinuous jumps when the OS is not running the kernel threads. A duration is returned
+        // because the value is relative and cannot be converted to real-time (wall-clock) time.
+        Duration program() {                                                            // non-monotonic duration of program CPU
+                timespec ts;
+                clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &ts );
+                return (Duration){ ts };
+        } // program
+        // Monotonic duration from machine boot and including system suspension. This watch is unaffected by discontinuous
+        // jumps resulting from manual changes of the clock, and incremental adjustments performed by adjtime(3) and NTP
+        // (Fall back). A duration is returned because the value is relative and cannot be converted to real-time
+        // (wall-clock) time.
+        Duration boot() {                                                                       // monotonic duration since computer boot
+                timespec ts;
+                clock_gettime( CLOCK_BOOTTIME, &ts );
+                return (Duration){ ts };
+        } // boot
 } // distribution

libcfa/src/concurrency/invoke.h

r6a8208cb	r59f3f61
148	148	struct $thread * prev;
149	149	volatile unsigned long long ts;
150		~~int preferred;~~
151	150	};
152	151

libcfa/src/concurrency/io/call.cfa.in

-              r6a8208cb
+              r59f3f61
                 sqe->opcode = IORING_OP_{op};
                 sqe->user_data = (__u64)(uintptr_t)&future;
+                sqe->user_data = (uintptr_t)&future;
                 sqe->flags = sflags;
                 sqe->ioprio = 0;
 …
                 asm volatile("": : :"memory");
                 verify( sqe->user_data == (__u64)(uintptr_t)&future );
+                verify( sqe->user_data == (uintptr_t)&future );
                 cfa_io_submit( ctx, &idx, 1, 0 != (submit_flags & CFA_IO_LAZY) );
         #endif
 …
                 'fd'  : 'fd',
                 'off' : 'offset',
                 'addr': '(__u64)iov',
+                'addr': '(uintptr_t)iov',
                 'len' : 'iovcnt',
         }, define = 'CFA_HAVE_PREADV2'),
 …
                 'fd'  : 'fd',
                 'off' : 'offset',
                 'addr': '(__u64)iov',
+                'addr': '(uintptr_t)iov',
                 'len' : 'iovcnt'
         }, define = 'CFA_HAVE_PWRITEV2'),
 …
                 'addr': 'fd',
                 'len': 'op',
                 'off': '(__u64)event'
+                'off': '(uintptr_t)event'
         }),
         # CFA_HAVE_IORING_OP_SYNC_FILE_RANGE
 …
         Call('SENDMSG', 'ssize_t sendmsg(int sockfd, const struct msghdr *msg, int flags)', {
                 'fd': 'sockfd',
                 'addr': '(__u64)(struct msghdr *)msg',
+                'addr': '(uintptr_t)(struct msghdr *)msg',
                 'len': '1',
                 'msg_flags': 'flags'
 …
         Call('RECVMSG', 'ssize_t recvmsg(int sockfd, struct msghdr *msg, int flags)', {
                 'fd': 'sockfd',
                 'addr': '(__u64)(struct msghdr *)msg',
+                'addr': '(uintptr_t)(struct msghdr *)msg',
                 'len': '1',
                 'msg_flags': 'flags'
 …
         Call('SEND', 'ssize_t send(int sockfd, const void *buf, size_t len, int flags)', {
                 'fd': 'sockfd',
                 'addr': '(__u64)buf',
+                'addr': '(uintptr_t)buf',
                 'len': 'len',
                 'msg_flags': 'flags'
 …
         Call('RECV', 'ssize_t recv(int sockfd, void *buf, size_t len, int flags)', {
                 'fd': 'sockfd',
                 'addr': '(__u64)buf',
+                'addr': '(uintptr_t)buf',
                 'len': 'len',
                 'msg_flags': 'flags'
 …
         Call('ACCEPT', 'int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags)', {
                 'fd': 'sockfd',
                 'addr': '(__u64)addr',
                 'addr2': '(__u64)addrlen',
+                'addr': '(uintptr_t)addr',
+                'addr2': '(uintptr_t)addrlen',
                 'accept_flags': 'flags'
         }),
 …
         Call('CONNECT', 'int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen)', {
                 'fd': 'sockfd',
                 'addr': '(__u64)addr',
+                'addr': '(uintptr_t)addr',
                 'off': 'addrlen'
         }),
 …
         Call('FALLOCATE', 'int fallocate(int fd, int mode, off_t offset, off_t len)', {
                 'fd': 'fd',
                 'addr': '(__u64)len',
+                'addr': '(uintptr_t)len',
                 'len': 'mode',
                 'off': 'offset'
 …
         # CFA_HAVE_IORING_OP_MADVISE
         Call('MADVISE', 'int madvise(void *addr, size_t length, int advice)', {
                 'addr': '(__u64)addr',
+                'addr': '(uintptr_t)addr',
                 'len': 'length',
                 'fadvise_advice': 'advice'
 …
         Call('OPENAT', 'int openat(int dirfd, const char *pathname, int flags, mode_t mode)', {
                 'fd': 'dirfd',
                 'addr': '(__u64)pathname',
+                'addr': '(uintptr_t)pathname',
                 'len': 'mode',
                 'open_flags': 'flags;'
 …
                 'addr': 'pathname',
                 'len': 'sizeof(*how)',
                 'off': '(__u64)how',
+                'off': '(uintptr_t)how',
         }, define = 'CFA_HAVE_OPENAT2'),
         # CFA_HAVE_IORING_OP_CLOSE
 …
         Call('STATX', 'int statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf)', {
                 'fd': 'dirfd',
                 'off': '(__u64)statxbuf',
+                'off': '(uintptr_t)statxbuf',
                 'addr': 'pathname',
                 'len': 'mask',
 …
         Call('READ', 'ssize_t read(int fd, void * buf, size_t count)', {
                 'fd': 'fd',
                 'addr': '(__u64)buf',
+                'addr': '(uintptr_t)buf',
                 'len': 'count'
         }),
 …
         Call('WRITE', 'ssize_t write(int fd, void * buf, size_t count)', {
                 'fd': 'fd',
                 'addr': '(__u64)buf',
+                'addr': '(uintptr_t)buf',
                 'len': 'count'
         }),

libcfa/src/concurrency/kernel.cfa

-              r6a8208cb
+              r59f3f61
 static void __wake_one(cluster * cltr);
 static void push  (__cluster_idles & idles, processor & proc);
 static void remove(__cluster_idles & idles, processor & proc);
 static [unsigned idle, unsigned total, * processor] query( & __cluster_idles idles );
+static void mark_idle (__cluster_proc_list & idles, processor & proc);
+static void mark_awake(__cluster_proc_list & idles, processor & proc);
+static [unsigned idle, unsigned total, * processor] query_idles( & __cluster_proc_list idles );
 extern void __cfa_io_start( processor * );
 …
                                 // Push self to idle stack
                                 push(this->cltr->idles, * this);
+                                mark_idle(this->cltr->procs, * this);
                                 // Confirm the ready-queue is empty
 …
                                 if( readyThread ) {
                                         // A thread was found, cancel the halt
                                         remove(this->cltr->idles, * this);
+                                        mark_awake(this->cltr->procs, * this);
                                         #if !defined(__CFA_NO_STATISTICS__)
 …
                                 // We were woken up, remove self from idle
                                 remove(this->cltr->idles, * this);
+                                mark_awake(this->cltr->procs, * this);
                                 // DON'T just proceed, start looking again
 …
         ready_schedule_lock();
                 $thread * thrd = pop( this );
+                $thread * thrd = pop_fast( this );
         ready_schedule_unlock();
 …
         unsigned idle;
         unsigned total;
         [idle, total, p] = query(this->idles);
+        [idle, total, p] = query_idles(this->procs);
         // If no one is sleeping, we are done
 …
+}
 static void push  (__cluster_idles & this, processor & proc) {
+static void mark_idle(__cluster_proc_list & this, processor & proc) {
         /* paranoid */ verify( ! __preemption_enabled() );
         lock( this );
                 this.idle++;
                 /* paranoid */ verify( this.idle <= this.total );
                 insert_first(this.list, proc);
+                remove(proc);
+                insert_first(this.idles, proc);
         unlock( this );
         /* paranoid */ verify( ! __preemption_enabled() );
+}
 static void remove(__cluster_idles & this, processor & proc) {
+static void mark_awake(__cluster_proc_list & this, processor & proc) {
         /* paranoid */ verify( ! __preemption_enabled() );
         lock( this );
                 this.idle--;
                 /* paranoid */ verify( this.idle >= 0 );
                 remove(proc);
+                insert_last(this.actives, proc);
         unlock( this );
         /* paranoid */ verify( ! __preemption_enabled() );
+}
+static [unsigned idle, unsigned total, * processor] query( & __cluster_idles this ) {
+static [unsigned idle, unsigned total, * processor] query_idles( & __cluster_proc_list this ) {
+        /* paranoid */ verify( ! __preemption_enabled() );
+        /* paranoid */ verify( ready_schedule_islocked() );
         for() {
                 uint64_t l = __atomic_load_n(&this.lock, __ATOMIC_SEQ_CST);
 …
                 unsigned idle    = this.idle;
                 unsigned total   = this.total;
                 processor * proc = &this.list`first;
+                processor * proc = &this.idles`first;
                 // Compiler fence is unnecessary, but gcc-8 and older incorrectly reorder code without it
                 asm volatile("": : :"memory");
 …
                 return [idle, total, proc];
+        }
+        /* paranoid */ verify( ready_schedule_islocked() );
+        /* paranoid */ verify( ! __preemption_enabled() );
+}

libcfa/src/concurrency/kernel.hfa

-              r6a8208cb
+              r59f3f61
         struct cluster * cltr;
+        // Id within the cluster
+        unsigned cltr_id;
+        // Ready Queue state per processor (lane-selection bookkeeping used by push/pop_fast)
+        struct {
+                unsigned short its;             // push-side rotation counter: work-stealing push post-increments it to alternate between this processor's lanes
+                unsigned short itr;             // pop-side rotation counter: work-stealing pop_fast pre-decrements it to alternate between this processor's lanes
+                unsigned id;                    // index of this processor's first lane; local lanes are id + (n % READYQ_SHARD_FACTOR); initialised to -1u
+                unsigned target;                // index of a remote lane considered for helping; -1u means no target is currently selected
+                unsigned long long int cutoff;  // timestamp threshold: the target lane is only popped when its recorded timestamp is below this value
+        } rdq;
         // Set to true to notify the processor should terminate
 …
 // Cluster Tools
 // Intrusives lanes which are used by the relaxed ready queue
+// Intrusive lanes which are used by the ready queue
 struct __attribute__((aligned(128))) __intrusive_lane_t;
 void  ?{}(__intrusive_lane_t & this);
 void ^?{}(__intrusive_lane_t & this);
+// Counter used for whether or not the lanes are all empty
+struct __attribute__((aligned(128))) __snzi_node_t;
+struct __snzi_t {
+        unsigned mask;
+        int root;
+        __snzi_node_t * nodes;
+};
+void  ?{}( __snzi_t & this, unsigned depth );
+void ^?{}( __snzi_t & this );
+// Aligned timestamps which are used by the relaxed ready queue
+struct __attribute__((aligned(128))) __timestamp_t;
+void  ?{}(__timestamp_t & this);
+void ^?{}(__timestamp_t & this);
 //TODO adjust cache size to ARCHITECTURE
 // Structure holding the relaxed ready queue
 struct __ready_queue_t {
-        // Data tracking how many/which lanes are used
-        // Aligned to 128 for cache locality
-        __snzi_t snzi;
         // Data tracking the actual lanes
         // On a separate cacheline from the used struct since
 …
                 __intrusive_lane_t * volatile data;
+                // Array of times
+                __timestamp_t * volatile tscs;
                 // Number of lanes (empty or not)
                 volatile size_t count;
 …
 // Idle Sleep
 struct __cluster_idles {
+struct __cluster_proc_list {
         // Spin lock protecting the queue
         volatile uint64_t lock;
 …
         // List of idle processors
+        dlist(processor, processor) list;
+        dlist(processor, processor) idles;
+        // List of active processors
+        dlist(processor, processor) actives;
 };
 …
         // List of idle processors
         __cluster_idles idles;
+        __cluster_proc_list procs;
         // List of threads

libcfa/src/concurrency/kernel/startup.cfa

-              r6a8208cb
+              r59f3f61
         this.name = name;
         this.cltr = &_cltr;
+        this.rdq.its = 0;
+        this.rdq.itr = 0;
+        this.rdq.id  = -1u;
+        this.rdq.target = -1u;
+        this.rdq.cutoff = -1ull;
         do_terminate = false;
         preemption_alarm = 0p;
 …
         #endif
+        lock( this.cltr->idles );
+                int target = this.cltr->idles.total += 1u;
+        unlock( this.cltr->idles );
+        id = doregister((__processor_id_t*)&this);
+        // Register and Lock the RWlock so no-one pushes/pops while we are changing the queue
+        uint_fast32_t last_size = ready_mutate_register((__processor_id_t*)&this);
+                this.cltr->procs.total += 1u;
+                insert_last(this.cltr->procs.actives, this);
+                // Adjust the ready queue size
+                ready_queue_grow( cltr );
+        // Unlock the RWlock
+        ready_mutate_unlock( last_size );
+        __cfadbg_print_safe(runtime_core, "Kernel : core %p created\n", &this);
+}
+// Not a ctor, it just preps the destruction but should not destroy members
+static void deinit(processor & this) {
         // Lock the RWlock so no-one pushes/pops while we are changing the queue
         uint_fast32_t last_size = ready_mutate_lock();
+                this.cltr->procs.total -= 1u;
+                remove(this);
                 // Adjust the ready queue size
+                this.cltr_id = ready_queue_grow( cltr, target );
+        // Unlock the RWlock
+        ready_mutate_unlock( last_size );
+        __cfadbg_print_safe(runtime_core, "Kernel : core %p created\n", &this);
+}
+// Not a ctor, it just preps the destruction but should not destroy members
+static void deinit(processor & this) {
+        lock( this.cltr->idles );
+                int target = this.cltr->idles.total -= 1u;
+        unlock( this.cltr->idles );
+        // Lock the RWlock so no-one pushes/pops while we are changing the queue
+        uint_fast32_t last_size = ready_mutate_lock();
+                // Adjust the ready queue size
+                ready_queue_shrink( this.cltr, target );
+        // Unlock the RWlock
+        ready_mutate_unlock( last_size );
+        // Finally we don't need the read_lock any more
+        unregister((__processor_id_t*)&this);
+                ready_queue_shrink( this.cltr );
+        // Unlock the RWlock and unregister: we don't need the read_lock any more
+        ready_mutate_unregister((__processor_id_t*)&this, last_size );
         close(this.idle);
 …
 //-----------------------------------------------------------------------------
 // Cluster
 static void ?{}(__cluster_idles & this) {
+static void ?{}(__cluster_proc_list & this) {
         this.lock  = 0;
         this.idle  = 0;
         this.total = 0;
-        (this.list){};
+}
 …
                 // Adjust the ready queue size
                 ready_queue_grow( &this, 0 );
+                ready_queue_grow( &this );
         // Unlock the RWlock
 …
                 // Adjust the ready queue size
                 ready_queue_shrink( &this, 0 );
+                ready_queue_shrink( &this );
         // Unlock the RWlock

libcfa/src/concurrency/kernel_private.hfa

-              r6a8208cb
+              r59f3f61
 // Cluster lock API
 //=======================================================================
-// Cells use by the reader writer lock
-// while not generic it only relies on a opaque pointer
-struct __attribute__((aligned(128))) __scheduler_lock_id_t {
-        // Spin lock used as the underlying lock
-        volatile bool lock;
-        // Handle pointing to the proc owning this cell
-        // Used for allocating cells and debugging
-        __processor_id_t * volatile handle;
-        #ifdef __CFA_WITH_VERIFY__
-                // Debug, check if this is owned for reading
-                bool owned;
-        #endif
-};
-static_assert( sizeof(struct __scheduler_lock_id_t) <= __alignof(struct __scheduler_lock_id_t));
 // Lock-Free registering/unregistering of threads
 // Register a processor to a given cluster and get its unique id in return
 unsigned doregister( struct __processor_id_t * proc );
+void register_proc_id( struct __processor_id_t * );
 // Unregister a processor from a given cluster using its id, getting back the original pointer
+void     unregister( struct __processor_id_t * proc );
+//-----------------------------------------------------------------------
+// Cluster idle lock/unlock: counter-based spin lock, an odd counter value means the lock is held
+static inline void lock(__cluster_idles & this) {
+        for() { // spin until the lock is acquired
+                uint64_t l = this.lock; // snapshot of the counter
+                if(
+                        (0 == (l % 2)) // even: nobody holds the lock at the time of the snapshot
+                        && __atomic_compare_exchange_n(&this.lock, &l, l + 1, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) // try to move the counter to the next (odd) value
+                ) return; // the exchange succeeded, the lock is ours
+                Pause(); // lock was held or the exchange failed, back off before retrying
+        }
+}
+static inline void unlock(__cluster_idles & this) { // releases the counter lock; the caller must currently hold it
+        /* paranoid */ verify( 1 == (this.lock % 2) ); // odd counter: the lock is held
+        __atomic_fetch_add( &this.lock, 1, __ATOMIC_SEQ_CST ); // bump to the next even value, which marks the lock as free
+}
+void unregister_proc_id( struct __processor_id_t * proc );
 //=======================================================================
 …
         __atomic_store_n(ll, (bool)false, __ATOMIC_RELEASE);
+}
+// Cells used by the reader-writer lock
+// While not generic, it only relies on an opaque pointer
+struct __attribute__((aligned(128))) __scheduler_lock_id_t {
+        // Spin lock used as the underlying lock
+        volatile bool lock;
+        // Handle pointing to the proc owning this cell
+        // Used for allocating cells and debugging
+        __processor_id_t * volatile handle;
+        #ifdef __CFA_WITH_VERIFY__
+                // Debug, check if this is owned for reading
+                bool owned;
+        #endif
+};
+static_assert( sizeof(struct __scheduler_lock_id_t) <= __alignof(struct __scheduler_lock_id_t)); // a cell must fit within its own 128-byte alignment unit
 //-----------------------------------------------------------------------
 …
 void ready_mutate_unlock( uint_fast32_t /* value returned by lock */ );
+//-----------------------------------------------------------------------
+// Lock-Free registering/unregistering of threads
+// Register a processor id through register_proc_id, then acquire the
+// ready-queue mutate lock for convenience.
+// Returns the value produced by ready_mutate_lock, which must later be
+// handed back to ready_mutate_unlock (or ready_mutate_unregister).
+static inline uint_fast32_t ready_mutate_register( struct __processor_id_t * proc ) {
+        // Registration comes first, the lock is only taken afterwards
+        register_proc_id( proc );
+        uint_fast32_t last_size = ready_mutate_lock();
+        return last_size;
+}
+// Release the ready-queue mutate lock, then unregister the processor id (nothing is returned)
+// Assumes the lock is acquired; last_s is the value that was returned when it was taken
+static inline void ready_mutate_unregister( struct __processor_id_t * proc, uint_fast32_t last_s ) {
+        ready_mutate_unlock( last_s ); // the lock is released first
+        unregister_proc_id( proc ); // only then is the id given up
+}
+//-----------------------------------------------------------------------
+// Cluster idle lock/unlock
+// Acquire the lock of a cluster's processor list.
+// Takes the global ready-schedule lock first, then spins on the list's own
+// counter lock (an odd counter value means the lock is held).
+// Preemption must be disabled by the caller; release with unlock().
+static inline void lock(__cluster_proc_list & this) {
+        /* paranoid */ verify( ! __preemption_enabled() );
+        // Start by locking the global RWlock so that we know no-one is
+        // adding/removing processors while we mess with the idle lock
+        ready_schedule_lock();
+        // Simple counting lock, acquired by incrementing the counter
+        // to an odd number
+        for() {
+                uint64_t l = this.lock;
+                // Leave the loop (rather than return) once the lock is taken,
+                // so the check below is actually reached
+                if(
+                        (0 == (l % 2))
+                        && __atomic_compare_exchange_n(&this.lock, &l, l + 1, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+                ) break;
+                Pause();
+        }
+        /* paranoid */ verify( ! __preemption_enabled() );
+}
+static inline void unlock(__cluster_proc_list & this) { // releases what lock() acquired, in the reverse order
+        /* paranoid */ verify( ! __preemption_enabled() );
+        /* paranoid */ verify( 1 == (this.lock % 2) ); // odd counter: the list lock is held
+        // Simple counting lock, released by incrementing to an even number
+        __atomic_fetch_add( &this.lock, 1, __ATOMIC_SEQ_CST );
+        // Release the global lock, which we acquired when locking
+        ready_schedule_unlock();
+        /* paranoid */ verify( ! __preemption_enabled() );
+}
 //=======================================================================
 // Ready-Queue API
 //-----------------------------------------------------------------------
-// pop thread from the ready queue of a cluster
-// returns 0p if empty
-__attribute__((hot)) bool query(struct cluster * cltr);
-//-----------------------------------------------------------------------
 // push thread onto a ready queue for a cluster
 // returns true if the list was previously empty, false otherwise
 __attribute__((hot)) bool push(struct cluster * cltr, struct $thread * thrd);
+__attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd);
 //-----------------------------------------------------------------------
 …
 // returns 0p if empty
 // May return 0p spuriously
 __attribute__((hot)) struct $thread * pop(struct cluster * cltr);
+__attribute__((hot)) struct $thread * pop_fast(struct cluster * cltr);
 //-----------------------------------------------------------------------
 …
 //-----------------------------------------------------------------------
-// remove thread from the ready queue of a cluster
-// returns bool if it wasn't found
-bool remove_head(struct cluster * cltr, struct $thread * thrd);
-//-----------------------------------------------------------------------
 // Increase the width of the ready queue (number of lanes) by 4
 unsigned ready_queue_grow  (struct cluster * cltr, int target);
+void ready_queue_grow  (struct cluster * cltr);
 //-----------------------------------------------------------------------
 // Decrease the width of the ready queue (number of lanes) by 4
 void ready_queue_shrink(struct cluster * cltr, int target);
+void ready_queue_shrink(struct cluster * cltr);

libcfa/src/concurrency/preemption.cfa

-              r6a8208cb
+              r59f3f61
 static void * alarm_loop( __attribute__((unused)) void * args ) {
         __processor_id_t id;
         id.id = doregister(&id);
+        register_proc_id(&id);
         __cfaabi_tls.this_proc_id = &id;
 …
 EXIT:
         __cfaabi_dbg_print_safe( "Kernel : Preemption thread stopping\n" );
         unregister(&id);
+        register_proc_id(&id);
         return 0p;

libcfa/src/concurrency/ready_queue.cfa

-              r6a8208cb
+              r59f3f61
 // #define __CFA_DEBUG_PRINT_READY_QUEUE__
-// #define USE_SNZI
 // #define USE_MPSC
+#define USE_RELAXED_FIFO
+// #define USE_WORK_STEALING
 #include "bits/defs.hfa"
 …
 #include <unistd.h>
-#include "snzi.hfa"
 #include "ready_subqueue.hfa"
 static const size_t cache_line_size = 64;
+#if !defined(__CFA_NO_STATISTICS__)
+        #define __STATS(...) __VA_ARGS__
+#else
+        #define __STATS(...)
+#endif
 // No overriden function, no environment variable, no define
 …
 #endif
+#define BIAS 4
+#if   defined(USE_RELAXED_FIFO)
+        #define BIAS 4
+        #define READYQ_SHARD_FACTOR 4
+        #define SEQUENTIAL_SHARD 1
+#elif defined(USE_WORK_STEALING)
+        #define READYQ_SHARD_FACTOR 2
+        #define SEQUENTIAL_SHARD 2
+#else
+        #error no scheduling strategy selected
+#endif
+static inline struct $thread * try_pop(struct cluster * cltr, unsigned w __STATS(, __stats_readyQ_pop_t & stats));
+static inline struct $thread * try_pop(struct cluster * cltr, unsigned i, unsigned j __STATS(, __stats_readyQ_pop_t & stats));
+static inline struct $thread * search(struct cluster * cltr);
+static inline [unsigned, bool] idx_from_r(unsigned r, unsigned preferred);
 // returns the maximum number of processors the RWLock support
 …
 //=======================================================================
 // Lock-Free registering/unregistering of threads
 unsigned doregister( struct __processor_id_t * proc ) with(*__scheduler_lock) {
+void register_proc_id( struct __processor_id_t * proc ) with(*__scheduler_lock) {
         __cfadbg_print_safe(ready_queue, "Kernel : Registering proc %p for RW-Lock\n", proc);
 …
                         /*paranoid*/ verify(0 == (__alignof__(data[i]) % cache_line_size));
                         /*paranoid*/ verify((((uintptr_t)&data[i]) % cache_line_size) == 0);
                         return i;
+                        proc->id = i;
+                }
+        }
 …
         /*paranoid*/ verify(__alignof__(data[n]) == (2 * cache_line_size));
         /*paranoid*/ verify((((uintptr_t)&data[n]) % cache_line_size) == 0);
         return n;
+}
 void unregister( struct __processor_id_t * proc ) with(*__scheduler_lock) {
+        proc->id = n;
+}
+void unregister_proc_id( struct __processor_id_t * proc ) with(*__scheduler_lock) {
         unsigned id = proc->id;
         /*paranoid*/ verify(id < ready);
 …
 //=======================================================================
 // Cforall Reqdy Queue used for scheduling
+// Cforall Ready Queue used for scheduling
 //=======================================================================
 void ?{}(__ready_queue_t & this) with (this) {
         lanes.data  = 0p;
+        lanes.tscs  = 0p;
         lanes.count = 0;
+}
 void ^?{}(__ready_queue_t & this) with (this) {
+        verify( 1 == lanes.count );
+        #ifdef USE_SNZI
+                verify( !query( snzi ) );
+        #endif
+        verify( SEQUENTIAL_SHARD == lanes.count );
         free(lanes.data);
+        free(lanes.tscs);
+}
 //-----------------------------------------------------------------------
+__attribute__((hot)) bool query(struct cluster * cltr) {
+        #ifdef USE_SNZI
+                return query(cltr->ready_queue.snzi);
+        #endif
+        return true;
+}
+static inline [unsigned, bool] idx_from_r(unsigned r, unsigned preferred) {
+        unsigned i;
+        bool local;
+        #if defined(BIAS)
+#if defined(USE_RELAXED_FIFO)
+        //-----------------------------------------------------------------------
+        // get index from random number with or without bias towards queues
+        static inline [unsigned, bool] idx_from_r(unsigned r, unsigned preferred) {
+                unsigned i;
+                bool local;
                 unsigned rlow  = r % BIAS;
                 unsigned rhigh = r / BIAS;
 …
                         // (BIAS - 1) out of BIAS chances
                         // Use preferred queues
                         i = preferred + (rhigh % 4);
+                        i = preferred + (rhigh % READYQ_SHARD_FACTOR);
                         local = true;
+                }
 …
                         local = false;
+                }
+        #else
+                i = r;
+                local = false;
+        #endif
+        return [i, local];
+}
+//-----------------------------------------------------------------------
+__attribute__((hot)) bool push(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) {
+        __cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
+        const bool external = (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
+        // write timestamp
+        thrd->link.ts = rdtscl();
+        bool first = false;
+        __attribute__((unused)) bool local;
+        __attribute__((unused)) int preferred;
+        #if defined(BIAS)
+                preferred =
+                        //*
+                        external ? -1 : kernelTLS().this_processor->cltr_id;
+                        /*/
+                        thrd->link.preferred * 4;
+                        //*/
+        #endif
+        // Try to pick a lane and lock it
+        unsigned i;
+        do {
+                // Pick the index of a lane
+                // unsigned r = __tls_rand();
+                unsigned r = __tls_rand_fwd();
+                [i, local] = idx_from_r(r, preferred);
+                i %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+                return [i, local];
+        }
+        __attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) {
+                __cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
+                const bool external = (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
+                /* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );
+                // write timestamp
+                thrd->link.ts = rdtscl();
+                bool local;
+                int preferred = external ? -1 : kernelTLS().this_processor->rdq.id;
+                // Try to pick a lane and lock it
+                unsigned i;
+                do {
+                        // Pick the index of a lane
+                        unsigned r = __tls_rand_fwd();
+                        [i, local] = idx_from_r(r, preferred);
+                        i %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+                        #if !defined(__CFA_NO_STATISTICS__)
+                                if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.attempt, 1, __ATOMIC_RELAXED);
+                                else if(local) __tls_stats()->ready.push.local.attempt++;
+                                else __tls_stats()->ready.push.share.attempt++;
+                        #endif
+                #if defined(USE_MPSC)
+                        // mpsc always succeeds
+                } while( false );
+                #else
+                        // If we can't lock it retry
+                } while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+                #endif
+                // Actually push it
+                push(lanes.data[i], thrd);
+                #if !defined(USE_MPSC)
+                        // Unlock and return
+                        __atomic_unlock( &lanes.data[i].lock );
+                #endif
+                // Mark the current index in the tls rng instance as having an item
+                __tls_rand_advance_bck();
+                __cfadbg_print_safe(ready_queue, "Kernel : Pushed %p on cluster %p (idx: %u, mask %llu, first %d)\n", thrd, cltr, i, used.mask[0], lane_first);
+                // Update statistics
                 #if !defined(__CFA_NO_STATISTICS__)
+                        if(external) {
+                                if(local) __atomic_fetch_add(&cltr->stats->ready.pick.ext.local, 1, __ATOMIC_RELAXED);
+                                __atomic_fetch_add(&cltr->stats->ready.pick.ext.attempt, 1, __ATOMIC_RELAXED);
+                        if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED);
+                        else if(local) __tls_stats()->ready.push.local.success++;
+                        else __tls_stats()->ready.push.share.success++;
+                #endif
+        }
+        // Pop from the ready queue of a given cluster (relaxed-FIFO variant); returns 0p when every attempt fails
+        __attribute__((hot)) $thread * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
+                /* paranoid */ verify( lanes.count > 0 );
+                /* paranoid */ verify( kernelTLS().this_processor );
+                /* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes.count );
+                unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED ); // read once, reused for every attempt
+                int preferred = kernelTLS().this_processor->rdq.id; // this processor's own lane index, passed to idx_from_r as the preferred index
+                // Make a bounded number of attempts at finding a lane that isn't empty and popping from it
+                for(25) {
+                        // Pick two lists at random
+                        unsigned ri = __tls_rand_bck();
+                        unsigned rj = __tls_rand_bck();
+                        unsigned i, j;
+                        __attribute__((unused)) bool locali, localj; // only read by the statistics argument below
+                        [i, locali] = idx_from_r(ri, preferred);
+                        [j, localj] = idx_from_r(rj, preferred);
+                        i %= count; // clamp both indexes to the number of lanes
+                        j %= count;
+                        // try popping from the 2 picked lists
+                        struct $thread * thrd = try_pop(cltr, i, j __STATS(, *(locali || localj ? &__tls_stats()->ready.pop.local : &__tls_stats()->ready.pop.help))); // counted as "local" if either pick was local, as "help" otherwise
+                        if(thrd) {
+                                return thrd;
+                        }
+                }
+                // No attempt produced a thread, return 0p
+                return 0p;
+        }
+        // Slow-path pop for the relaxed-FIFO variant: hands the whole job to search()
+        __attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr) {
+                struct $thread * found = search(cltr);
+                return found;
+        }
+#endif
+#if defined(USE_WORK_STEALING)
+        __attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) {
+                __cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
+                const bool external = (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
+                /* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );
+                // write timestamp
+                thrd->link.ts = rdtscl();
+                // Try to pick a lane and lock it
+                unsigned i;
+                do {
+                        #if !defined(__CFA_NO_STATISTICS__)
+                                if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.attempt, 1, __ATOMIC_RELAXED);
+                                else __tls_stats()->ready.push.local.attempt++;
+                        #endif
+                        if(unlikely(external)) {
+                                i = __tls_rand() % lanes.count;
+                        }
                         else {
+                                if(local) __tls_stats()->ready.pick.push.local++;
+                                __tls_stats()->ready.pick.push.attempt++;
+                                processor * proc = kernelTLS().this_processor;
+                                unsigned r = proc->rdq.its++;
+                                i =  proc->rdq.id + (r % READYQ_SHARD_FACTOR);
+                        }
+                #if defined(USE_MPSC)
+                        // mpsc always succeeds
+                } while( false );
+                #else
+                        // If we can't lock it retry
+                } while( !__atomic_try_acquire( &lanes.data[i].lock ) );
                 #endif
+        #if defined(USE_MPSC)
+                // mpsc always succeeds
+        } while( false );
+        #else
+                // If we can't lock it retry
+        } while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+                // Actually push it
+                push(lanes.data[i], thrd);
+                #if !defined(USE_MPSC)
+                        // Unlock and return
+                        __atomic_unlock( &lanes.data[i].lock );
+                #endif
+                #if !defined(__CFA_NO_STATISTICS__)
+                        if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED);
+                        else __tls_stats()->ready.push.local.success++;
+                #endif
+                __cfadbg_print_safe(ready_queue, "Kernel : Pushed %p on cluster %p (idx: %u, mask %llu, first %d)\n", thrd, cltr, i, used.mask[0], lane_first);
+        }
+        // Pop from the ready queue of a given cluster (work-stealing variant).
+        // First maintains the "helping" state of the calling processor:
+        //  - with no target selected, pick a random target lane and record as cutoff
+        //    the smaller of the timestamps of this processor's two local lanes;
+        //  - otherwise, if the target lane's recorded timestamp is below the cutoff,
+        //    try to pop from the target and forget it.
+        // Then tries each of the processor's local lanes in turn.
+        // Returns the popped thread, or 0p if nothing was found.
+        __attribute__((hot)) $thread * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
+                /* paranoid */ verify( lanes.count > 0 );
+                /* paranoid */ verify( kernelTLS().this_processor );
+                /* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes.count );
+                processor * proc = kernelTLS().this_processor;
+                if(proc->rdq.target == -1u) {
+                        // No target yet (-1u): select one at random for a later call
+                        proc->rdq.target = __tls_rand() % lanes.count;
+                        // Two consecutive iteration values, so that idx1 and idx2
+                        // name different local lanes
+                        unsigned it1  = proc->rdq.itr;
+                        unsigned it2  = proc->rdq.itr + 1;
+                        unsigned idx1 = proc->rdq.id + (it1 % READYQ_SHARD_FACTOR);
+                        // Was computed from it1, which made idx2 == idx1 and left it2 unused
+                        unsigned idx2 = proc->rdq.id + (it2 % READYQ_SHARD_FACTOR);
+                        unsigned long long tsc1 = ts(lanes.data[idx1]);
+                        unsigned long long tsc2 = ts(lanes.data[idx2]);
+                        proc->rdq.cutoff = min(tsc1, tsc2);
+                }
+                else if(lanes.tscs[proc->rdq.target].tv < proc->rdq.cutoff) {
+                        // The target's recorded timestamp is below the cutoff: help it
+                        $thread * t = try_pop(cltr, proc->rdq.target __STATS(, __tls_stats()->ready.pop.help));
+                        // Whatever the outcome, a new target is picked on a later call
+                        proc->rdq.target = -1u;
+                        if(t) return t;
+                }
+                // Try the local lanes, rotating the starting point through rdq.itr
+                for(READYQ_SHARD_FACTOR) {
+                        unsigned i = proc->rdq.id + (--proc->rdq.itr % READYQ_SHARD_FACTOR);
+                        if($thread * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t;
+                }
+                // Nothing found on the fast path
+                return 0p;
+        }
+        // Slow-path pop for the work-stealing variant: a bounded number of
+        // steal attempts on random lanes, then a fall back to search()
+        __attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
+                for(25) {
+                        // Pick a victim lane at random and try to take a thread from it
+                        unsigned victim = __tls_rand() % lanes.count;
+                        if($thread * stolen = try_pop(cltr, victim __STATS(, __tls_stats()->ready.pop.steal))) return stolen;
+                }
+                // Every steal attempt failed
+                return search(cltr);
+        }
+#endif
+//=======================================================================
+// Various Ready Queue utilities
+//=======================================================================
+// these function work the same or almost the same
+// whether they are using work-stealing or relaxed fifo scheduling
+//-----------------------------------------------------------------------
+// try to pop from a lane given by index w
+static inline struct $thread * try_pop(struct cluster * cltr, unsigned w __STATS(, __stats_readyQ_pop_t & stats)) with (cltr->ready_queue) {
+        __STATS( stats.attempt++; )
+        // Get relevant elements locally
+        __intrusive_lane_t & lane = lanes.data[w];
+        // If list looks empty retry
+        if( is_empty(lane) ) {
+                __STATS( stats.espec++; )
+                return 0p;
+        }
+        // If we can't get the lock retry
+        if( !__atomic_try_acquire(&lane.lock) ) {
+                __STATS( stats.elock++; )
+                return 0p;
+        }
+        // If list is empty, unlock and retry
+        if( is_empty(lane) ) {
+                __atomic_unlock(&lane.lock);
+                __STATS( stats.eempty++; )
+                return 0p;
+        }
+        // Actually pop the list
+        struct $thread * thrd;
+        thrd = pop(lane);
+        /* paranoid */ verify(thrd);
+        /* paranoid */ verify(lane.lock);
+        // Unlock and return
+        __atomic_unlock(&lane.lock);
+        // Update statistics
+        __STATS( stats.success++; )
+        #if defined(USE_WORK_STEALING)
+                lanes.tscs[w].tv = thrd->link.ts;
         #endif
+        // Actually push it
+        #ifdef USE_SNZI
+                bool lane_first =
+        #endif
+        push(lanes.data[i], thrd);
+        #ifdef USE_SNZI
+                // If this lane used to be empty we need to do more
+                if(lane_first) {
+                        // Check if the entire queue used to be empty
+                        first = !query(snzi);
+                        // Update the snzi
+                        arrive( snzi, i );
+                }
+        #endif
+        #if !defined(USE_MPSC)
+                // Unlock and return
+                __atomic_unlock( &lanes.data[i].lock );
+        #endif
+        // Mark the current index in the tls rng instance as having an item
+        __tls_rand_advance_bck();
+        __cfadbg_print_safe(ready_queue, "Kernel : Pushed %p on cluster %p (idx: %u, mask %llu, first %d)\n", thrd, cltr, i, used.mask[0], lane_first);
+        // Update statistics
+        #if !defined(__CFA_NO_STATISTICS__)
+                if(external) {
+                        if(local) __atomic_fetch_add(&cltr->stats->ready.pick.ext.lsuccess, 1, __ATOMIC_RELAXED);
+                        __atomic_fetch_add(&cltr->stats->ready.pick.ext.success, 1, __ATOMIC_RELAXED);
+                }
+                else {
+                        if(local) __tls_stats()->ready.pick.push.lsuccess++;
+                        __tls_stats()->ready.pick.push.success++;
+                }
+        #endif
+        // return whether or not the list was empty before this push
+        return first;
+}
+static struct $thread * try_pop(struct cluster * cltr, unsigned i, unsigned j);
+static struct $thread * try_pop(struct cluster * cltr, unsigned i);
+// Pop a thread from the ready queue of the given cluster; returns 0p when no thread could be popped
+__attribute__((hot)) $thread * pop(struct cluster * cltr) with (cltr->ready_queue) {
+        /* paranoid */ verify( lanes.count > 0 );
+        unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+        int preferred; // NOTE(review): only assigned under BIAS but always passed to idx_from_r below - confirm it is ignored otherwise
+        #if defined(BIAS)
+                // Don't bother trying locally too much
+                preferred = kernelTLS().this_processor->cltr_id;
+        #endif
+        // With USE_SNZI: loop while the snzi query succeeds; otherwise: bounded for(25) loop. Each iteration tries to find a non-empty lane and pop from it
+        #ifdef USE_SNZI
+                while( query(snzi) ) {
+        #else
+                for(25) {
+        #endif
+                // Pick two lists at random
+                // unsigned ri = __tls_rand();
+                // unsigned rj = __tls_rand();
+                unsigned ri = __tls_rand_bck();
+                unsigned rj = __tls_rand_bck();
+                unsigned i, j;
+                __attribute__((unused)) bool locali, localj;
+                [i, locali] = idx_from_r(ri, preferred);
+                [j, localj] = idx_from_r(rj, preferred);
+                #if !defined(__CFA_NO_STATISTICS__)
+                        if(locali && localj) {
+                                __tls_stats()->ready.pick.pop.local++;
+                        }
+                #endif
+                i %= count; // keep both indexes within [0, count)
+                j %= count;
+                // try popping from one of the 2 picked lists (try_pop chooses which)
+                struct $thread * thrd = try_pop(cltr, i, j);
+                if(thrd) {
+                        #if defined(BIAS) && !defined(__CFA_NO_STATISTICS__)
+                                if( locali || localj ) __tls_stats()->ready.pick.pop.lsuccess++;
+                        #endif
+                        return thrd;
+                }
+        }
+        // The loop ended without a successful try_pop: return 0p
+        return 0p;
+}
+__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
+        // return the popped thread
+        return thrd;
+}
+//-----------------------------------------------------------------------
+// try to pop from any lane, making sure not to miss any thread pushed
+// before the start of the function
+static inline struct $thread * search(struct cluster * cltr) with (cltr->ready_queue) {
         /* paranoid */ verify( lanes.count > 0 );
         unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
 …
         for(i; count) {
                 unsigned idx = (offset + i) % count;
                 struct $thread * thrd = try_pop(cltr, idx);
+                struct $thread * thrd = try_pop(cltr, idx __STATS(, __tls_stats()->ready.pop.search));
                 if(thrd) {
                         return thrd;
 …
+}
 //-----------------------------------------------------------------------
+// Given 2 indexes, pick the list with the oldest push and try to pop from it; returns 0p if the pop fails
+static inline struct $thread * try_pop(struct cluster * cltr, unsigned i, unsigned j) with (cltr->ready_queue) {
+        #if !defined(__CFA_NO_STATISTICS__)
+                __tls_stats()->ready.pick.pop.attempt++;
+        #endif
+        // Pick the best list: default to i, and only compare timestamps when lane j is not empty
+        int w = i;
+        if( __builtin_expect(!is_empty(lanes.data[j]), true) ) {
+                w = (ts(lanes.data[i]) < ts(lanes.data[j])) ? i : j; // strictly smaller ts picks i, otherwise j
+        }
+        return try_pop(cltr, w);
+}
+static inline struct $thread * try_pop(struct cluster * cltr, unsigned w) with (cltr->ready_queue) { // try to pop from lane w only; returns 0p on any failure
+        // Get relevant elements locally
+        __intrusive_lane_t & lane = lanes.data[w];
+        // If the list looks empty (checked before taking the lock), give up and return 0p
+        if( is_empty(lane) ) return 0p;
+        // If we can't get the lock, give up and return 0p
+        if( !__atomic_try_acquire(&lane.lock) ) return 0p;
+        // Re-check now that the lock is held: if the list is empty, unlock and return 0p
+        if( is_empty(lane) ) {
+                __atomic_unlock(&lane.lock);
+                return 0p;
+        }
+        // Actually pop the list
+        struct $thread * thrd;
+        thrd = pop(lane);
+        /* paranoid */ verify(thrd);
+        /* paranoid */ verify(lane.lock);
+        #ifdef USE_SNZI
+                // If this was the last element in the lane
+                if(emptied) { // NOTE(review): 'emptied' is not declared in the visible code - confirm where it is set
+                        depart( snzi, w );
+                }
+        #endif
+        // Unlock and return
+        __atomic_unlock(&lane.lock);
+        // Update statistics
+        #if !defined(__CFA_NO_STATISTICS__)
+                __tls_stats()->ready.pick.pop.success++;
+        #endif
+        // Update the thread bias (NOTE(review): the divisor 4 presumably matches the lanes-per-processor factor used when sizing the queue - confirm)
+        thrd->link.preferred = w / 4;
+        // return the popped thread
+        return thrd;
+}
+//-----------------------------------------------------------------------
+bool remove_head(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) { // pops thrd from the first lane where it directly follows the head; returns whether it was removed
+        for(i; lanes.count) {
+                __intrusive_lane_t & lane = lanes.data[i];
+                bool removed = false;
+                __atomic_acquire(&lane.lock); // each lane is inspected with its lock held
+                        if(head(lane)->link.next == thrd) { // thrd is the element right after the lane's head
+                                $thread * pthrd;
+                                pthrd = pop(lane);
+                                /* paranoid */ verify( pthrd == thrd );
+                                removed = true;
+                                #ifdef USE_SNZI
+                                        if(emptied) { // NOTE(review): 'emptied' is not declared in the visible code - confirm where it is set
+                                                depart( snzi, i );
+                                        }
+                                #endif
+                        }
+                __atomic_unlock(&lane.lock);
+                if( removed ) return true; // stop at the first lane that held thrd
+        }
+        return false; // thrd did not follow the head of any lane
+}
+//-----------------------------------------------------------------------
+// Check that all the intrusive queues in the data structure are still consistent
 static void check( __ready_queue_t & q ) with (q) {
         #if defined(__CFA_WITH_VERIFY__) && !defined(USE_MPSC)
 …
+}
+//-----------------------------------------------------------------------
+// Given 2 indexes, pick the list with the oldest push and try to pop from it; stats (when enabled) is forwarded to the single-lane try_pop
+static inline struct $thread * try_pop(struct cluster * cltr, unsigned i, unsigned j __STATS(, __stats_readyQ_pop_t & stats)) with (cltr->ready_queue) {
+        // Pick the best list: default to i, and only compare timestamps when lane j is not empty
+        int w = i;
+        if( __builtin_expect(!is_empty(lanes.data[j]), true) ) {
+                w = (ts(lanes.data[i]) < ts(lanes.data[j])) ? i : j; // strictly smaller ts picks i, otherwise j
+        }
+        return try_pop(cltr, w __STATS(, stats));
+}
 // Call this function if the intrusive list was moved using memcpy
 // fixes the list so that the pointers back to anchors aren't left dangling
 …
+}
+static void assign_list(unsigned & value, dlist(processor, processor) & list, unsigned count) { // gives the first 'count' processors of 'list' increasing rdq ids starting at 'value'; 'value' is advanced for the caller
+        processor * it = &list`first;
+        for(unsigned i = 0; i < count; i++) {
+                /* paranoid */ verifyf( it, "Unexpected null iterator, at index %u of %u\n", i, count);
+                it->rdq.id = value;
+                it->rdq.target = -1u; // reset to the all-ones value (NOTE(review): presumably means 'no target' - confirm)
+                value += READYQ_SHARD_FACTOR; // consecutive processors get ids READYQ_SHARD_FACTOR apart
+                it = &(*it)`next;
+        }
+}
+static void reassign_cltr_id(struct cluster * cltr) { // renumbers the rdq ids of the cluster's processors from one running counter starting at 0
+        unsigned preferred = 0;
+        assign_list(preferred, cltr->procs.actives, cltr->procs.total - cltr->procs.idle); // active processors first (total minus idle)
+        assign_list(preferred, cltr->procs.idles  , cltr->procs.idle ); // idle processors continue from where the actives stopped
+}
+static void fix_times( struct cluster * cltr ) with( cltr->ready_queue ) { // no-op unless USE_WORK_STEALING is defined
+        #if defined(USE_WORK_STEALING)
+                lanes.tscs = alloc(lanes.count, lanes.tscs`realloc); // re-allocate the timestamp array for lanes.count entries
+                for(i; lanes.count) {
+                        lanes.tscs[i].tv = ts(lanes.data[i]); // refresh each entry from its lane's current timestamp
+                }
+        #endif
+}
 // Grow the ready queue
+unsigned ready_queue_grow(struct cluster * cltr, int target) {
+        unsigned preferred;
+void ready_queue_grow(struct cluster * cltr) {
         size_t ncount;
+        int target = cltr->procs.total;
         /* paranoid */ verify( ready_mutate_islocked() );
 …
         // grow the ready queue
         with( cltr->ready_queue ) {
-                #ifdef USE_SNZI
-                        ^(snzi){};
-                #endif
                 // Find new count
                 // Make sure we always have at least 1 list
                 if(target >= 2) {
+                        ncount = target * 4;
+                        preferred = ncount - 4;
+                        ncount = target * READYQ_SHARD_FACTOR;
                 } else {
+                        ncount = 1;
+                        preferred = 0;
+                        ncount = SEQUENTIAL_SHARD;
+                }
 …
                 // Update original
                 lanes.count = ncount;
+                #ifdef USE_SNZI
+                        // Re-create the snzi
+                        snzi{ log2( lanes.count / 8 ) };
+                        for( idx; (size_t)lanes.count ) {
+                                if( !is_empty(lanes.data[idx]) ) {
+                                        arrive(snzi, idx);
+                                }
+                        }
+                #endif
+        }
+        }
+        fix_times(cltr);
+        reassign_cltr_id(cltr);
         // Make sure that everything is consistent
 …
         /* paranoid */ verify( ready_mutate_islocked() );
-        return preferred;
+}
 // Shrink the ready queue
 void ready_queue_shrink(struct cluster * cltr, int target) {
+void ready_queue_shrink(struct cluster * cltr) {
         /* paranoid */ verify( ready_mutate_islocked() );
         __cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue\n");
 …
         /* paranoid */ check( cltr->ready_queue );
+        int target = cltr->procs.total;
         with( cltr->ready_queue ) {
-                #ifdef USE_SNZI
-                        ^(snzi){};
-                #endif
                 // Remember old count
                 size_t ocount = lanes.count;
 …
                 // Find new count
                 // Make sure we always have at least 1 list
                 lanes.count = target >= 2 ? target * 4: 1;
+                lanes.count = target >= 2 ? target * READYQ_SHARD_FACTOR: SEQUENTIAL_SHARD;
                 /* paranoid */ verify( ocount >= lanes.count );
                 /* paranoid */ verify( lanes.count == target * 4 || target < 2 );
+                /* paranoid */ verify( lanes.count == target * READYQ_SHARD_FACTOR || target < 2 );
                 // for printing count the number of displaced threads
 …
                         fix(lanes.data[idx]);
+                }
+                #ifdef USE_SNZI
+                        // Re-create the snzi
+                        snzi{ log2( lanes.count / 8 ) };
+                        for( idx; (size_t)lanes.count ) {
+                                if( !is_empty(lanes.data[idx]) ) {
+                                        arrive(snzi, idx);
+                                }
+                        }
+                #endif
+        }
+        }
+        fix_times(cltr);
+        reassign_cltr_id(cltr);
         // Make sure that everything is consistent

libcfa/src/concurrency/ready_subqueue.hfa

-              r6a8208cb
+              r59f3f61
         #endif
+}
+// Aligned timestamps which are used by the relaxed ready queue (128-byte alignment; NOTE(review): presumably to keep entries on separate cache lines - confirm)
+struct __attribute__((aligned(128))) __timestamp_t {
+        volatile unsigned long long tv; // the timestamp value itself
+};
+void  ?{}(__timestamp_t & this) { this.tv = 0; } // constructor: timestamp starts at 0
+void ^?{}(__timestamp_t & this) {} // destructor: nothing to do

libcfa/src/concurrency/stats.cfa

-              r6a8208cb
+              r59f3f61
 #if !defined(__CFA_NO_STATISTICS__)
         void __init_stats( struct __stats_t * stats ) {
+                stats->ready.pick.push.attempt  = 0;
+                stats->ready.pick.push.success  = 0;
+                stats->ready.pick.push.local    = 0;
+                stats->ready.pick.push.lsuccess = 0;
+                stats->ready.pick.ext.attempt  = 0;
+                stats->ready.pick.ext.success  = 0;
+                stats->ready.pick.ext.local    = 0;
+                stats->ready.pick.ext.lsuccess = 0;
+                stats->ready.pick.pop .probe    = 0;
+                stats->ready.pick.pop .attempt  = 0;
+                stats->ready.pick.pop .success  = 0;
+                stats->ready.pick.pop .local    = 0;
+                stats->ready.pick.pop .lsuccess = 0;
+                stats->ready.push.local.attempt = 0;
+                stats->ready.push.local.success = 0;
+                stats->ready.push.share.attempt = 0;
+                stats->ready.push.share.success = 0;
+                stats->ready.push.extrn.attempt = 0;
+                stats->ready.push.extrn.success = 0;
+                stats->ready.pop.local .attempt = 0;
+                stats->ready.pop.local .success = 0;
+                stats->ready.pop.local .elock   = 0;
+                stats->ready.pop.local .eempty  = 0;
+                stats->ready.pop.local .espec   = 0;
+                stats->ready.pop.help  .attempt = 0;
+                stats->ready.pop.help  .success = 0;
+                stats->ready.pop.help  .elock   = 0;
+                stats->ready.pop.help  .eempty  = 0;
+                stats->ready.pop.help  .espec   = 0;
+                stats->ready.pop.steal .attempt = 0;
+                stats->ready.pop.steal .success = 0;
+                stats->ready.pop.steal .elock   = 0;
+                stats->ready.pop.steal .eempty  = 0;
+                stats->ready.pop.steal .espec   = 0;
+                stats->ready.pop.search.attempt = 0;
+                stats->ready.pop.search.success = 0;
+                stats->ready.pop.search.elock   = 0;
+                stats->ready.pop.search.eempty  = 0;
+                stats->ready.pop.search.espec   = 0;
                 stats->ready.threads.migration = 0;
                 stats->ready.threads.threads   = 0;
 …
         void __tally_stats( struct __stats_t * cltr, struct __stats_t * proc ) {
+                __atomic_fetch_add( &cltr->ready.pick.push.attempt , proc->ready.pick.push.attempt , __ATOMIC_SEQ_CST ); proc->ready.pick.push.attempt  = 0;
+                __atomic_fetch_add( &cltr->ready.pick.push.success , proc->ready.pick.push.success , __ATOMIC_SEQ_CST ); proc->ready.pick.push.success  = 0;
+                __atomic_fetch_add( &cltr->ready.pick.push.local   , proc->ready.pick.push.local   , __ATOMIC_SEQ_CST ); proc->ready.pick.push.local    = 0;
+                __atomic_fetch_add( &cltr->ready.pick.push.lsuccess, proc->ready.pick.push.lsuccess, __ATOMIC_SEQ_CST ); proc->ready.pick.push.lsuccess = 0;
+                __atomic_fetch_add( &cltr->ready.pick.ext.attempt  , proc->ready.pick.ext.attempt  , __ATOMIC_SEQ_CST ); proc->ready.pick.ext.attempt   = 0;
+                __atomic_fetch_add( &cltr->ready.pick.ext.success  , proc->ready.pick.ext.success  , __ATOMIC_SEQ_CST ); proc->ready.pick.ext.success   = 0;
+                __atomic_fetch_add( &cltr->ready.pick.ext.local    , proc->ready.pick.ext.local    , __ATOMIC_SEQ_CST ); proc->ready.pick.ext.local     = 0;
+                __atomic_fetch_add( &cltr->ready.pick.ext.lsuccess , proc->ready.pick.ext.lsuccess , __ATOMIC_SEQ_CST ); proc->ready.pick.ext.lsuccess  = 0;
+                __atomic_fetch_add( &cltr->ready.pick.pop .probe   , proc->ready.pick.pop .probe   , __ATOMIC_SEQ_CST ); proc->ready.pick.pop .probe    = 0;
+                __atomic_fetch_add( &cltr->ready.pick.pop .attempt , proc->ready.pick.pop .attempt , __ATOMIC_SEQ_CST ); proc->ready.pick.pop .attempt  = 0;
+                __atomic_fetch_add( &cltr->ready.pick.pop .success , proc->ready.pick.pop .success , __ATOMIC_SEQ_CST ); proc->ready.pick.pop .success  = 0;
+                __atomic_fetch_add( &cltr->ready.pick.pop .local   , proc->ready.pick.pop .local   , __ATOMIC_SEQ_CST ); proc->ready.pick.pop .local    = 0;
+                __atomic_fetch_add( &cltr->ready.pick.pop .lsuccess, proc->ready.pick.pop .lsuccess, __ATOMIC_SEQ_CST ); proc->ready.pick.pop .lsuccess = 0;
+                __atomic_fetch_add( &cltr->ready.push.local.attempt, proc->ready.push.local.attempt, __ATOMIC_SEQ_CST ); proc->ready.push.local.attempt = 0;
+                __atomic_fetch_add( &cltr->ready.push.local.success, proc->ready.push.local.success, __ATOMIC_SEQ_CST ); proc->ready.push.local.success = 0;
+                __atomic_fetch_add( &cltr->ready.push.share.attempt, proc->ready.push.share.attempt, __ATOMIC_SEQ_CST ); proc->ready.push.share.attempt = 0;
+                __atomic_fetch_add( &cltr->ready.push.share.success, proc->ready.push.share.success, __ATOMIC_SEQ_CST ); proc->ready.push.share.success = 0;
+                __atomic_fetch_add( &cltr->ready.push.extrn.attempt, proc->ready.push.extrn.attempt, __ATOMIC_SEQ_CST ); proc->ready.push.extrn.attempt = 0;
+                __atomic_fetch_add( &cltr->ready.push.extrn.success, proc->ready.push.extrn.success, __ATOMIC_SEQ_CST ); proc->ready.push.extrn.success = 0;
+                __atomic_fetch_add( &cltr->ready.pop.local .attempt, proc->ready.pop.local .attempt, __ATOMIC_SEQ_CST ); proc->ready.pop.local .attempt = 0;
+                __atomic_fetch_add( &cltr->ready.pop.local .success, proc->ready.pop.local .success, __ATOMIC_SEQ_CST ); proc->ready.pop.local .success = 0;
+                __atomic_fetch_add( &cltr->ready.pop.local .elock  , proc->ready.pop.local .elock  , __ATOMIC_SEQ_CST ); proc->ready.pop.local .elock   = 0;
+                __atomic_fetch_add( &cltr->ready.pop.local .eempty , proc->ready.pop.local .eempty , __ATOMIC_SEQ_CST ); proc->ready.pop.local .eempty  = 0;
+                __atomic_fetch_add( &cltr->ready.pop.local .espec  , proc->ready.pop.local .espec  , __ATOMIC_SEQ_CST ); proc->ready.pop.local .espec   = 0;
+                __atomic_fetch_add( &cltr->ready.pop.help  .attempt, proc->ready.pop.help  .attempt, __ATOMIC_SEQ_CST ); proc->ready.pop.help  .attempt = 0;
+                __atomic_fetch_add( &cltr->ready.pop.help  .success, proc->ready.pop.help  .success, __ATOMIC_SEQ_CST ); proc->ready.pop.help  .success = 0;
+                __atomic_fetch_add( &cltr->ready.pop.help  .elock  , proc->ready.pop.help  .elock  , __ATOMIC_SEQ_CST ); proc->ready.pop.help  .elock   = 0;
+                __atomic_fetch_add( &cltr->ready.pop.help  .eempty , proc->ready.pop.help  .eempty , __ATOMIC_SEQ_CST ); proc->ready.pop.help  .eempty  = 0;
+                __atomic_fetch_add( &cltr->ready.pop.help  .espec  , proc->ready.pop.help  .espec  , __ATOMIC_SEQ_CST ); proc->ready.pop.help  .espec   = 0;
+                __atomic_fetch_add( &cltr->ready.pop.steal .attempt, proc->ready.pop.steal .attempt, __ATOMIC_SEQ_CST ); proc->ready.pop.steal .attempt = 0;
+                __atomic_fetch_add( &cltr->ready.pop.steal .success, proc->ready.pop.steal .success, __ATOMIC_SEQ_CST ); proc->ready.pop.steal .success = 0;
+                __atomic_fetch_add( &cltr->ready.pop.steal .elock  , proc->ready.pop.steal .elock  , __ATOMIC_SEQ_CST ); proc->ready.pop.steal .elock   = 0;
+                __atomic_fetch_add( &cltr->ready.pop.steal .eempty , proc->ready.pop.steal .eempty , __ATOMIC_SEQ_CST ); proc->ready.pop.steal .eempty  = 0;
+                __atomic_fetch_add( &cltr->ready.pop.steal .espec  , proc->ready.pop.steal .espec  , __ATOMIC_SEQ_CST ); proc->ready.pop.steal .espec   = 0;
+                __atomic_fetch_add( &cltr->ready.pop.search.attempt, proc->ready.pop.search.attempt, __ATOMIC_SEQ_CST ); proc->ready.pop.search.attempt = 0;
+                __atomic_fetch_add( &cltr->ready.pop.search.success, proc->ready.pop.search.success, __ATOMIC_SEQ_CST ); proc->ready.pop.search.success = 0;
+                __atomic_fetch_add( &cltr->ready.pop.search.elock  , proc->ready.pop.search.elock  , __ATOMIC_SEQ_CST ); proc->ready.pop.search.elock   = 0;
+                __atomic_fetch_add( &cltr->ready.pop.search.eempty , proc->ready.pop.search.eempty , __ATOMIC_SEQ_CST ); proc->ready.pop.search.eempty  = 0;
+                __atomic_fetch_add( &cltr->ready.pop.search.espec  , proc->ready.pop.search.espec  , __ATOMIC_SEQ_CST ); proc->ready.pop.search.espec   = 0;
                 __atomic_fetch_add( &cltr->ready.threads.migration , proc->ready.threads.migration , __ATOMIC_SEQ_CST ); proc->ready.threads.migration  = 0;
                 __atomic_fetch_add( &cltr->ready.threads.threads   , proc->ready.threads.threads   , __ATOMIC_SEQ_CST ); proc->ready.threads.threads    = 0;
 …
                 if( flags & CFA_STATS_READY_Q ) {
+                        double push_len = ((double)ready.pick.push.attempt) / ready.pick.push.success;
+                        double ext_len  = ((double)ready.pick.ext .attempt) / ready.pick.ext .success;
+                        double pop_len  = ((double)ready.pick.pop .attempt) / ready.pick.pop .success;
+                        double lpush_len = ((double)ready.pick.push.local) / ready.pick.push.lsuccess;
+                        double lext_len  = ((double)ready.pick.ext .local) / ready.pick.ext .lsuccess;
+                        double lpop_len  = ((double)ready.pick.pop .local) / ready.pick.pop .lsuccess;
+                        double push_len = ((double)ready.push.local.attempt + ready.push.share.attempt + ready.push.extrn.attempt) / (ready.push.local.success + ready.push.share.success + ready.push.extrn.success);
+                        double sLcl_len = ready.push.local.success ? ((double)ready.push.local.attempt) / ready.push.local.success : 0;
+                        double sOth_len = ready.push.share.success ? ((double)ready.push.share.attempt) / ready.push.share.success : 0;
+                        double sExt_len = ready.push.extrn.success ? ((double)ready.push.extrn.attempt) / ready.push.extrn.success : 0;
+                        double rLcl_len  = ready.pop.local .success ? ((double)ready.pop.local .attempt) / ready.pop.local .success : 0;
+                        double rHlp_len  = ready.pop.help  .success ? ((double)ready.pop.help  .attempt) / ready.pop.help  .success : 0;
+                        double rStl_len  = ready.pop.steal .success ? ((double)ready.pop.steal .attempt) / ready.pop.steal .success : 0;
+                        double rSch_len  = ready.pop.search.success ? ((double)ready.pop.search.attempt) / ready.pop.search.success : 0;
                         __cfaabi_bits_print_safe( STDOUT_FILENO,
                                 "----- %s \"%s\" (%p) - Ready Q Stats -----\n"
+                                "- total threads  : %'15" PRIu64 "run, %'15" PRIu64 "schd (%'" PRIu64 "ext, %'" PRIu64 "mig, %'" PRId64 " )\n"
+                                "- push avg probe : %'3.2lf, %'3.2lfl (%'15" PRIu64 " attempts, %'15" PRIu64 " locals)\n"
+                                "- ext  avg probe : %'3.2lf, %'3.2lfl (%'15" PRIu64 " attempts, %'15" PRIu64 " locals)\n"
+                                "- pop  avg probe : %'3.2lf, %'3.2lfl (%'15" PRIu64 " attempts, %'15" PRIu64 " locals)\n"
+                                "- Idle Sleep     : %'15" PRIu64 "h, %'15" PRIu64 "c, %'15" PRIu64 "w, %'15" PRIu64 "e\n"
+                                "- totals   : %'3" PRIu64 " run, %'3" PRIu64 " schd (%'" PRIu64 "ext, %'" PRIu64 "mig, %'" PRId64 " )\n"
+                                "- push avg : %'3.2lf (l: %'3.2lf/%'" PRIu64 ", s: %'3.2lf/%'" PRIu64 ", e: %'3.2lf : %'" PRIu64 "e)\n"
+                                "- local    : %'3.2lf (%'3" PRIu64 " try, %'3" PRIu64 " spc, %'3" PRIu64 " lck, %'3" PRIu64 " ept)\n"
+                                "- help     : %'3.2lf (%'3" PRIu64 " try, %'3" PRIu64 " spc, %'3" PRIu64 " lck, %'3" PRIu64 " ept)\n"
+                                "- steal    : %'3.2lf (%'3" PRIu64 " try, %'3" PRIu64 " spc, %'3" PRIu64 " lck, %'3" PRIu64 " ept)\n"
+                                "- search   : %'3.2lf (%'3" PRIu64 " try, %'3" PRIu64 " spc, %'3" PRIu64 " lck, %'3" PRIu64 " ept)\n"
+                                "- Idle Slp : %'3" PRIu64 "h, %'3" PRIu64 "c, %'3" PRIu64 "w, %'3" PRIu64 "e\n"
                                 "\n"
                                 , type, name, id
+                                , ready.pick.pop.success
+                                , ready.pick.push.success + ready.pick.ext.success
+                                , ready.pick.ext.success, ready.threads.migration, ready.threads.threads
+                                , push_len, lpush_len, ready.pick.push.attempt, ready.pick.push.local
+                                , ext_len , lext_len , ready.pick.ext .attempt, ready.pick.ext .local
+                                , pop_len , lpop_len , ready.pick.pop .attempt, ready.pick.pop .local
+                                , ready.pop.local.success + ready.pop.help.success + ready.pop.steal.success + ready.pop.search.success
+                                , ready.push.local.success + ready.push.share.success + ready.push.extrn.success
+                                , ready.push.extrn.success, ready.threads.migration, ready.threads.threads
+                                , push_len, sLcl_len, ready.push.local.attempt, sOth_len, ready.push.share.attempt, sExt_len, ready.push.extrn.attempt
+                                , rLcl_len, ready.pop.local .attempt, ready.pop.local .espec, ready.pop.local .elock, ready.pop.local .eempty
+                                , rHlp_len, ready.pop.help  .attempt, ready.pop.help  .espec, ready.pop.help  .elock, ready.pop.help  .eempty
+                                , rStl_len, ready.pop.steal .attempt, ready.pop.steal .espec, ready.pop.steal .elock, ready.pop.steal .eempty
+                                , rSch_len, ready.pop.search.attempt, ready.pop.search.espec, ready.pop.search.elock, ready.pop.search.eempty
                                 , ready.sleep.halts, ready.sleep.cancels, ready.sleep.wakes, ready.sleep.exits
                         );

libcfa/src/concurrency/stats.hfa

-              r6a8208cb
+              r59f3f61
         static inline void __print_stats( struct __stats_t *, int, const char *, const char *, void * ) {}
 #else
+        struct __stats_readyQ_pop_t {
+                // number of attempts at popping something
+                volatile uint64_t attempt;
+        struct __attribute__((aligned(64))) __stats_readQ_t {
+                // number of successes at popping
+                volatile uint64_t success;
+                // number of attempts failed due to the lock being held
+                volatile uint64_t elock;
+                // number of attempts failed due to the queue being empty (lock held)
+                volatile uint64_t eempty;
+                // number of attempts failed due to the queue looking empty (lock not held)
+                volatile uint64_t espec;
+        };
+        struct __attribute__((aligned(64))) __stats_readyQ_t {
+                // Push statistic
                 struct {
-                        // Push statistic
                         struct {
                                 // number of attemps at pushing something
+                                // number of attemps at pushing something to preferred queues
                                 volatile uint64_t attempt;
                                 // number of successes at pushing
+                                // number of successes at pushing to preferred queues
                                 volatile uint64_t success;
+                        }
+                        // Stats for local queue within cluster
+                        local,
                                 // number of attemps at pushing something to preferred queues
                                 volatile uint64_t local;
+                        // Stats for non-local queues within cluster
+                        share,
                                 // number of successes at pushing to preferred queues
                                 volatile uint64_t lsuccess;
                         } push;
+                        // Stats from outside cluster
+                        extrn;
+                } push;
+                        struct {
+                                // number of attemps at pushing something
+                                volatile uint64_t attempt;
+                // Pop statistic
+                struct {
+                        // pop from local queue
+                        __stats_readyQ_pop_t local;
                                 // number of successes at pushing
                                 volatile uint64_t success;
+                        // pop before looking at local queue
+                        __stats_readyQ_pop_t help;
                                 // number of attemps at pushing something to preferred queues
                                 volatile uint64_t local;
+                        // pop from some other queue
+                        __stats_readyQ_pop_t steal;
                                 // number of successes at pushing to preferred queues
                                 volatile uint64_t lsuccess;
                         } ext;
+                        // pop when searching queues sequentially
+                        __stats_readyQ_pop_t search;
+                } pop;
-                        // Pop statistic
-                        struct {
-                                // number of reads of the mask
-                                // picking an empty __cfa_readyQ_mask_t counts here
-                                // but not as an attempt
-                                volatile uint64_t probe;
-                                // number of attemps at poping something
-                                volatile uint64_t attempt;
-                                // number of successes at poping
-                                volatile uint64_t success;
-                                // number of attemps at poping something to preferred queues
-                                volatile uint64_t local;
-                                // number of successes at poping to preferred queues
-                                volatile uint64_t lsuccess;
-                        } pop;
-                } pick;
                 struct {
                         volatile uint64_t migration;
 …
         struct __attribute__((aligned(128))) __stats_t {
                 __stats_readQ_t ready;
+                __stats_readyQ_t ready;
                 #if defined(CFA_HAVE_LINUX_IO_URING_H)
                         __stats_io_t    io;

libcfa/src/concurrency/thread.cfa

r6a8208cb	r59f3f61
39	39	link.next = 0p;
40	40	link.prev = 0p;
41		~~link.preferred = -1;~~
42	41	#if defined( __CFA_WITH_VERIFY__ )
43	42	canary = 0x0D15EA5E0D15EA5Ep;

libcfa/src/math.hfa

-              r6a8208cb
+              r59f3f61
 // file "LICENCE" distributed with Cforall.
 //
 // math --
+// math.hfa --
 //
 // Author           : Peter A. Buhr
 // Created On       : Mon Apr 18 23:37:04 2016
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Mon Apr 12 18:35:39 2021
 // Update Count     : 131
+// Last Modified On : Thu Apr 15 11:47:56 2021
+// Update Count     : 132
 //
 …
         int log2( unsigned int n ) { return n == 0 ? -1 : sizeof(n) * __CHAR_BIT__ - 1 - __builtin_clz( n ); }
         long int log2( unsigned long int n ) { return n == 0 ? -1 : sizeof(n) * __CHAR_BIT__ - 1 - __builtin_clzl( n ); }
         long long int log2( unsigned long long int n ) { return n == 0 ? -1 : sizeof(n) * __CHAR_BIT__ - 1 - __builtin_clzl( n ); }
+        long long int log2( unsigned long long int n ) { return n == 0 ? -1 : sizeof(n) * __CHAR_BIT__ - 1 - __builtin_clzll( n ); }
         float log2( float x ) { return log2f( x ); }
         // extern "C" { double log2( double ); }

libcfa/src/time.hfa

-              r6a8208cb
+              r59f3f61
 // Created On       : Wed Mar 14 23:18:57 2018
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Wed Jun 17 16:13:00 2020
 // Update Count     : 663
+// Last Modified On : Wed Apr 14 09:30:30 2021
+// Update Count     : 664
 //
 …
 static inline {
         Duration ?=?( Duration & dur, __attribute__((unused)) zero_t ) { return dur{ 0 }; }
+        void ?{}( Duration & dur, timeval t ) with( dur ) { tn = (int64_t)t.tv_sec * TIMEGRAN + t.tv_usec * 1000; }
+        Duration ?=?( Duration & dur, timeval t ) with( dur ) {
+                tn = (int64_t)t.tv_sec * TIMEGRAN + t.tv_usec * (TIMEGRAN / 1_000_000LL);
+                return dur;
+        } // ?=?
+        void ?{}( Duration & dur, timespec t ) with( dur ) { tn = (int64_t)t.tv_sec * TIMEGRAN + t.tv_nsec; }
+        Duration ?=?( Duration & dur, timespec t ) with( dur ) {
+                tn = (int64_t)t.tv_sec * TIMEGRAN + t.tv_nsec;
+                return dur;
+        } // ?=?
         Duration +?( Duration rhs ) with( rhs ) { return (Duration)@{ +tn }; }

src/Parser/parser.yy

-              r6a8208cb
+              r59f3f61
 // Created On       : Sat Sep  1 20:22:55 2001
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Thu Apr  1 14:43:24 2021
 // Update Count     : 4978
+// Last Modified On : Wed Apr 14 18:13:44 2021
+// Update Count     : 4983
 //
 …
 %token ATTRIBUTE EXTENSION                                                              // GCC
 %token IF ELSE SWITCH CASE DEFAULT DO WHILE FOR BREAK CONTINUE GOTO RETURN
+%token CHOOSE DISABLE ENABLE FALLTHRU FALLTHROUGH TRY THROW THROWRESUME AT WITH WHEN WAITFOR // CFA
+%token CHOOSE FALLTHRU FALLTHROUGH WITH WHEN WAITFOR    // CFA
+%token DISABLE ENABLE TRY THROW THROWRESUME AT                  // CFA
 %token ASM                                                                                              // C99, extension ISO/IEC 9899:1999 Section J.5.10(1)
 %token ALIGNAS ALIGNOF GENERIC STATICASSERT                             // C11

tests/.expect/math.nast.x86.txt

r6a8208cb	r59f3f61
19	19	log2:10 17 23
20	20	log2:10 17 23
21		log2:~~42 49 55~~
	21	log2:10 17 23
22	22	log2:3. 3. 3.
23	23	log10:2. 2. 2.

tests/concurrent/futures/multi.cfa

-              r6a8208cb
+              r59f3f61
 thread Server {
         int cnt, iteration;
+        int pending, done, iteration;
         multi_future(int) * request;
 };
 void ?{}( Server & this ) {
+        this.cnt = 0;
+        ((thread&)this){"Server Thread"};
+        this.pending = 0;
+        this.done = 0;
         this.iteration = 0;
         this.request = 0p;
 …
 void ^?{}( Server & mutex this ) {
         assert(this.cnt == 0);
     this.request = 0p;
+        assert(this.pending == 0);
+        this.request = 0p;
+}
 …
+}
+void process( Server & mutex this ) {
+        fulfil( *this.request, this.iteration );
+        this.iteration++;
+void call( Server & mutex this ) {
+        this.pending++;
+}
 void call( Server & mutex this ) {
         this.cnt++;
+void finish( Server & mutex this ) {
+        this.done++;
+}
-void finish( Server & mutex this ) { }
 void main( Server & this ) {
+        MAIN_LOOP:
         for() {
                 waitfor( ^?{} : this ) {
                         break;
+                }
+                or when( this.cnt < NFUTURES ) waitfor( call: this ) {
+                        if (this.cnt == NFUTURES) {
+                                process(this);
+                or waitfor( call: this ) {
+                        if (this.pending != NFUTURES) { continue MAIN_LOOP; }
+                        this.pending = 0;
+                        fulfil( *this.request, this.iteration );
+                        this.iteration++;
+                        for(NFUTURES) {
+                                waitfor( finish: this );
+                        }
+                }
+                or waitfor( finish: this ) {
+                        if (this.cnt == NFUTURES) {
+                                reset( *this.request );
+                                this.cnt = 0;
+                        }
+                        reset( *this.request );
+                        this.done = 0;
+                }
+        }
 …
 Server * the_server;
 thread Worker {};
+void ?{}(Worker & this) {
+        ((thread&)this){"Worker Thread"};
+}
 multi_future(int) * shared_future;

tests/concurrent/spinaphore.cfa

-              r6a8208cb
+              r59f3f61
 void main(Unblocker & this) {
         this.sum = 0;
         unsigned me = (unsigned)&this;
+        unsigned me = (unsigned)(uintptr_t)&this;
         for(num_unblocks) {
                 $thread * t = V(sem, false);
                 Blocker * b = from_thread(t);
                 b->sum += me;
                 this.sum += (unsigned)b;
+                this.sum += (unsigned)(uintptr_t)b;
                 unpark(t);
                 yield(random(10));
 …
                 for(i;num_blockers) {
                         for(num_blocks)
                                 usum += (unsigned)&blockers[i];
+                                usum += (unsigned)(uintptr_t)&blockers[i];
+                }
                 for(i;num_unblockers) {
                         for(num_unblocks)
                                 bsum += (unsigned)&unblockers[i];
+                                bsum += (unsigned)(uintptr_t)&unblockers[i];
+                }

tests/time.cfa

-              r6a8208cb
+              r59f3f61
 // Created On       : Tue Mar 27 17:24:56 2018
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Thu Jun 18 18:14:49 2020
 // Update Count     : 37
+// Last Modified On : Fri Apr 16 14:59:53 2021
+// Update Count     : 38
 //
 …
         //       | "Newfoundland" | getTime( Newfoundland )
         //       | "local" | getTime()
         //       | "local nsec" | getTimeNsec()
+        //       | "local nsec" | timeHiRes()
         //       | "PST" | PST();                                                               // getTime short form
         // sout | nl;

tools/gdb/utils-gdb.py

-              r6a8208cb
+              r59f3f61
 gdb.execute('handle SIGUSR1 nostop noprint pass')
 CfaTypes = collections.namedtuple('CfaTypes', 'cluster_ptr processor_ptr thread_ptr int_ptr thread_state')
+CfaTypes = collections.namedtuple('CfaTypes', 'cluster_ptr processor_ptr thread_ptr int_ptr thread_state yield_state')
 class ThreadInfo:
 …
         # GDB types for various structures/types in CFA
         return CfaTypes(cluster_ptr = gdb.lookup_type('struct cluster').pointer(),
+                                  processor_ptr = gdb.lookup_type('struct processor').pointer(),
+                                         thread_ptr = gdb.lookup_type('struct $thread').pointer(),
+                                                int_ptr = gdb.lookup_type('int').pointer(),
+                                   thread_state = gdb.lookup_type('enum __Coroutine_State'))
+                processor_ptr = gdb.lookup_type('struct processor').pointer(),
+                thread_ptr = gdb.lookup_type('struct $thread').pointer(),
+                int_ptr = gdb.lookup_type('int').pointer(),
+                thread_state = gdb.lookup_type('enum __Coroutine_State'),
+                yield_state = gdb.lookup_type('enum __Preemption_Reason'))
 def get_addr(addr):
 …
         def print_thread(self, thread, tid, marked):
                 cfa_t = get_cfa_types()
+                self.print_formatted(marked, tid, thread['self_cor']['name'].string(), str(thread['state'].cast(cfa_t.thread_state)), str(thread))
+                ys = str(thread['preempted'].cast(cfa_t.yield_state))
+                if ys == '_X15__NO_PREEMPTIONKM19__Preemption_Reason_1':
+                        state = str(thread['state'].cast(cfa_t.thread_state))
+                elif ys == '_X18__ALARM_PREEMPTIONKM19__Preemption_Reason_1':
+                        state = 'preempted'
+                elif ys == '_X19__MANUAL_PREEMPTIONKM19__Preemption_Reason_1':
+                        state = 'yield'
+                elif ys == '_X17__POLL_PREEMPTIONKM19__Preemption_Reason_1':
+                        state = 'poll'
+                else:
+                        print("error: thread {} in undefined preemption state {}".format(thread, ys))
+                        state = 'error'
+                self.print_formatted(marked, tid, thread['self_cor']['name'].string(), state, str(thread))
         def print_threads_by_cluster(self, cluster, print_system = False):
 …
                 context = thread['context']
+                # must be at frame 0 to set pc register
+                gdb.execute('select-frame 0')
+                if gdb.selected_frame().architecture().name() != 'i386:x86-64':
+                        print('gdb debugging only supported for i386:x86-64 for now')
+                        return
+                # gdb seems to handle things much better if we pretend we just entered the context switch
+                # pretend the pc is __cfactx_switch and adjust the sp, base pointer doesn't need to change
                 # lookup for sp,fp and uSwitch
                 xsp = context['SP'] + 48
+                xsp = context['SP'] + 40 # 40 = 5 64bit registers : %r15, %r14, %r13, %r12, %rbx WARNING: x64 specific
                 xfp = context['FP']
                 # convert string so we can strip out the address
                 try:
                         xpc = get_addr(gdb.parse_and_eval('__cfactx_switch').address + 28)
+                        xpc = get_addr(gdb.parse_and_eval('__cfactx_switch').address)
                 except:
                         print("here")
                         return
-                # must be at frame 0 to set pc register
-                gdb.execute('select-frame 0')
                 # push sp, fp, pc into a global stack
 …
                 # update registers for new task
+                print('switching to ')
+                # print('switching to {} ({}) : [{}, {}, {}]'.format(thread['self_cor']['name'].string(), str(thread), str(xsp), str(xfp), str(xpc)))
+                print('switching to thread {} ({})'.format(str(thread), thread['self_cor']['name'].string()))
                 gdb.execute('set $rsp={}'.format(xsp))
                 gdb.execute('set $rbp={}'.format(xfp))
 …
                 argv = parse(arg)
-                print(argv)
                 if argv[0].isdigit():
                         cname = " ".join(argv[1:]) if len(argv) > 1 else None

Context Navigation

Changes in / [6a8208cb:59f3f61]

Legend:

Download in other formats: