Changes in / [7f9968ad:8b58bae]
Files:
- 18 added
- 23 edited
benchmark/io/readv.cfa
- Include the new shared benchmark helpers from ../benchcltr.hfa.
- Drop the file-local `the_cluster` global and the `my_processor` wrapper; the benchmark now uses `BenchCluster`/`BenchProc` and `*the_benchmark_cluster`, and passes `cl.self` to `print_stats_at_exit`.
- Align the `Reader` thread type to 128 bytes. Readers now `park()` at startup (with a paranoid assert that `run` is set when they resume) instead of spinning on `yield()`.
- The driver sets `run = true`, explicitly `unpark()`s every reader, then calls the shared `wait(duration, start, end, is_tty)` helper in place of the hand-rolled 500 ms sleep loop; progress is only animated when stdout is a TTY, and the final message becomes "\nDone\n".
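The driver change boils down to a release-then-time loop. A minimal C++ sketch of that pattern is below; the names (`run`, `drive`) are illustrative and not the benchcltr.hfa API, which is not shown in this changeset.

```cpp
// Sketch of the wait()-style driver loop: release the parked workers, then
// poll until the duration elapses, animating progress only on a terminal.
#include <atomic>
#include <chrono>
#include <cstdio>
#include <thread>
#include <unistd.h>

std::atomic<bool> run{false};

void drive(double duration_s) {
	bool is_tty = isatty(STDOUT_FILENO);
	auto start = std::chrono::steady_clock::now();
	run.store(true, std::memory_order_relaxed);     // workers were waiting on this flag

	for (;;) {
		std::this_thread::sleep_for(std::chrono::milliseconds(500));
		double elapsed = std::chrono::duration<double>(
			std::chrono::steady_clock::now() - start).count();
		if (elapsed >= duration_s) break;
		if (is_tty) { printf("\r%.1f", elapsed); fflush(stdout); }  // only animate on a TTY
	}

	run.store(false, std::memory_order_relaxed);
	printf("\nDone\n");
}
```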
doc/theses/thierry_delisle_PhD/code/relaxed_list.cpp
- The benchmark no longer hard-codes `relaxed_list`: it includes `LIST_VARIANT_HPP` (defaulting to "relaxed_list.hpp"), errors out if `LIST_VARIANT` is undefined, and uses `LIST_VARIANT<Node>` for the thread-local TLS, the global stats, list construction and all worker signatures. The per-type static `head` instance list is replaced by a `std::atomic_uint32_t ticket` counter, and the chosen variant is printed at startup via `LIST_VARIANT<Node>::name()`.
- List construction now takes `{ nthread, nqueues }` instead of a single `nthread * nqueues` count.
- Local and global statistics gain `comp` and `subm` value/count pairs; `tally_stats` accumulates them and `print_stats` reports submit/complete counts and averages (the max/min-runs block moves ahead of the duration line).
- The progress indicator only redraws when stdout is a TTY.
- A new "Producer" benchmark is added: a cache-line-aligned `Slot` array (4 slots per node), `runProducer_body`/`runProducer`, a `Producer` case in the benchmark enum, parsing of an optional NNODES argument (default 32), a usage line, and the `-b producer` dispatch. The node with id 0 sweeps the slots, recycling whatever it finds and counting completions; every other popped node is parked in a random empty slot with a compare-and-swap, counting submission attempts.
- PingPong no longer resets `nslots` and its error message now says it accepts at most 1 extra argument.
- The fairness HTML dump is disabled: the `save_fairness` call, its definition and the `<png.h>`/`<setjmp.h>` includes are commented out.
doc/theses/thierry_delisle_PhD/code/relaxed_list.hpp
- The header now names itself as a variant: `LIST_VARIANT` is defined to `relaxed_list` and a `VARIANT` macro selects one of VANILLA, SNZI, BITMASK, DISCOVER, SNZM or BIAS (default VANILLA). New includes: <cmath>, "links.hpp", "snzi.hpp" and "snzm.hpp".
- `spinlock_t` and the `bts`/`btr` helpers move out of this header (into utils.hpp), and the local `_LinksFields_t` and nested `intrusive_queue_t` definitions are dropped in favour of the shared `intrusive_queue_t<node_t>` from links.hpp.
- `pick_stat` gains `push.local`, `pop.mask_reset` and `pop.local` counters.
- The constructor now takes `(numThreads, numQueues)` with `numLists = numThreads * numQueues`, initializes a `snzi_t` (SNZI/BIAS), a `snzm_t` (SNZM/DISCOVER) or the old `numNonEmpty`/`list_mask` fields depending on the variant, and no longer maintains a static linked list of instances. A static `name()` returns a human-readable variant name and a static `ticket` counter hands out per-thread queue ranges.
- `push()` gains a BIAS path that keeps a thread on one of its own 4 sub-queues about 15 times out of 16, and the non-empty transition is recorded per variant (snzi.arrive, snzm.arrive, the bitmask bts, or numNonEmpty++).
- `pop()` is now variant-specific: DISCOVER keeps a per-thread mask of queues seen non-empty (resetting it when it runs out), SNZI and BIAS loop while `snzi.query()` holds (BIAS with the same local-queue bias), SNZM picks queues through the snzm masks with an optional BMI2 `_pext_u64` fast path, and BITMASK keeps the old behaviour. `try_pop()` mirrors this on the empty transition (snzi/snzm depart, btr, or numNonEmpty--).
- The per-thread TLS adds `my_queue = (ticket++) * 4` and a 64-byte-aligned discovered mask.
- Statistics printing drops the per-instance local queue dump and now reports pick success rates together with average search lengths, mask resets and local push/pop counts.
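The BIAS selection is easy to lift out of the diff. A short C++ sketch of that pick, under the assumption (matching the new TLS field) that each thread owns a contiguous block of 4 sub-queues:

```cpp
// BIAS queue selection from push()/pop(): ~15/16 of picks stay on the
// thread's own 4 sub-queues, 1/16 go anywhere so work still mixes.
#include <atomic>
#include <cstdint>

static std::atomic<uint32_t> ticket{0};
thread_local unsigned my_queue = ticket.fetch_add(1) * 4;  // 4 sub-queues per thread

unsigned pick_queue(unsigned rnd, unsigned numLists) {
	unsigned i;
	if ((rnd & 0xF) == 0) {
		i = rnd >> 4;                       // 1 in 16: fully random queue
	} else {
		i = my_queue + ((rnd >> 4) % 4);    // 15 in 16: one of this thread's queues
	}
	return i % numLists;
}
```

The low 4 bits of one random draw decide local versus global, and the remaining bits pick the queue, so a single RNG call covers both decisions.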
doc/theses/thierry_delisle_PhD/code/utils.hpp
- `rand_bit` is now also declared `__attribute__((artificial))`.
- `spinlock_t` and the inline-assembly `bts`/`btr` bit test-and-set/reset helpers move here from relaxed_list.hpp so every list variant can share them.
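The relocated lock is a test-and-test-and-set spinlock with a pause hint. This is an equivalent portable C++ sketch (the original issues `asm volatile("pause")` directly and keeps the GCC builtins):

```cpp
// Test-and-test-and-set spinlock: only attempt the exchange when the lock
// looks free, and pause while spinning to ease cache-line contention.
#include <atomic>
#if defined(__x86_64__) || defined(__i386__)
#include <immintrin.h>   // _mm_pause
#endif

struct spinlock {
	std::atomic<bool> ll{false};

	void lock() {
		while (__builtin_expect(ll.exchange(true, std::memory_order_acquire), false)) {
			while (ll.load(std::memory_order_relaxed)) {
#if defined(__x86_64__) || defined(__i386__)
				_mm_pause();
#endif
			}
		}
	}

	bool try_lock() { return !ll.exchange(true, std::memory_order_acquire); }
	void unlock()   { ll.store(false, std::memory_order_release); }
};
```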
libcfa/src/Makefile.am
- `thread_libsrc` gains concurrency/ready_queue.cfa and concurrency/stats.cfa.
libcfa/src/Makefile.in
- Regenerated to match Makefile.am: the thread library sources, libtool objects and build rules now include concurrency/ready_queue and concurrency/stats.
libcfa/src/bits/debug.hfa
- The debug-print guard now also pulls in <stdio.h>/<unistd.h> when __CFA_DEBUG_PRINT_READY_QUEUE__ is defined.
libcfa/src/bits/defs.hfa
- Adds `__atomic_bts` and `__atomic_btr` atomic bit test-and-set/reset helpers for __i386 and __x86_64, implemented with `lock btsl/btsq` and `lock btrl/btrq` inline assembly; defining __CFA_NO_BIT_TEST_AND_SET__ selects a fallback built on `__atomic_fetch_or`/`__atomic_fetch_and`. ARM and unknown architectures hit an #error for now.
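The fallback path in the patch amounts to the following portable C++ sketch, which returns the bit's previous value just like the assembly versions:

```cpp
// Portable equivalents of the __CFA_NO_BIT_TEST_AND_SET__ fallback.
#include <atomic>
#include <cstddef>

// Set bit `bit`, return whether it was already set.
inline bool atomic_bts(std::atomic<size_t>& target, size_t bit) {
	size_t mask = size_t(1) << bit;
	return (target.fetch_or(mask, std::memory_order_relaxed) & mask) != 0;
}

// Clear bit `bit`, return whether it was set beforehand.
inline bool atomic_btr(std::atomic<size_t>& target, size_t bit) {
	size_t mask = size_t(1) << bit;
	return (target.fetch_and(~mask, std::memory_order_relaxed) & mask) != 0;
}
```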
libcfa/src/concurrency/invoke.h
- `kernelTLS` gains a `struct __stats_t * volatile this_stats` pointer and its `rand_seed` widens from uint32_t to __uint128_t.
- `enum coroutine_state` becomes `__Coroutine_State` and loses the `Rerun` value.
- A new `struct __thread_desc_link { next, prev, ts, preferred }` replaces the bare `next` pointer in `$thread`; `get_next()` now returns `this.link.next`.
- `$thread` gains a `volatile int ticket`, packs `state` and `preempted` into 8-bit enum bitfields, and moves `curr_cluster` and the link field next to the core scheduling data. The debug park/unpark bookkeeping now records an int result plus a separate `__Coroutine_State`.
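For readers unfamiliar with the layout, a minimal C++ sketch of the embedded link record; `thread_desc` stands in for `$thread` and only the fields shown in this hunk are reproduced:

```cpp
// Intrusive link record embedded in each thread descriptor: the ready queue
// threads descriptors together without any separate allocation.
struct thread_desc;

struct thread_link {
	thread_desc* next = nullptr;
	thread_desc* prev = nullptr;
	volatile unsigned long long ts = 0;   // enqueue timestamp
	int preferred = 0;                    // preferred sub-queue hint
};

struct thread_desc {
	// ... coroutine context, state, ticket ...
	thread_link link;                     // lives inside the descriptor
};

// Mirrors the updated get_next() accessor in invoke.h.
inline thread_desc*& get_next(thread_desc& t) { return t.link.next; }
```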
libcfa/src/concurrency/io.cfa
- The per-ring statistics blocks in the submit and completion queues are removed, along with the end-of-run "I/O uRing Stats" printout; all counters now go through the per-thread `__tls_stats()->io.*` fields (bumped with interrupts disabled on the fast paths) and are tallied with the rest of the cluster statistics.
- The slow poller gains a `__processor_id_t id`, sets up its own thread-local `__stats_t`, and registers/unregisters itself with doregister/unregister; `__unpark` calls now pass the poller's id.
- Tearing down the user-thread fast poller now takes the ready-schedule lock, removes the preempted poller thread with `remove_head( &this, &thrd )`, resets its `ticket` and preemption state, and checks `this.nprocessors == 0` instead of walking the old procs/idles lists.
- `__submit` clears the consumed sqe's `user_data`, and the random offset used when allocating sqes is drawn with interrupts disabled.
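The statistics rework follows a common pattern: bump plain thread-local counters on the hot path and fold them into shared atomics once per thread. A C++ sketch of that pattern, with illustrative field names rather than the real `__stats_t` layout:

```cpp
// Thread-local counters on the fast path, one atomic tally when the
// kernel/poller thread exits.
#include <atomic>
#include <cstdint>

struct io_stats { uint64_t submit_cnt = 0, completed = 0; };

struct global_io_stats {
	std::atomic<uint64_t> submit_cnt{0}, completed{0};

	void tally(const io_stats& local) {            // called once per exiting thread
		submit_cnt.fetch_add(local.submit_cnt, std::memory_order_relaxed);
		completed.fetch_add(local.completed, std::memory_order_relaxed);
	}
};

thread_local io_stats tls_io_stats;                 // cheap to bump, no cache-line ping-pong
inline io_stats* tls_stats() { return &tls_io_stats; }

inline void note_submit() { tls_stats()->submit_cnt += 1; }   // fast path: no atomics
```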
libcfa/src/concurrency/kernel.cfa
- The scheduler forward declarations change: `__wake_one` now takes a `__processor_id_t *`, `__halt` no longer returns a thread, and a `__has_next_thread` helper is declared.
- New kernel storage: a global `__scheduler_RWLock_t * __scheduler_lock` plus statistics storage for the main processor; the `kernelTLS` initializer gains the stats pointer.
- Thread initialization sets `ticket = 1` and the new `link.next`/`link.prev` fields.
- Processors get a cluster-assigned `id`, atomically bump/drop `cltr->nprocessors`, and on startup register with `doregister((__processor_id_t*)this)` and grow the ready queue under `ready_mutate_lock`; on shutdown they shrink it, remove themselves from the idle list with `unsafe_remove`, and unregister. Clusters lose the procs/idles registration and the ready_queue_lock, and instead carry `nprocessors` and a heap-allocated `__stats_t` that is printed and freed at destruction.
- The scheduling loop prefetches `readyThread->context.SP`, halts only when `__next_thread` finds nothing, and counts thread migrations between processors.
- `__run_thread` resolves the park/unpark race with the new `ticket` counter (fetch-sub on block: an old value of 1 means stay blocked, 2 means an unpark raced in, so run the thread again) instead of exchanging the old Blocked/Rerun state, and handles Halted threads before the blocking path.
- `__invoke_processor` sets up a thread-local `__stats_t` and tallies it into the cluster statistics on exit.
- `__schedule_thread` now takes the scheduling `__processor_id_t *` as an argument (the diff is cut off at this point in the changeset view).
kernelTLS.preemption_state.enabled ); 595 668 /* paranoid */ #if defined( __CFA_WITH_VERIFY__ ) 596 /* paranoid */ if( thrd->state == Blocked || thrd->state == Start ) assertf( thrd->preempted == __NO_PREEMPTION,597 598 /* paranoid */ if( thrd->preempted != __NO_PREEMPTION ) assertf(thrd->state == Active || thrd->state == Rerun,599 669 /* paranoid */ if( thrd->state == Blocked || thrd->state == Start ) assertf( thrd->preempted == __NO_PREEMPTION, 670 "Error inactive thread marked as preempted, state %d, preemption %d\n", thrd->state, thrd->preempted ); 671 /* paranoid */ if( thrd->preempted != __NO_PREEMPTION ) assertf(thrd->state == Active, 672 "Error preempted thread marked as not currently running, state %d, preemption %d\n", thrd->state, thrd->preempted ); 600 673 /* paranoid */ #endif 601 /* paranoid */ verifyf( thrd-> next == 0p, "Expected null got %p", thrd->next );674 /* paranoid */ verifyf( thrd->link.next == 0p, "Expected null got %p", thrd->link.next ); 602 675 603 676 if (thrd->preempted == __NO_PREEMPTION) thrd->state = Ready; 604 677 605 lock ( ready_queue_lock __cfaabi_dbg_ctx2 ); 606 bool was_empty = !(ready_queue != 0); 607 append( ready_queue, thrd ); 608 unlock( ready_queue_lock ); 609 610 __wake_one(thrd->curr_cluster, was_empty); 678 ready_schedule_lock ( id ); 679 push( thrd->curr_cluster, thrd ); 680 681 #if !defined(__CFA_NO_STATISTICS__) 682 bool woke = 683 #endif 684 __wake_one(id, thrd->curr_cluster); 685 686 #if !defined(__CFA_NO_STATISTICS__) 687 if(woke) __tls_stats()->ready.sleep.wakes++; 688 #endif 689 ready_schedule_unlock( id ); 611 690 612 691 /* paranoid */ verify( ! kernelTLS.preemption_state.enabled ); … … 617 696 /* paranoid */ verify( ! kernelTLS.preemption_state.enabled ); 618 697 619 lock( ready_queue_lock __cfaabi_dbg_ctx2);620 $thread * head = pop_head( ready_queue);621 unlock( ready_queue_lock);698 ready_schedule_lock ( (__processor_id_t*)kernelTLS.this_processor ); 699 $thread * head = pop( this ); 700 ready_schedule_unlock( (__processor_id_t*)kernelTLS.this_processor ); 622 701 623 702 /* paranoid */ verify( ! kernelTLS.preemption_state.enabled ); … … 625 704 } 626 705 706 // KERNEL ONLY 707 static bool __has_next_thread(cluster * this) with( *this ) { 708 /* paranoid */ verify( ! kernelTLS.preemption_state.enabled ); 709 710 ready_schedule_lock ( (__processor_id_t*)kernelTLS.this_processor ); 711 bool not_empty = query( this ); 712 ready_schedule_unlock( (__processor_id_t*)kernelTLS.this_processor ); 713 714 /* paranoid */ verify( ! 
kernelTLS.preemption_state.enabled ); 715 return not_empty; 716 } 717 627 718 // KERNEL ONLY unpark with out disabling interrupts 628 void __unpark( $thread * thrd __cfaabi_dbg_ctx_param2 ) { 629 static_assert(sizeof(thrd->state) == sizeof(int)); 630 719 void __unpark( struct __processor_id_t * id, $thread * thrd __cfaabi_dbg_ctx_param2 ) { 631 720 // record activity 632 721 __cfaabi_dbg_debug_do( char * old_caller = thrd->unpark_caller; ) 633 722 __cfaabi_dbg_record_thrd( *thrd, false, caller ); 634 723 635 enum coroutine_state old_state = __atomic_exchange_n(&thrd->state, Rerun, __ATOMIC_SEQ_CST);636 __cfaabi_dbg_debug_do( thrd->unpark_result = old_ state; )637 switch(old_ state) {638 case Active:724 int old_ticket = __atomic_fetch_add(&thrd->ticket, 1, __ATOMIC_SEQ_CST); 725 __cfaabi_dbg_debug_do( thrd->unpark_result = old_ticket; thrd->unpark_state = thrd->state; ) 726 switch(old_ticket) { 727 case 1: 639 728 // Wake won the race, the thread will reschedule/rerun itself 640 729 break; 641 case Blocked:730 case 0: 642 731 /* paranoid */ verify( ! thrd->preempted != __NO_PREEMPTION ); 732 /* paranoid */ verify( thrd->state == Blocked ); 643 733 644 734 // Wake lost the race, 645 thrd->state = Blocked; 646 __schedule_thread( thrd ); 735 __schedule_thread( id, thrd ); 647 736 break; 648 case Rerun:649 abort("More than one thread attempted to schedule thread %p\n", thrd);650 break;651 case Halted:652 case Start:653 case Primed:654 737 default: 655 738 // This makes no sense, something is wrong abort … … 662 745 663 746 disable_interrupts(); 664 __unpark( thrd __cfaabi_dbg_ctx_fwd2 );747 __unpark( (__processor_id_t*)kernelTLS.this_processor, thrd __cfaabi_dbg_ctx_fwd2 ); 665 748 enable_interrupts( __cfaabi_dbg_ctx ); 666 749 } … … 697 780 698 781 $thread * thrd = kernelTLS.this_thread; 699 /* paranoid */ verify(thrd->state == Active || thrd->state == Rerun);782 /* paranoid */ verify(thrd->state == Active); 700 783 701 784 // SKULLDUGGERY: It is possible that we are preempting this thread just before … … 704 787 // If that is the case, abandon the preemption. 
705 788 bool preempted = false; 706 if(thrd-> next == 0p) {789 if(thrd->link.next == 0p) { 707 790 preempted = true; 708 791 thrd->preempted = reason; … … 730 813 __cfa_dbg_global_clusters.list{ __get }; 731 814 __cfa_dbg_global_clusters.lock{}; 815 816 // Initialize the global scheduler lock 817 __scheduler_lock = (__scheduler_RWLock_t*)&storage___scheduler_lock; 818 (*__scheduler_lock){}; 732 819 733 820 // Initialize the main cluster … … 764 851 pending_preemption = false; 765 852 kernel_thread = pthread_self(); 853 id = -1u; 766 854 767 855 runner{ &this }; 768 856 __cfadbg_print_safe(runtime_core, "Kernel : constructed main processor context %p\n", &runner); 857 858 __atomic_fetch_add( &cltr->nprocessors, 1u, __ATOMIC_SEQ_CST ); 769 859 } 770 860 … … 774 864 (*mainProcessor){}; 775 865 866 mainProcessor->id = doregister( (__processor_id_t*)mainProcessor); 867 776 868 //initialize the global state variables 777 869 kernelTLS.this_processor = mainProcessor; 778 870 kernelTLS.this_thread = mainThread; 779 871 872 #if !defined( __CFA_NO_STATISTICS__ ) 873 kernelTLS.this_stats = (__stats_t *)& storage_mainProcStats; 874 __init_stats( kernelTLS.this_stats ); 875 #endif 876 780 877 // Enable preemption 781 878 kernel_start_preemption(); … … 783 880 // Add the main thread to the ready queue 784 881 // once resume is called on mainProcessor->runner the mainThread needs to be scheduled like any normal thread 785 __schedule_thread( mainThread);882 __schedule_thread((__processor_id_t *)mainProcessor, mainThread); 786 883 787 884 // SKULLDUGGERY: Force a context switch to the main processor to set the main thread's context to the current UNIX … … 827 924 kernel_stop_preemption(); 828 925 926 unregister((__processor_id_t*)mainProcessor); 927 829 928 // Destroy the main processor and its context in reverse order of construction 830 929 // These were manually constructed so we need manually destroy them 831 930 void ^?{}(processor & this) with( this ){ 832 931 /* paranoid */ verify( this.do_terminate == true ); 932 __atomic_fetch_sub( &cltr->nprocessors, 1u, __ATOMIC_SEQ_CST ); 933 __cfaabi_dbg_print_safe("Kernel : destroyed main processor context %p\n", &runner); 833 934 } 834 935 … … 836 937 837 938 // Final step, destroy the main thread since it is no longer needed 939 838 940 // Since we provided a stack to this taxk it will not destroy anything 839 941 /* paranoid */ verify(mainThread->self_cor.stack.storage == (__stack_t*)(((uintptr_t)&storage_mainThreadCtx)| 0x1)); … … 842 944 ^(*mainCluster){}; 843 945 946 ^(*__scheduler_lock){}; 947 844 948 ^(__cfa_dbg_global_clusters.list){}; 845 949 ^(__cfa_dbg_global_clusters.lock){}; … … 851 955 // Kernel Idle Sleep 852 956 //============================================================================================= 853 static $thread * __halt(processor * this) with( *this ) {854 if( do_terminate ) return 0p;855 856 // First, lock the cluster idle857 lock( cltr->idle_lock __cfaabi_dbg_ctx2 );858 859 // Check if we can find a thread860 if( $thread * found = __next_thread( cltr ) ) {861 unlock( cltr->idle_lock );862 return found;863 }864 865 // Move this processor from the active list to the idle list866 move_to_front(cltr->procs, cltr->idles, *this);867 868 // Unlock the idle lock so we don't go to sleep with a lock869 unlock (cltr->idle_lock);870 871 // We are ready to sleep872 __cfadbg_print_safe(runtime_core, "Kernel : Processor %p ready to sleep\n", this);873 wait( idle );874 875 // We have woken up876 __cfadbg_print_safe(runtime_core, "Kernel : 
Processor %p woke up and ready to run\n", this);877 878 // Get ourself off the idle list879 with( *cltr ) {880 lock (idle_lock __cfaabi_dbg_ctx2);881 move_to_front(idles, procs, *this);882 unlock(idle_lock);883 }884 885 // Don't check the ready queue again, we may not be in a position to run a thread886 return 0p;887 }888 889 957 // Wake a thread from the front if there are any 890 static bool __wake_one(cluster * this, __attribute__((unused)) bool force) { 891 // if we don't want to force check if we know it's false 892 // if( !this->idles.head && !force ) return false; 893 894 // First, lock the cluster idle 895 lock( this->idle_lock __cfaabi_dbg_ctx2 ); 896 897 // Check if there is someone to wake up 898 if( !this->idles.head ) { 899 // Nope unlock and return false 900 unlock( this->idle_lock ); 901 return false; 902 } 903 904 // Wake them up 905 __cfadbg_print_safe(runtime_core, "Kernel : waking Processor %p\n", this->idles.head); 906 /* paranoid */ verify( ! kernelTLS.preemption_state.enabled ); 907 post( this->idles.head->idle ); 908 909 // Unlock and return true 910 unlock( this->idle_lock ); 958 static bool __wake_one(struct __processor_id_t * id, cluster * this) { 959 /* paranoid */ verify( ready_schedule_islocked( id ) ); 960 961 // Check if there is a sleeping processor 962 processor * p = pop(this->idles); 963 964 // If no one is sleeping, we are done 965 if( 0p == p ) return false; 966 967 // We found a processor, wake it up 968 post( p->idle ); 969 911 970 return true; 912 971 } … … 922 981 923 982 return ret; 983 } 984 985 static void __halt(processor * this) with( *this ) { 986 if( do_terminate ) return; 987 988 #if !defined(__CFA_NO_STATISTICS__) 989 __tls_stats()->ready.sleep.halts++; 990 #endif 991 // Push self to queue 992 push(cltr->idles, *this); 993 994 // Makre sure we don't miss a thread 995 if( __has_next_thread(cltr) ) { 996 // A thread was posted, make sure a processor is woken up 997 struct __processor_id_t *id = (struct __processor_id_t *) this; 998 ready_schedule_lock ( id ); 999 __wake_one( id, cltr ); 1000 ready_schedule_unlock( id ); 1001 #if !defined(__CFA_NO_STATISTICS__) 1002 __tls_stats()->ready.sleep.cancels++; 1003 #endif 1004 } 1005 1006 wait( idle ); 924 1007 } 925 1008 … … 1078 1161 cltr->nthreads -= 1; 1079 1162 unlock(cltr->thread_list_lock); 1080 }1081 1082 void doregister( cluster * cltr, processor * proc ) {1083 lock (cltr->idle_lock __cfaabi_dbg_ctx2);1084 cltr->nprocessors += 1;1085 push_front(cltr->procs, *proc);1086 unlock (cltr->idle_lock);1087 }1088 1089 void unregister( cluster * cltr, processor * proc ) {1090 lock (cltr->idle_lock __cfaabi_dbg_ctx2);1091 remove(cltr->procs, *proc );1092 cltr->nprocessors -= 1;1093 unlock(cltr->idle_lock);1094 1163 } 1095 1164 -
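The kernel.cfa hunks above replace the old enum-based park/unpark handshake (Blocked/Rerun/Active exchanged on thrd->state) with a ticket counter: the processor finishing a park does an atomic fetch-sub on thrd->ticket, the waker does an atomic fetch-add, and whichever side observes that it arrived second is responsible for rescheduling the thread. The following is a minimal plain-C sketch of that protocol only; the names (task, finish_park, do_unpark, enqueue_ready) are illustrative and are not the Cforall runtime API.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdlib.h>

    // ticket values: 1 = running with no pending wake, 0 = parked, 2 = wake arrived early
    struct task { atomic_int ticket; /* starts at 1 when the task is created */ };

    extern void enqueue_ready(struct task *);      // assumed: pushes the task onto a ready queue

    // Called by the scheduler once the task has context-switched away.
    // Returns true when a wake raced with the park and the task must run again.
    static bool finish_park(struct task * t) {
        int old = atomic_fetch_sub(&t->ticket, 1);
        if (old == 1) return false;                // regular case: task is now parked (ticket == 0)
        if (old == 2) return true;                 // racy case: an unpark got in first, rerun it
        abort();                                   // any other value means the protocol was violated
    }

    // Called by whoever wants to wake the task; safe even while the task is still blocking.
    static void do_unpark(struct task * t) {
        int old = atomic_fetch_add(&t->ticket, 1);
        if (old == 1) return;                      // wake won the race: finish_park() sees 2 and reruns
        else if (old == 0) enqueue_ready(t);       // wake lost the race: task is parked, schedule it
        else abort();                              // a second concurrent unpark is a fatal error
    }

As in the updated __unpark above, any ticket value other than 0 or 1 on the wake side indicates a double wake-up and is treated as a fatal error rather than a coroutine state to decode.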
libcfa/src/concurrency/kernel.hfa
r7f9968ad r8b58bae 23 23 #include "coroutine.hfa" 24 24 25 #include "containers/stackLockFree.hfa" 26 25 27 extern "C" { 26 28 #include <pthread.h> … … 47 49 extern struct cluster * mainCluster; 48 50 49 // Processor 51 // Processor id, required for scheduling threads 52 struct __processor_id_t { 53 unsigned id; 54 55 #if !defined(__CFA_NO_STATISTICS__) 56 struct __stats_t * stats; 57 #endif 58 }; 59 50 60 coroutine processorCtx_t { 51 61 struct processor * proc; … … 53 63 54 64 // Wrapper around kernel threads 55 struct processor {65 struct __attribute__((aligned(128))) processor { 56 66 // Main state 67 inline __processor_id_t; 68 69 // Cluster from which to get threads 70 struct cluster * cltr; 71 72 // Set to true to notify the processor should terminate 73 volatile bool do_terminate; 74 57 75 // Coroutine ctx who does keeps the state of the processor 58 76 struct processorCtx_t runner; 59 60 // Cluster from which to get threads61 struct cluster * cltr;62 77 63 78 // Name of the processor … … 81 96 __bin_sem_t idle; 82 97 83 // Termination84 // Set to true to notify the processor should terminate85 volatile bool do_terminate;86 87 98 // Termination synchronisation (user semaphore) 88 99 semaphore terminated; … … 92 103 93 104 // Link lists fields 94 struct __dbg_node_proc { 95 struct processor * next; 96 struct processor * prev; 97 } node; 105 Link(processor) link; 98 106 99 107 #ifdef __CFA_DEBUG__ … … 110 118 static inline void ?{}(processor & this, const char name[]) { this{name, *mainCluster }; } 111 119 112 static inline [processor *&, processor *& ] __get( processor & this ) __attribute__((const)) { return this.node.[next, prev]; }120 static inline Link(processor) * ?`next( processor * this ) { return &this->link; } 113 121 114 122 //----------------------------------------------------------------------------- … … 121 129 #define CFA_CLUSTER_IO_BUFFLEN_OFFSET 16 122 130 131 132 //----------------------------------------------------------------------------- 133 // Cluster Tools 134 135 // Intrusives lanes which are used by the relaxed ready queue 136 struct __attribute__((aligned(128))) __intrusive_lane_t; 137 void ?{}(__intrusive_lane_t & this); 138 void ^?{}(__intrusive_lane_t & this); 139 140 // Counter used for wether or not the lanes are all empty 141 struct __attribute__((aligned(128))) __snzi_node_t; 142 struct __snzi_t { 143 unsigned mask; 144 int root; 145 __snzi_node_t * nodes; 146 }; 147 148 void ?{}( __snzi_t & this, unsigned depth ); 149 void ^?{}( __snzi_t & this ); 150 151 //TODO adjust cache size to ARCHITECTURE 152 // Structure holding the relaxed ready queue 153 struct __ready_queue_t { 154 // Data tracking how many/which lanes are used 155 // Aligned to 128 for cache locality 156 __snzi_t snzi; 157 158 // Data tracking the actual lanes 159 // On a seperate cacheline from the used struct since 160 // used can change on each push/pop but this data 161 // only changes on shrink/grow 162 struct { 163 // Arary of lanes 164 __intrusive_lane_t * volatile data; 165 166 // Number of lanes (empty or not) 167 volatile size_t count; 168 } lanes; 169 }; 170 171 void ?{}(__ready_queue_t & this); 172 void ^?{}(__ready_queue_t & this); 173 123 174 //----------------------------------------------------------------------------- 124 175 // Cluster 125 struct cluster { 126 // Ready queue locks 127 __spinlock_t ready_queue_lock; 128 176 struct __attribute__((aligned(128))) cluster { 129 177 // Ready queue for threads 130 __ queue_t($thread)ready_queue;178 __ready_queue_t 
ready_queue; 131 179 132 180 // Name of the cluster … … 136 184 Duration preemption_rate; 137 185 138 // List of processors 139 __spinlock_t idle_lock; 140 __dllist_t(struct processor) procs; 141 __dllist_t(struct processor) idles; 142 unsigned int nprocessors; 186 // List of idle processors 187 StackLF(processor) idles; 188 volatile unsigned int nprocessors; 143 189 144 190 // List of threads … … 157 203 #if !defined(__CFA_NO_STATISTICS__) 158 204 bool print_stats; 205 struct __stats_t * stats; 159 206 #endif 160 207 }; -
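The new __ready_queue_t above replaces the single lock-protected __queue_t with an array of intrusive lanes plus an SNZI (scalable non-zero indicator) that tracks whether any lane is non-empty, and it pads the hot structures to 128 bytes so neighbouring processors do not share cache lines. The sketch below shows only the sharding idea in plain C, under the assumption of one spinlock per lane and random lane selection; the lane type, the SNZI, and the grow/shrink logic of the real queue are omitted, and all names are illustrative.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    struct task { struct task * next; };

    struct lane {                                      // one FIFO shard of the ready queue
        atomic_bool    lock;
        struct task *  head;
        struct task ** tail;
    } __attribute__((aligned(128)));                   // cache-line padding against false sharing

    struct ready_queue {
        struct lane * data;                            // array of lanes
        size_t        count;                           // number of lanes (empty or not)
    };

    static void lane_init(struct lane * l)    { atomic_init(&l->lock, false); l->head = NULL; l->tail = &l->head; }
    static void lane_acquire(struct lane * l) { while (atomic_exchange(&l->lock, true)) { /* spin */ } }
    static void lane_release(struct lane * l) { atomic_store(&l->lock, false); }

    // Push onto a randomly chosen lane: contention is spread over 'count' independent locks.
    static void rq_push(struct ready_queue * rq, struct task * t, uint64_t rnd) {
        struct lane * l = &rq->data[rnd % rq->count];
        lane_acquire(l);
        t->next  = NULL;
        *l->tail = t;
        l->tail  = &t->next;
        lane_release(l);
    }

    // Pop by sampling a random lane; a real scheduler retries other lanes (or consults
    // the SNZI) instead of giving up after a single empty lane.
    static struct task * rq_pop(struct ready_queue * rq, uint64_t rnd) {
        struct lane * l = &rq->data[rnd % rq->count];
        lane_acquire(l);
        struct task * t = l->head;
        if (t) { l->head = t->next; if (l->head == NULL) l->tail = &l->head; }
        lane_release(l);
        return t;
    }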
libcfa/src/concurrency/kernel_private.hfa
r7f9968ad r8b58bae 20 20 21 21 #include "alarm.hfa" 22 #include "stats.hfa" 23 24 #include "bits/random.hfa" 22 25 23 26 24 27 //----------------------------------------------------------------------------- 25 28 // Scheduler 29 30 struct __attribute__((aligned(128))) __scheduler_lock_id_t; 26 31 27 32 extern "C" { … … 31 36 } 32 37 33 void __schedule_thread( $thread * ) __attribute__((nonnull (1)));38 void __schedule_thread( struct __processor_id_t *, $thread * ) __attribute__((nonnull (2))); 34 39 35 40 //Block current thread and release/wake-up the following resources … … 73 78 74 79 // KERNEL ONLY unpark with out disabling interrupts 75 void __unpark( $thread * thrd __cfaabi_dbg_ctx_param2 );80 void __unpark( struct __processor_id_t *, $thread * thrd __cfaabi_dbg_ctx_param2 ); 76 81 77 82 //----------------------------------------------------------------------------- … … 84 89 //----------------------------------------------------------------------------- 85 90 // Utils 86 #define KERNEL_STORAGE(T,X) static char storage_##X[sizeof(T)] 87 88 static inline uint32_t __tls_rand() { 89 kernelTLS.rand_seed ^= kernelTLS.rand_seed << 6; 90 kernelTLS.rand_seed ^= kernelTLS.rand_seed >> 21; 91 kernelTLS.rand_seed ^= kernelTLS.rand_seed << 7; 92 return kernelTLS.rand_seed; 91 #define KERNEL_STORAGE(T,X) __attribute((aligned(__alignof__(T)))) static char storage_##X[sizeof(T)] 92 93 static inline uint64_t __tls_rand() { 94 // kernelTLS.rand_seed ^= kernelTLS.rand_seed << 6; 95 // kernelTLS.rand_seed ^= kernelTLS.rand_seed >> 21; 96 // kernelTLS.rand_seed ^= kernelTLS.rand_seed << 7; 97 // return kernelTLS.rand_seed; 98 return __lehmer64( kernelTLS.rand_seed ); 93 99 } 94 100 … … 100 106 void unregister( struct cluster * cltr, struct $thread & thrd ); 101 107 102 void doregister( struct cluster * cltr, struct processor * proc ); 103 void unregister( struct cluster * cltr, struct processor * proc ); 108 //======================================================================= 109 // Cluster lock API 110 //======================================================================= 111 // Cells use by the reader writer lock 112 // while not generic it only relies on a opaque pointer 113 struct __attribute__((aligned(128))) __scheduler_lock_id_t { 114 // Spin lock used as the underlying lock 115 volatile bool lock; 116 117 // Handle pointing to the proc owning this cell 118 // Used for allocating cells and debugging 119 __processor_id_t * volatile handle; 120 121 #ifdef __CFA_WITH_VERIFY__ 122 // Debug, check if this is owned for reading 123 bool owned; 124 #endif 125 }; 126 127 static_assert( sizeof(struct __scheduler_lock_id_t) <= __alignof(struct __scheduler_lock_id_t)); 128 129 // Lock-Free registering/unregistering of threads 130 // Register a processor to a given cluster and get its unique id in return 131 unsigned doregister( struct __processor_id_t * proc ); 132 133 // Unregister a processor from a given cluster using its id, getting back the original pointer 134 void unregister( struct __processor_id_t * proc ); 135 136 //======================================================================= 137 // Reader-writer lock implementation 138 // Concurrent with doregister/unregister, 139 // i.e., threads can be added at any point during or between the entry/exit 140 141 //----------------------------------------------------------------------- 142 // simple spinlock underlying the RWLock 143 // Blocking acquire 144 static inline void __atomic_acquire(volatile bool * ll) { 145 while( 
__builtin_expect(__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST), false) ) { 146 while(__atomic_load_n(ll, (int)__ATOMIC_RELAXED)) 147 asm volatile("pause"); 148 } 149 /* paranoid */ verify(*ll); 150 } 151 152 // Non-Blocking acquire 153 static inline bool __atomic_try_acquire(volatile bool * ll) { 154 return !__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST); 155 } 156 157 // Release 158 static inline void __atomic_unlock(volatile bool * ll) { 159 /* paranoid */ verify(*ll); 160 __atomic_store_n(ll, (bool)false, __ATOMIC_RELEASE); 161 } 162 163 //----------------------------------------------------------------------- 164 // Reader-Writer lock protecting the ready-queues 165 // while this lock is mostly generic some aspects 166 // have been hard-coded to for the ready-queue for 167 // simplicity and performance 168 struct __scheduler_RWLock_t { 169 // total cachelines allocated 170 unsigned int max; 171 172 // cachelines currently in use 173 volatile unsigned int alloc; 174 175 // cachelines ready to itereate over 176 // (!= to alloc when thread is in second half of doregister) 177 volatile unsigned int ready; 178 179 // writer lock 180 volatile bool lock; 181 182 // data pointer 183 __scheduler_lock_id_t * data; 184 }; 185 186 void ?{}(__scheduler_RWLock_t & this); 187 void ^?{}(__scheduler_RWLock_t & this); 188 189 extern __scheduler_RWLock_t * __scheduler_lock; 190 191 //----------------------------------------------------------------------- 192 // Reader side : acquire when using the ready queue to schedule but not 193 // creating/destroying queues 194 static inline void ready_schedule_lock( struct __processor_id_t * proc) with(*__scheduler_lock) { 195 unsigned iproc = proc->id; 196 /*paranoid*/ verify(data[iproc].handle == proc); 197 /*paranoid*/ verify(iproc < ready); 198 199 // Step 1 : make sure no writer are in the middle of the critical section 200 while(__atomic_load_n(&lock, (int)__ATOMIC_RELAXED)) 201 asm volatile("pause"); 202 203 // Fence needed because we don't want to start trying to acquire the lock 204 // before we read a false. 205 // Not needed on x86 206 // std::atomic_thread_fence(std::memory_order_seq_cst); 207 208 // Step 2 : acquire our local lock 209 __atomic_acquire( &data[iproc].lock ); 210 /*paranoid*/ verify(data[iproc].lock); 211 212 #ifdef __CFA_WITH_VERIFY__ 213 // Debug, check if this is owned for reading 214 data[iproc].owned = true; 215 #endif 216 } 217 218 static inline void ready_schedule_unlock( struct __processor_id_t * proc) with(*__scheduler_lock) { 219 unsigned iproc = proc->id; 220 /*paranoid*/ verify(data[iproc].handle == proc); 221 /*paranoid*/ verify(iproc < ready); 222 /*paranoid*/ verify(data[iproc].lock); 223 /*paranoid*/ verify(data[iproc].owned); 224 #ifdef __CFA_WITH_VERIFY__ 225 // Debug, check if this is owned for reading 226 data[iproc].owned = false; 227 #endif 228 __atomic_unlock(&data[iproc].lock); 229 } 230 231 #ifdef __CFA_WITH_VERIFY__ 232 static inline bool ready_schedule_islocked( struct __processor_id_t * proc) { 233 return __scheduler_lock->data[proc->id].owned; 234 } 235 236 static inline bool ready_mutate_islocked() { 237 return __scheduler_lock->lock; 238 } 239 #endif 240 241 //----------------------------------------------------------------------- 242 // Writer side : acquire when changing the ready queue, e.g. adding more 243 // queues or removing them. 
244 uint_fast32_t ready_mutate_lock( void ); 245 246 void ready_mutate_unlock( uint_fast32_t /* value returned by lock */ ); 247 248 //======================================================================= 249 // Ready-Queue API 250 //----------------------------------------------------------------------- 251 // pop thread from the ready queue of a cluster 252 // returns 0p if empty 253 __attribute__((hot)) bool query(struct cluster * cltr); 254 255 //----------------------------------------------------------------------- 256 // push thread onto a ready queue for a cluster 257 // returns true if the list was previously empty, false otherwise 258 __attribute__((hot)) bool push(struct cluster * cltr, struct $thread * thrd); 259 260 //----------------------------------------------------------------------- 261 // pop thread from the ready queue of a cluster 262 // returns 0p if empty 263 __attribute__((hot)) struct $thread * pop(struct cluster * cltr); 264 265 //----------------------------------------------------------------------- 266 // remove thread from the ready queue of a cluster 267 // returns bool if it wasn't found 268 bool remove_head(struct cluster * cltr, struct $thread * thrd); 269 270 //----------------------------------------------------------------------- 271 // Increase the width of the ready queue (number of lanes) by 4 272 void ready_queue_grow (struct cluster * cltr); 273 274 //----------------------------------------------------------------------- 275 // Decrease the width of the ready queue (number of lanes) by 4 276 void ready_queue_shrink(struct cluster * cltr); 277 278 //----------------------------------------------------------------------- 279 // Statics call at the end of each thread to register statistics 280 #if !defined(__CFA_NO_STATISTICS__) 281 static inline struct __stats_t * __tls_stats() { 282 /* paranoid */ verify( ! kernelTLS.preemption_state.enabled ); 283 /* paranoid */ verify( kernelTLS.this_stats ); 284 return kernelTLS.this_stats; 285 } 286 #endif 104 287 105 288 // Local Variables: // -
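The __scheduler_RWLock_t introduced above gives every registered __processor_id_t its own 128-byte reader cell: ready_schedule_lock spins only on that private cell (after checking the global writer flag), while ready_mutate_lock, the rare grow/shrink path, raises the writer flag and then sweeps every cell. Below is a condensed plain-C sketch of that pattern, with a fixed-size table and illustrative names instead of the dynamic allocation and verification code of the real lock.

    #include <stdatomic.h>
    #include <stdbool.h>

    #define MAX_IDS 128

    struct reader_cell { atomic_bool lock; } __attribute__((aligned(128)));

    struct sched_rwlock {
        atomic_bool        writer;                 // set while a grow/shrink is in progress
        unsigned           ready;                  // number of registered reader cells
        struct reader_cell data[MAX_IDS];
    };

    // Reader side: taken around every push/pop so the queue cannot be resized underneath it.
    static void read_lock(struct sched_rwlock * l, unsigned id) {
        while (atomic_load_explicit(&l->writer, memory_order_relaxed)) { /* wait out writers  */ }
        while (atomic_exchange(&l->data[id].lock, true))                { /* spin on own cell  */ }
    }

    static void read_unlock(struct sched_rwlock * l, unsigned id) {
        atomic_store_explicit(&l->data[id].lock, false, memory_order_release);
    }

    // Writer side: exclude other writers, then drain every reader one cell at a time.
    static void write_lock(struct sched_rwlock * l) {
        while (atomic_exchange(&l->writer, true)) { /* spin */ }
        for (unsigned i = 0; i < l->ready; i += 1)
            while (atomic_exchange(&l->data[i].lock, true)) { /* spin */ }
    }

    static void write_unlock(struct sched_rwlock * l) {
        for (unsigned i = 0; i < l->ready; i += 1)
            atomic_store_explicit(&l->data[i].lock, false, memory_order_release);
        atomic_store_explicit(&l->writer, false, memory_order_release);
    }

A reader that slipped past the writer flag still holds its own cell, so the writer blocks on that cell until the reader finishes; readers that arrive later spin on the flag itself, so a resize is never concurrent with a scheduling operation.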
libcfa/src/concurrency/monitor.cfa
r7f9968ad r8b58bae 114 114 115 115 // Some one else has the monitor, wait in line for it 116 /* paranoid */ verify( thrd-> next == 0p );116 /* paranoid */ verify( thrd->link.next == 0p ); 117 117 append( this->entry_queue, thrd ); 118 /* paranoid */ verify( thrd-> next == 1p );118 /* paranoid */ verify( thrd->link.next == 1p ); 119 119 120 120 unlock( this->lock ); … … 199 199 200 200 // Some one else has the monitor, wait in line for it 201 /* paranoid */ verify( thrd-> next == 0p );201 /* paranoid */ verify( thrd->link.next == 0p ); 202 202 append( this->entry_queue, thrd ); 203 /* paranoid */ verify( thrd-> next == 1p );203 /* paranoid */ verify( thrd->link.next == 1p ); 204 204 unlock( this->lock ); 205 205 … … 761 761 $thread * new_owner = pop_head( this->entry_queue ); 762 762 /* paranoid */ verifyf( !this->owner || kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this ); 763 /* paranoid */ verify( !new_owner || new_owner-> next == 0p );763 /* paranoid */ verify( !new_owner || new_owner->link.next == 0p ); 764 764 __set_owner( this, new_owner ); 765 765 … … 883 883 } 884 884 885 __cfaabi_dbg_print_safe( "Kernel : Runing %i (%p)\n", ready2run, ready2run ? node->waiting_thread :0p );885 __cfaabi_dbg_print_safe( "Kernel : Runing %i (%p)\n", ready2run, ready2run ? (thread*)node->waiting_thread : (thread*)0p ); 886 886 return ready2run ? node->waiting_thread : 0p; 887 887 } … … 907 907 // For each thread in the entry-queue 908 908 for( $thread ** thrd_it = &entry_queue.head; 909 *thrd_it!= 1p;910 thrd_it = &(*thrd_it)-> next909 (*thrd_it) != 1p; 910 thrd_it = &(*thrd_it)->link.next 911 911 ) { 912 912 // For each acceptable check if it matches -
libcfa/src/concurrency/preemption.cfa
r7f9968ad r8b58bae 37 37 // FwdDeclarations : timeout handlers 38 38 static void preempt( processor * this ); 39 static void timeout( $thread * this );39 static void timeout( struct __processor_id_t * id, $thread * this ); 40 40 41 41 // FwdDeclarations : Signal handlers … … 88 88 89 89 // Tick one frame of the Discrete Event Simulation for alarms 90 static void tick_preemption( ) {90 static void tick_preemption( struct __processor_id_t * id ) { 91 91 alarm_node_t * node = 0p; // Used in the while loop but cannot be declared in the while condition 92 92 alarm_list_t * alarms = &event_kernel->alarms; // Local copy for ease of reading … … 106 106 } 107 107 else { 108 timeout( node->thrd );108 timeout( id, node->thrd ); 109 109 } 110 110 … … 119 119 // If there are still alarms pending, reset the timer 120 120 if( & (*alarms)`first ) { 121 __cfa abi_dbg_print_buffer_decl(" KERNEL: @%ju(%ju) resetting alarm to %ju.\n", currtime.tv, __kernel_get_time().tv, (alarms->head->alarm - currtime).tv);121 __cfadbg_print_buffer_decl(preemption, " KERNEL: @%ju(%ju) resetting alarm to %ju.\n", currtime.tv, __kernel_get_time().tv, (alarms->head->alarm - currtime).tv); 122 122 Duration delta = (*alarms)`first.alarm - currtime; 123 123 Duration capped = max(delta, 50`us); … … 266 266 267 267 // reserved for future use 268 static void timeout( $thread * this ) { 269 __unpark( this __cfaabi_dbg_ctx2 ); 268 static void timeout( struct __processor_id_t * id, $thread * this ) { 269 #if !defined( __CFA_NO_STATISTICS__ ) 270 kernelTLS.this_stats = this->curr_cluster->stats; 271 #endif 272 __unpark( id, this __cfaabi_dbg_ctx2 ); 270 273 } 271 274 … … 403 406 // Waits on SIGALRM and send SIGUSR1 to whom ever needs it 404 407 static void * alarm_loop( __attribute__((unused)) void * args ) { 408 __processor_id_t id; 409 id.id = doregister(&id); 410 405 411 // Block sigalrms to control when they arrive 406 412 sigset_t mask; … … 447 453 // __cfaabi_dbg_print_safe( "Kernel : Preemption thread tick\n" ); 448 454 lock( event_kernel->lock __cfaabi_dbg_ctx2 ); 449 tick_preemption( );455 tick_preemption( &id ); 450 456 unlock( event_kernel->lock ); 451 457 break; … … 460 466 EXIT: 461 467 __cfaabi_dbg_print_safe( "Kernel : Preemption thread stopping\n" ); 468 unregister(&id); 462 469 return 0p; 463 470 } -
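The preemption.cfa changes thread a __processor_id_t through tick_preemption and timeout because __unpark now goes through the per-id reader lock: the alarm loop therefore registers an id of its own (doregister(&id)) before it can wake any thread, and unregisters it on exit. Below is a rough sketch of such a registration table in plain C; the real doregister/unregister are lock-free, whereas this sketch simply serializes registration with a mutex since it only happens when a processor (or the alarm thread) starts or stops. All names here are assumptions.

    #include <pthread.h>
    #include <stddef.h>
    #include <stdlib.h>

    enum { MAX_IDS = 128 };

    static pthread_mutex_t reg_lock = PTHREAD_MUTEX_INITIALIZER;
    static void *          handles[MAX_IDS];     // owner of each id slot, NULL when free

    // Claim the first free slot; the returned index is what read_lock()/unpark() use.
    static unsigned id_register(void * owner) {
        pthread_mutex_lock(&reg_lock);
        unsigned slot = 0;
        while (slot < MAX_IDS && handles[slot] != NULL) slot += 1;
        if (slot == MAX_IDS) abort();            // table full; the real lock grows instead
        handles[slot] = owner;
        pthread_mutex_unlock(&reg_lock);
        return slot;
    }

    static void id_unregister(unsigned slot) {
        pthread_mutex_lock(&reg_lock);
        handles[slot] = NULL;                    // slot becomes reusable
        pthread_mutex_unlock(&reg_lock);
    }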
libcfa/src/concurrency/thread.cfa
r7f9968ad r8b58bae 28 28 context{ 0p, 0p }; 29 29 self_cor{ name, storage, storageSize }; 30 ticket = 1; 30 31 state = Start; 31 32 preempted = __NO_PREEMPTION; … … 35 36 self_mon_p = &self_mon; 36 37 curr_cluster = &cl; 37 next = 0p; 38 link.next = 0p; 39 link.prev = 0p; 40 link.preferred = -1; 38 41 39 42 node.next = 0p; … … 61 64 verify( this_thrd->context.SP ); 62 65 63 __schedule_thread( this_thrd);66 __schedule_thread( (__processor_id_t *)kernelTLS.this_processor, this_thrd); 64 67 enable_interrupts( __cfaabi_dbg_ctx ); 65 68 } -
libcfa/src/containers/stackLockFree.hfa
r7f9968ad r8b58bae 1 // 1 // 2 2 // Cforall Version 1.0.0 Copyright (C) 2017 University of Waterloo 3 3 // The contents of this file are covered under the licence agreement in the 4 4 // file "LICENCE" distributed with Cforall. 5 5 // 6 // stackLockFree.hfa -- 7 // 6 // stackLockFree.hfa -- 7 // 8 8 // Author : Peter A. Buhr 9 9 // Created On : Wed May 13 20:58:58 2020 … … 11 11 // Last Modified On : Sun Jun 14 13:25:09 2020 12 12 // Update Count : 64 13 // 13 // 14 14 15 15 #pragma once … … 31 31 }; // Link 32 32 33 forall( dtype T | sized(T) | { Link(T) * getNext( T * ); } ) {34 33 forall( otype T | sized(T) | { Link(T) * ?`next( T * ); } ) { 34 struct StackLF { 35 35 Link(T) stack; 36 36 }; // StackLF … … 42 42 43 43 void push( StackLF(T) & this, T & n ) with(this) { 44 * getNext( &n ) = stack;// atomic assignment unnecessary, or use CAA44 *( &n )`next = stack; // atomic assignment unnecessary, or use CAA 45 45 for () { // busy wait 46 if ( __atomic_compare_exchange_n( &stack.atom, & getNext( &n )->atom, (Link(T))@{ {&n, getNext( &n )->count + 1} }.atom, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST ) ) break; // attempt to update top node46 if ( __atomic_compare_exchange_n( &stack.atom, &( &n )`next->atom, (Link(T))@{ {&n, ( &n )`next->count + 1} }.atom, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST ) ) break; // attempt to update top node 47 47 } // for 48 48 } // push … … 52 52 for () { // busy wait 53 53 if ( t.top == 0p ) return 0p; // empty stack ? 54 if ( __atomic_compare_exchange_n( &stack.atom, &t.atom, (Link(T))@{ { getNext( t.top )->top, t.count} }.atom, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST ) ) return t.top; // attempt to update top node54 if ( __atomic_compare_exchange_n( &stack.atom, &t.atom, (Link(T))@{ {( t.top )`next->top, t.count} }.atom, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST ) ) return t.top; // attempt to update top node 55 55 } // for 56 56 } // pop 57 58 bool unsafe_remove( StackLF(T) & this, T * node ) with(this) { 59 Link(T) * link = &stack; 60 for() { 61 T * next = link->top; 62 if( next == node ) { 63 link->top = ( node )`next->top; 64 return true; 65 } 66 if( next == 0p ) return false; 67 link = (next)`next; 68 } 69 } 57 70 } // distribution 58 71 } // distribution -
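The stackLockFree.hfa changes switch the hook from a getNext function to the ?`next postfix operator and add unsafe_remove, which the kernel uses to pull a specific processor off the idle stack. The push/pop themselves are a classic Treiber stack whose top is a {pointer, counter} pair updated with one double-width compare-and-swap, so a node that was popped and pushed again cannot be confused with the old top (the ABA problem). Below is a plain-C rendering of just push/pop under GCC on x86-64 (compile with -mcx16); the generic trait machinery and unsafe_remove are left out, and the names are illustrative.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    struct node;
    union link {
        struct { struct node * top; uintptr_t count; };   // counter bumps on every push
        unsigned __int128 atom;                            // double-width unit handed to the CAS
    };
    struct node  { union link next; };
    struct stack { union link head; };

    static void push(struct stack * s, struct node * n) {
        n->next = s->head;                                 // snapshot {top, count}; a torn read only makes the CAS fail
        for (;;) {
            union link wanted = { .top = n, .count = n->next.count + 1 };
            if (__atomic_compare_exchange_n(&s->head.atom, &n->next.atom, wanted.atom,
                                            false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
                return;                                    // on failure n->next now holds the current head; retry
        }
    }

    static struct node * pop(struct stack * s) {
        union link t = s->head;                            // snapshot, revalidated by the CAS
        for (;;) {
            if (t.top == NULL) return NULL;                // empty stack
            union link wanted = { .top = t.top->next.top, .count = t.count };
            if (__atomic_compare_exchange_n(&s->head.atom, &t.atom, wanted.atom,
                                            false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
                return t.top;                              // on failure t was reloaded; loop again
        }
    }

unsafe_remove, as its name says, is not safe against concurrent operations on the same node; in this changeset it is only used on a processor's shutdown path.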
libcfa/src/heap.cfa
r7f9968ad r8b58bae 209 209 #if BUCKETLOCK == LOCKFREE 210 210 static inline { 211 Link(HeapManager.Storage) * getNext( HeapManager.Storage * this ) { return &this->header.kind.real.next; }211 Link(HeapManager.Storage) * ?`next( HeapManager.Storage * this ) { return &this->header.kind.real.next; } 212 212 void ?{}( HeapManager.FreeHeader & ) {} 213 213 void ^?{}( HeapManager.FreeHeader & ) {} … … 667 667 #else 668 668 for ( HeapManager.Storage * p = top( freeLists[i].freeList ); p != 0p; /* p = getNext( p )->top */) { 669 typeof(p) temp = getNext( p )->top; // FIX ME: direct assignent fails, initialization works669 typeof(p) temp = ( p )`next->top; // FIX ME: direct assignent fails, initialization works 670 670 p = temp; 671 671 #endif // BUCKETLOCK … … 903 903 return oaddr; 904 904 } // if 905 905 906 906 // change size, DO NOT preserve STICKY PROPERTIES. 907 907 free( oaddr ); -
libcfa/src/stdhdr/assert.h
r7f9968ad r8b58bae 33 33 #define verify(x) assert(x) 34 34 #define verifyf(x, ...) assertf(x, __VA_ARGS__) 35 #define verifyfail(...) 35 36 #define __CFA_WITH_VERIFY__ 36 37 #else 37 38 #define verify(x) 38 39 #define verifyf(x, ...) 40 #define verifyfail(...) 39 41 #endif 40 42 -
tests/concurrent/examples/datingService.cfa
r7f9968ad r8b58bae 35 35 signal_block( Boys[ccode] ); // restart boy to set phone number 36 36 } // if 37 // sout | "Girl:" | PhoneNo | "is dating Boy at" | BoyPhoneNo | "with ccode" | ccode;37 // sout | "Girl:" | PhoneNo | "is dating Boy at" | BoyPhoneNo | "with ccode" | ccode; 38 38 return BoyPhoneNo; 39 39 } // DatingService girl … … 47 47 signal_block( Girls[ccode] ); // restart girl to set phone number 48 48 } // if 49 // sout | " Boy:" | PhoneNo | "is dating Girl" | GirlPhoneNo | "with ccode" | ccode;49 // sout | " Boy:" | PhoneNo | "is dating Girl" | GirlPhoneNo | "with ccode" | ccode; 50 50 return GirlPhoneNo; 51 51 } // DatingService boy -
tests/concurrent/signal/disjoint.cfa
r7f9968ad r8b58bae 21 21 #endif 22 22 23 // This tests checks what happens when someone barges in the midle of the release 24 // of a bulk of monitors. 25 23 26 enum state_t { WAIT, SIGNAL, BARGE }; 24 27 25 28 monitor global_t {}; 26 global_t mut;27 29 28 30 monitor global_data_t; … … 33 35 int counter; 34 36 state_t state; 35 } data; 37 }; 38 39 // Use a global struct because the order needs to match with Signaller thread 40 struct { 41 global_t mut; 42 global_data_t data; 43 } globals; 36 44 37 45 condition cond; … … 40 48 41 49 void ?{}( global_data_t & this ) { 42 this.counter = =0;50 this.counter = 0; 43 51 this.state = BARGE; 44 52 } … … 53 61 54 62 thread Barger {}; 63 void ?{}( Barger & this ) { 64 ((thread&)this){ "Barger Thread" }; 65 } 55 66 56 67 void main( Barger & this ) { 57 68 while( !all_done ) { 58 barge( data );69 barge( globals.data ); 59 70 yield(); 60 71 } … … 78 89 79 90 thread Waiter {}; 91 void ?{}( Waiter & this ) { 92 ((thread&)this){ "Waiter Thread" }; 93 } 80 94 81 95 void main( Waiter & this ) { 82 while( wait( mut,data ) ) { KICK_WATCHDOG; yield(); }96 while( wait( globals.mut, globals.data ) ) { KICK_WATCHDOG; yield(); } 83 97 } 84 98 … … 92 106 93 107 void logic( global_t & mutex a ) { 94 signal( cond, a, data );108 signal( cond, a, globals.data ); 95 109 96 110 yield( random( 10 ) ); 97 111 98 112 //This is technically a mutual exclusion violation but the mutex monitor protects us 99 bool running = TEST( data.counter < N) &&data.counter > 0;100 if( data.state != SIGNAL && running ) {101 sout | "ERROR Eager signal" | data.state;113 bool running = TEST(globals.data.counter < N) && globals.data.counter > 0; 114 if( globals.data.state != SIGNAL && running ) { 115 sout | "ERROR Eager signal" | globals.data.state; 102 116 } 103 117 } 104 118 105 119 thread Signaller {}; 120 void ?{}( Signaller & this ) { 121 ((thread&)this){ "Signaller Thread" }; 122 } 106 123 107 124 void main( Signaller & this ) { 108 125 while( !all_done ) { 109 logic( mut );126 logic( globals.mut ); 110 127 yield(); 111 128 } -
tests/concurrent/waitfor/when.cfa
r7f9968ad r8b58bae 57 57 58 58 void arbiter( global_t & mutex this ) { 59 // There is a race at start where callers can get in before the arbiter. 60 // It doesn't really matter here so just restart the loop correctly and move on 61 this.last_call = 6; 62 59 63 for( int i = 0; i < N; i++ ) { 60 64 when( this.last_call == 6 ) waitfor( call1 : this ) { if( this.last_call != 1) { serr | "Expected last_call to be 1 got" | this.last_call; } } -
tools/gdb/utils-gdb.py
r7f9968ad r8b58bae 59 59 thread_ptr = gdb.lookup_type('struct $thread').pointer(), 60 60 int_ptr = gdb.lookup_type('int').pointer(), 61 thread_state = gdb.lookup_type('enum coroutine_state'))61 thread_state = gdb.lookup_type('enum __Coroutine_State')) 62 62 63 63 def get_addr(addr):