Diff [ee06db5c9b28732fc10c936eac06983a9d5378db:97392b6998d47706885e39064f5f23e53dbe7496] for / – Cforall

doc/theses/thierry_delisle_PhD/code/relaxed_list.cpp

-              ree06db5c
+              r97392b69
         size_t valmax = 0;
         size_t valmin = 100000000ul;
+        struct {
+                size_t val = 0;
+                size_t cnt = 0;
+        } comp;
+        struct {
+                size_t val = 0;
+                size_t cnt = 0;
+        } subm;
 };
 …
         std::atomic_size_t valmax = { 0 };
         std::atomic_size_t valmin = { 100000000ul };
+        struct {
+                std::atomic_size_t val = { 0 };
+                std::atomic_size_t cnt = { 0 };
+        } comp;
+        struct {
+                std::atomic_size_t val = { 0 };
+                std::atomic_size_t cnt = { 0 };
+        } subm;
 };
 …
         global.crc_out += local.crc_out;
+        global.comp.val += local.comp.val;
+        global.comp.cnt += local.comp.cnt;
+        global.subm.val += local.subm.val;
+        global.subm.cnt += local.subm.cnt;
         atomic_max(global.valmax, local.valmax);
         atomic_min(global.valmin, local.valmin);
 …
         auto before = Clock::now();
         barrier.wait(0);
+        bool is_tty = isatty(STDOUT_FILENO);
         while(true) {
 …
                         break;
+                }
+                std::cout << "\r" << std::setprecision(4) << durr.count();
+                std::cout.flush();
+                if(is_tty) {
+                        std::cout << "\r" << std::setprecision(4) << durr.count();
+                        std::cout.flush();
+                }
+        }
 …
         auto dur_nano = duration_cast<std::nano>(1.0);
+        if(global.valmax != 0) {
+                std::cout << "Max runs      : " << global.valmax << "\n";
+                std::cout << "Min runs      : " << global.valmin << "\n";
+        }
+        if(global.comp.cnt != 0) {
+                std::cout << "Submit count  : " << global.subm.cnt << "\n";
+                std::cout << "Submit average: " << ((double(global.subm.val)) / global.subm.cnt) << "\n";
+                std::cout << "Complete count: " << global.comp.cnt << "\n";
+                std::cout << "Complete avg  : " << ((double(global.comp.val)) / global.comp.cnt) << "\n";
+        }
         std::cout << "Duration      : " << duration << "s\n";
         std::cout << "ns/Op         : " << ( dur_nano / ops_thread )<< "\n";
 …
         std::cout << "Ops/sec       : " << ops_sec << "\n";
         std::cout << "Total ops     : " << ops << "(" << global.in << "i, " << global.out << "o, " << global.empty << "e)\n";
-        if(global.valmax != 0) {
-                std::cout << "Max runs      : " << global.valmax << "\n";
-                std::cout << "Min runs      : " << global.valmin << "\n";
+        }
         #ifndef NO_STATS
                 relaxed_list<Node>::stats_print(std::cout);
 …
                 enable_stats = false;
+        }
+        print_stats(duration, nthread, global);
+}
+// ================================================================================================
+struct __attribute__((aligned(64))) Slot { // One hand-off cell of the producer benchmark; every Slot is aligned to 64 bytes (presumably one cache line per cell, to avoid false sharing - confirm for the target CPU).
+        Node * volatile node;               // Parked node, or nullptr when the cell is free; runProducer_body reads it directly and updates it with __atomic exchange / compare-exchange.
+};
+__attribute__((noinline)) void runProducer_body(
+        std::atomic<bool>& done,
+        Random & rand,
+        Slot * slots,
+        int nslots,
+        local_stat_t & local,
+        relaxed_list<Node> & list
+) {
+        while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) {
+                Node * node = list.pop();
+                if(!node) {
+                        local.empty ++;
+                        continue;
+                }
+                local.crc_out += node->value;
+                local.out++;
+                if(node->id == 0) {
+                        unsigned cnt = 0;
+                        for(int i = 0; i < nslots; i++) {
+                                Node * found = __atomic_exchange_n( &slots[i].node, nullptr, __ATOMIC_SEQ_CST );
+                                if( found ) {
+                                        local.crc_in += found->value;
+                                        local.in++;
+                                        cnt++;
+                                        list.push( found );
+                                }
+                        }
+                        local.crc_in += node->value;
+                        local.in++;
+                        list.push( node );
+                        local.comp.cnt++;
+                        local.comp.val += cnt;
+                }
+                else {
+                        unsigned len = 0;
+                        while(true) {
+                                auto off = rand.next();
+                                for(int i = 0; i < nslots; i++) {
+                                        Node * expected = nullptr;
+                                        int idx = (i + off) % nslots;
+                                        Slot & slot = slots[ idx ];
+                                        if(
+                                                slot.node == nullptr &&
+                                                __atomic_compare_exchange_n( &slot.node, &expected, node, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST )
+                                        ) {
+                                                local.subm.cnt++;
+                                                local.subm.val += len;
+                                                goto LOOP;
+                                        }
+                                        assert( expected != node );
+                                        len++;
+                                }
+                        }
+                }
+                LOOP:;
+        }
+}
+void runProducer(unsigned nthread, unsigned nqueues, double duration, unsigned nnodes) {
+        std::cout << "Producer Benchmark" << std::endl;
+        // Barrier for synchronization
+        barrier_t barrier(nthread + 1);
+        // Data to check everything is OK
+        global_stat_t global;
+        // Flag to signal termination
+        std::atomic_bool done  = { false };
+        std::cout << "Initializing ";
+        int nslots = nnodes * 4;
+        Slot * slots = new Slot[nslots];
+        std::cout << nnodes << " nodes (" << nslots << " slots)" << std::endl;
+        // List being tested
+        relaxed_list<Node> list = { nthread * nqueues };
+        {
+                Random rand(rdtscl());
+                for(unsigned i = 0; i < nnodes; i++) {
+                        Node * node = new Node(rand.next() % 100);
+                        node->id = i;
+                        global.crc_in += node->value;
+                        list.push(node);
+                }
+                for(int i = 0; i < nslots; i++) {
+                        slots[i].node = nullptr;
+                }
+        }
+        {
+                enable_stats = true;
+                std::thread * threads[nthread];
+                unsigned i = 1;
+                for(auto & t : threads) {
+                        t = new std::thread([&done, &list, &barrier, &global, slots, nslots](unsigned tid) {
+                                Random rand(tid + rdtscl());
+                                local_stat_t local;
+                                barrier.wait(tid);
+                                // EXPERIMENT START
+                                runProducer_body(done, rand, slots, nslots, local, list);
+                                // EXPERIMENT END
+                                barrier.wait(tid);
+                                tally_stats(global, local);
+                        }, i++);
+                }
+                waitfor(duration, barrier, done);
+                for(auto t : threads) {
+                        t->join();
+                        delete t;
+                }
+                enable_stats = false;
+        }
+        {
+                while(Node * node = list.pop()) {
+                        global.crc_out += node->value;
+                        delete node;
+                }
+                for(int i = 0; i < nslots; i++) {
+                        delete slots[i].node;
+                }
+                delete [] slots;
+        }
 …
         print_stats(duration, nthread, global);
         save_fairness(data_out.get(), 100, nthread, width, length, output);
+        // save_fairness(data_out.get(), 100, nthread, width, length, output);
+}
 …
                 Churn,
                 PingPong,
+                Producer,
                 Fairness,
                 NONE
 …
                                 case PingPong:
                                         nnodes = 1;
-                                        nslots = 1;
                                         switch(argc - optind) {
                                         case 0: break;
 …
                                                 break;
                                         default:
+                                                std::cerr << "'PingPong' benchmark doesn't accept more than 2 extra arguments" << std::endl;
+                                                std::cerr << "'PingPong' benchmark doesn't accept more than 1 extra arguments" << std::endl;
+                                                goto usage;
+                                        }
+                                        break;
+                                case Producer:
+                                        nnodes = 32;
+                                        switch(argc - optind) {
+                                        case 0: break;
+                                        case 1:
+                                                try {
+                                                        arg = optarg = argv[optind];
+                                                        nnodes = stoul(optarg, &len);
+                                                        if(len != arg.size()) { throw std::invalid_argument(""); }
+                                                } catch(std::invalid_argument &) {
+                                                        std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl;
+                                                        goto usage;
+                                                }
+                                                break;
+                                        default:
+                                                std::cerr << "'Producer' benchmark doesn't accept more than 1 extra arguments" << std::endl;
                                                 goto usage;
+                                        }
 …
                                         break;
+                                }
+                                if(iequals(arg, "producer")) {
+                                        benchmark = Producer;
+                                        break;
+                                }
                                 if(iequals(arg, "fairness")) {
                                         benchmark = Fairness;
 …
                                 std::cerr << "Usage: " << argv[0] << ": [options] -b churn [NNODES] [NSLOTS = NNODES]" << std::endl;
                                 std::cerr << "  or:  " << argv[0] << ": [options] -b pingpong [NNODES]" << std::endl;
+                                std::cerr << "  or:  " << argv[0] << ": [options] -b producer [NNODES]" << std::endl;
                                 std::cerr << std::endl;
                                 std::cerr << "  -d, --duration=DURATION  Duration of the experiment, in seconds" << std::endl;
 …
         std::cout << "Running " << nthreads << " threads (" << (nthreads * nqueues) << " queues) for " << duration << " seconds" << std::endl;
+        std::cout << "Relaxed list variant: " << relaxed_list<Node>::name() << std::endl;
         switch(benchmark) {
                 case Churn:
 …
                 case PingPong:
                         runPingPong(nthreads, nqueues, duration, nnodes);
+                        break;
+                case Producer:
+                        runProducer(nthreads, nqueues, duration, nnodes);
                         break;
                 case Fairness:
 …
+}
 // Writes the fairness samples to `output` as a bare HTML page holding one table.
 //   data     : samples in row-major order; rows * columns entries are read
 //   factor   : not used by this writer
 //   nthreads : not used by this writer
 //   columns  : number of <td> cells per table row
 //   rows     : number of <tr> rows
 //   output   : path of the file to create (an existing file is overwritten)
 // Each cell carries the classes "custom" and "custom<value>"; the <style> element is
 // emitted empty. If the file cannot be opened a message is printed to std::cerr and
 // nothing is written.
 void save_fairness(const int data[], int factor, unsigned nthreads, size_t columns, size_t rows, const std::string & output) {
         // Kept in the signature for the callers; silence unused-parameter warnings.
         (void)factor;
         (void)nthreads;

         std::ofstream os(output);
         if(!os) {
                 std::cerr << "Could not open '" << output << "' to save fairness data" << std::endl;
                 return;
         }

         os << "<html>\n";
         os << "<head>\n";
         os << "<style>\n";
         os << "</style>\n";
         os << "</head>\n";
         os << "<body>\n";
         // CSS declarations are written 'property: value'; "width=100%" is not a valid declaration.
         os << "<table style=\"width:100%\">\n";

         size_t idx = 0;
         for(size_t r = 0ul; r < rows; r++) {
                 os << "<tr>\n";
                 for(size_t c = 0ul; c < columns; c++) {
                         os << "<td class=\"custom custom" << data[idx] << "\"></td>\n";
                         idx++;
                 }
                 os << "</tr>\n";
         }

         os << "</table>\n";
         os << "</body>\n";
         os << "</html>\n";
         os << std::endl;
 }
 #include <png.h>
 #include <setjmp.h>
+// void save_fairness(const int data[], int factor, unsigned nthreads, size_t columns, size_t rows, const std::string & output) {
+//      std::ofstream os(output);
+//      os << "<html>\n";
+//      os << "<head>\n";
+//      os << "<style>\n";
+//      os << "</style>\n";
+//      os << "</head>\n";
+//      os << "<body>\n";
+//      os << "<table style=\"width=100%\">\n";
+//      size_t idx = 0;
+//      for(size_t r = 0ul; r < rows; r++) {
+//              os << "<tr>\n";
+//              for(size_t c = 0ul; c < columns; c++) {
+//                      os << "<td class=\"custom custom" << data[idx] << "\"></td>\n";
+//                      idx++;
+//              }
+//              os << "</tr>\n";
+//      }
+//      os << "</table>\n";
+//      os << "</body>\n";
+//      os << "</html>\n";
+//      os << std::endl;
+// }
+// #include <png.h>
+// #include <setjmp.h>
 /*

doc/theses/thierry_delisle_PhD/code/relaxed_list.hpp

-              ree06db5c
+              r97392b69
 #pragma once
+#define MACRO_XSTR(s) MACRO_STR(s) // Macro-expands `s` first, then stringizes the result through MACRO_STR.
+#define MACRO_STR(s) #s // Stringizes `s` exactly as written, without expanding it.
+#define VANILLA 0 // Variant ids accepted by VARIANT. name() reports this one as "VANILLA"; only the numNonEmpty counter is kept.
+#define SNZI 1 // Reported as "SNZI"; relaxed_list holds a snzi_t member.
+#define BITMASK 2 // Reported as "BITMASK"; relaxed_list keeps numNonEmpty plus the list_mask words.
+#define DISCOVER 3 // Reported as "SNZI + DISCOVERED MASK"; relaxed_list holds a snzm_t member and also maintains the per-thread tls.mask.
+#define SNZM 4 // Reported as "SNZI + MASK"; relaxed_list holds a snzm_t member.
+#ifndef VARIANT
+#define VARIANT VANILLA // Default when the build does not define VARIANT.
+#endif
 #ifndef NO_STATS
 …
 #endif
+#include <cmath>
 #include <memory>
 #include <mutex>
 …
 #include "assert.hpp"
 #include "utils.hpp"
+#include "snzi.hpp"
+#include "snzm.hpp"
 using namespace std;
-struct spinlock_t {
-        std::atomic_bool ll = { false };
-        inline void lock() {
-                while( __builtin_expect(ll.exchange(true),false) ) {
-                        while(ll.load(std::memory_order_relaxed))
-                                asm volatile("pause");
+                }
+        }
-        inline bool try_lock() {
-                return false == ll.exchange(true);
+        }
-        inline void unlock() {
-                ll.store(false, std::memory_order_release);
+        }
-        inline explicit operator bool() {
-                return ll.load(std::memory_order_relaxed);
+        }
-};
-static inline bool bts(std::atomic_size_t & target, size_t bit ) {
-        //*
-        int result = 0;
-        asm volatile(
-                "LOCK btsq %[bit], %[target]\n\t"
-                :"=@ccc" (result)
-                : [target] "m" (target), [bit] "r" (bit)
-        );
-        return result != 0;
-        /*/
-        size_t mask = 1ul << bit;
-        size_t ret = target.fetch_or(mask, std::memory_order_relaxed);
-        return (ret & mask) != 0;
-        //*/
+}
-static inline bool btr(std::atomic_size_t & target, size_t bit ) {
-        //*
-        int result = 0;
-        asm volatile(
-                "LOCK btrq %[bit], %[target]\n\t"
-                :"=@ccc" (result)
-                : [target] "m" (target), [bit] "r" (bit)
-        );
-        return result != 0;
-        /*/
-        size_t mask = 1ul << bit;
-        size_t ret = target.fetch_and(~mask, std::memory_order_relaxed);
-        return (ret & mask) != 0;
-        //*/
+}
 extern bool enable_stats;
 …
                 size_t success = 0;
                 size_t mask_attempt = 0;
+                size_t mask_reset = 0;
         } pop;
 };
 …
 public:
+        static const char * name() {
+                const char * names[] = {
+                        "VANILLA",
+                        "SNZI",
+                        "BITMASK",
+                        "SNZI + DISCOVERED MASK",
+                        "SNZI + MASK"
+                };
+                return names[VARIANT];
+        }
         relaxed_list(unsigned numLists)
                 : lists(new intrusive_queue_t[numLists])
                 , numLists(numLists)
+                #if VARIANT == SNZI
+                        , snzi( std::log2( numLists / 8 ), 2 )
+                #elif VARIANT == SNZM || VARIANT == DISCOVER
+                        , snzm( numLists )
+                #endif
+        {
                 assertf(7 * 8 * 8 >= numLists, "List currently only supports 448 sublists");
 …
                         if( !lists[i].lock.try_lock() ) continue;
+                        __attribute__((unused)) int num = numNonEmpty;
+                        #if VARIANT != SNZM && VARIANT != SNZI && VARIANT != DISCOVER
+                                __attribute__((unused)) int num = numNonEmpty;
+                        #endif
                         // Actually push it
                         if(lists[i].push(node)) {
+                                numNonEmpty++;
+                                size_t qword = i >> 6ull;
+                                size_t bit   = i & 63ull;
+                                assertf((list_mask[qword] & (1ul << bit)) == 0, "Before set %zu:%zu (%u), %zx & %zx", qword, bit, i, list_mask[qword].load(), (1ul << bit));
+                                __attribute__((unused)) bool ret = bts(list_mask[qword], bit);
+                                assert(!ret);
+                                assertf((list_mask[qword] & (1ul << bit)) != 0, "After set %zu:%zu (%u), %zx & %zx", qword, bit, i, list_mask[qword].load(), (1ul << bit));
+                        }
+                        assert(numNonEmpty <= (int)numLists);
+                                #if VARIANT == DISCOVER
+                                        size_t qword = i >> 6ull;
+                                        size_t bit   = i & 63ull;
+                                        assert(qword == 0);
+                                        bts(tls.mask, bit);
+                                        snzm.arrive(i);
+                                #elif VARIANT == SNZI
+                                        snzi.arrive(i);
+                                #elif VARIANT == SNZM
+                                        snzm.arrive(i);
+                                #elif VARIANT == BITMASK
+                                        numNonEmpty++;
+                                        size_t qword = i >> 6ull;
+                                        size_t bit   = i & 63ull;
+                                        assertf((list_mask[qword] & (1ul << bit)) == 0, "Before set %zu:%zu (%u), %zx & %zx", qword, bit, i, list_mask[qword].load(), (1ul << bit));
+                                        __attribute__((unused)) bool ret = bts(list_mask[qword], bit);
+                                        assert(!ret);
+                                        assertf((list_mask[qword] & (1ul << bit)) != 0, "After set %zu:%zu (%u), %zx & %zx", qword, bit, i, list_mask[qword].load(), (1ul << bit));
+                                #else
+                                        numNonEmpty++;
+                                #endif
+                        }
+                        #if VARIANT != SNZM && VARIANT != SNZI && VARIANT != DISCOVER
+                                assert(numNonEmpty <= (int)numLists);
+                        #endif
                         // Unlock and return
 …
                         #ifndef NO_STATS
                                 tls.pick.push.success++;
+                                tls.empty.push.value += num;
+                                tls.empty.push.count += 1;
+                                #if VARIANT != SNZM && VARIANT != SNZI && VARIANT != DISCOVER
+                                        tls.empty.push.value += num;
+                                        tls.empty.push.count += 1;
+                                #endif
                         #endif
                         return;
 …
         __attribute__((noinline, hot)) node_t * pop() {
+                #if !defined(NO_BITMASK)
+                        // for(int r = 0; r < 10 && numNonEmpty != 0; r++) {
+                        //      // Pick two lists at random
+                        //      unsigned i = tls.rng.next() % numLists;
+                        //      unsigned j = tls.rng.next() % numLists;
+                        //      if(auto node = try_pop(i, j)) return node;
+                        // }
+                #if VARIANT == DISCOVER
+                        assert(numLists <= 64);
+                        while(snzm.query()) {
+                                tls.pick.pop.mask_attempt++;
+                                unsigned i, j;
+                                {
+                                        // Pick first list totally randomly
+                                        i = tls.rng.next() % numLists;
+                                        // Pick the other according to the bitmask
+                                        unsigned r = tls.rng.next();
+                                        size_t mask = tls.mask.load(std::memory_order_relaxed);
+                                        if(mask == 0) {
+                                                tls.pick.pop.mask_reset++;
+                                                mask = (1U << numLists) - 1;
+                                                tls.mask.store(mask, std::memory_order_relaxed);
+                                        }
+                                        unsigned b = rand_bit(r, mask);
+                                        assertf(b < 64, "%zu %u", mask, b);
+                                        j = b;
+                                        assert(j < numLists);
+                                }
+                                if(auto node = try_pop(i, j)) return node;
+                        }
+                #elif VARIANT == SNZI
+                        while(snzi.query()) {
+                                // Pick two lists at random
+                                int i = tls.rng.next() % numLists;
+                                int j = tls.rng.next() % numLists;
+                                if(auto node = try_pop(i, j)) return node;
+                        }
+                #elif VARIANT == SNZM
+                        //*
+                        while(snzm.query()) {
+                                tls.pick.pop.mask_attempt++;
+                                unsigned i, j;
+                                {
+                                        // Pick two random number
+                                        unsigned ri = tls.rng.next();
+                                        unsigned rj = tls.rng.next();
+                                        // Pick two nodes from it
+                                        unsigned wdxi = ri & snzm.mask;
+                                        unsigned wdxj = rj & snzm.mask;
+                                        // Get the masks from the nodes
+                                        size_t maski = snzm.masks(wdxi);
+                                        size_t maskj = snzm.masks(wdxj);
+                                        if(maski == 0 && maskj == 0) continue;
+                                        #if defined(__BMI2__)
+                                                uint64_t idxsi = _pext_u64(snzm.indexes, maski);
+                                                uint64_t idxsj = _pext_u64(snzm.indexes, maskj);
+                                                auto pi = __builtin_popcountll(maski);
+                                                auto pj = __builtin_popcountll(maskj);
+                                                ri = pi ? ri & ((pi >> 3) - 1) : 0;
+                                                rj = pj ? rj & ((pj >> 3) - 1) : 0;
+                                                unsigned bi = (idxsi >> (ri << 3)) & 0xff;
+                                                unsigned bj = (idxsj >> (rj << 3)) & 0xff;
+                                        #else
+                                                unsigned bi = rand_bit(ri >> snzm.depth, maski);
+                                                unsigned bj = rand_bit(rj >> snzm.depth, maskj);
+                                        #endif
+                                        i = (bi << snzm.depth) | wdxi;
+                                        j = (bj << snzm.depth) | wdxj;
+                                        /* paranoid */ assertf(i < numLists, "%u %u", bj, wdxi);
+                                        /* paranoid */ assertf(j < numLists, "%u %u", bj, wdxj);
+                                }
+                                if(auto node = try_pop(i, j)) return node;
+                        }
+                        /*/
+                        while(snzm.query()) {
+                                // Pick two lists at random
+                                int i = tls.rng.next() % numLists;
+                                int j = tls.rng.next() % numLists;
+                                if(auto node = try_pop(i, j)) return node;
+                        }
+                        //*/
+                #elif VARIANT == BITMASK
                         int nnempty;
                         while(0 != (nnempty = numNonEmpty)) {
                                 tls.pick.pop.mask_attempt++;
                                 unsigned i, j;
-                                // if( numLists < 4 || (numLists / nnempty) < 4 ) {
-                                //      // Pick two lists at random
-                                //      i = tls.rng.next() % numLists;
-                                //      j = tls.rng.next() % numLists;
-                                // } else
+                                {
-                                        #ifndef NO_STATS
-                                                // tls.pick.push.mask_attempt++;
-                                        #endif
                                         // Pick two lists at random
                                         unsigned num = ((numLists - 1) >> 6) + 1;
 …
                 #endif
+                #if VARIANT == DISCOVER
+                        if(lists[i].ts() > 0) bts(tls.mask, i); else btr(tls.mask, i);
+                        if(lists[j].ts() > 0) bts(tls.mask, j); else btr(tls.mask, j);
+                #endif
                 // Pick the bet list
                 int w = i;
 …
                 if( !list.lock.try_lock() ) return nullptr;
+                __attribute__((unused)) int num = numNonEmpty;
+                #if VARIANT != SNZM && VARIANT != SNZI && VARIANT != DISCOVER
+                        __attribute__((unused)) int num = numNonEmpty;
+                #endif
                 // If list is empty, unlock and retry
 …
                 if(emptied) {
+                        numNonEmpty--;
+                        size_t qword = w >> 6ull;
+                        size_t bit   = w & 63ull;
+                        assert((list_mask[qword] & (1ul << bit)) != 0);
+                        __attribute__((unused)) bool ret = btr(list_mask[qword], bit);
+                        assert(ret);
+                        assert((list_mask[qword] & (1ul << bit)) == 0);
+                        #if VARIANT == DISCOVER
+                                size_t qword = w >> 6ull;
+                                size_t bit   = w & 63ull;
+                                assert(qword == 0);
+                                __attribute__((unused)) bool ret = btr(tls.mask, bit);
+                                snzm.depart(w);
+                        #elif VARIANT == SNZI
+                                snzi.depart(w);
+                        #elif VARIANT == SNZM
+                                snzm.depart(w);
+                        #elif VARIANT == BITMASK
+                                numNonEmpty--;
+                                size_t qword = w >> 6ull;
+                                size_t bit   = w & 63ull;
+                                assert((list_mask[qword] & (1ul << bit)) != 0);
+                                __attribute__((unused)) bool ret = btr(list_mask[qword], bit);
+                                assert(ret);
+                                assert((list_mask[qword] & (1ul << bit)) == 0);
+                        #else
+                                numNonEmpty--;
+                        #endif
+                }
                 // Unlock and return
                 list.lock.unlock();
+                assert(numNonEmpty >= 0);
+                #if VARIANT != SNZM && VARIANT != SNZI && VARIANT != DISCOVER
+                        assert(numNonEmpty >= 0);
+                #endif
                 #ifndef NO_STATS
                         tls.pick.pop.success++;
+                        tls.empty.pop.value += num;
+                        tls.empty.pop.count += 1;
+                        #if VARIANT != SNZM && VARIANT != SNZI && VARIANT != DISCOVER
+                                tls.empty.pop.value += num;
+                                tls.empty.pop.count += 1;
+                        #endif
                 #endif
                 return node;
 …
                         size_t  push = 0;
                         size_t  pop  = 0;
-                        // size_t value = 0;
-                        // size_t count = 0;
                 };
 …
                 pick_stat  pick;
                 empty_stat empty;
+                __attribute__((aligned(64))) std::atomic_size_t mask = { 0 };
         } tls;
-public:
-        std::atomic_int numNonEmpty  = { 0 };  // number of non-empty lists
-        std::atomic_size_t list_mask[7] = { {0}, {0}, {0}, {0}, {0}, {0}, {0} }; // which queues are empty
 private:
         __attribute__((aligned(64))) std::unique_ptr<intrusive_queue_t []> lists;
         const unsigned numLists;
+private:
+        #if VARIANT == SNZI
+                snzi_t snzi;
+        #elif VARIANT == SNZM || VARIANT == DISCOVER
+                snzm_t snzm;
+        #else
+                std::atomic_int numNonEmpty  = { 0 };  // number of non-empty lists
+        #endif
+        #if VARIANT == BITMASK
+                std::atomic_size_t list_mask[7] = { {0}, {0}, {0}, {0}, {0}, {0}, {0} }; // which queues are empty
+        #endif
 public:
 …
                 global_stats.pick.pop .success += tls.pick.pop.success;
                 global_stats.pick.pop .mask_attempt += tls.pick.pop.mask_attempt;
+                global_stats.pick.pop .mask_reset += tls.pick.pop.mask_reset;
                 global_stats.qstat.push.value += tls.empty.push.value;
 …
                                 std::atomic_size_t success = { 0 };
                                 std::atomic_size_t mask_attempt = { 0 };
+                                std::atomic_size_t mask_reset = { 0 };
                         } pop;
                 } pick;
 …
         void stats_print_local(std::ostream & os ) {
                 std::cout << "----- Relaxed List Stats -----" << std::endl;
+                {
                         ssize_t diff = 0;
                         size_t  num  = 0;
                         ssize_t max  = 0;
                         for(size_t i = 0; i < numLists; i++) {
                                 const auto & list = lists[i];
                                 diff+= list.s.diff;
                                 num ++;
                                 max  = std::abs(max) > std::abs(list.s.diff) ? max : list.s.diff;
                                 os << "Local Q ops   : " << (list.s.push + list.s.pop) << "(" << list.s.push << "i, " << list.s.pop << "o)\n";
+                        }
                         os << "Difference   : " << ssize_t(double(diff) / num  ) << " avg\t" << max << "max" << std::endl;
+                }
+                // {
+                //      ssize_t diff = 0;
+                //      size_t  num  = 0;
+                //      ssize_t max  = 0;
+                //      for(size_t i = 0; i < numLists; i++) {
+                //              const auto & list = lists[i];
+                //              diff+= list.s.diff;
+                //              num ++;
+                //              max  = std::abs(max) > std::abs(list.s.diff) ? max : list.s.diff;
+                //              os << "Local Q ops   : " << (list.s.push + list.s.pop) << "(" << list.s.push << "i, " << list.s.pop << "o)\n";
+                //      }
+                //      os << "Difference   : " << ssize_t(double(diff) / num  ) << " avg\t" << max << "max" << std::endl;
+                // }
                 const auto & global = global_stats;
 …
                 double pop_sur  = (100.0 * double(global.pick.pop .success) / global.pick.pop .attempt);
                 double mpop_sur = (100.0 * double(global.pick.pop .success) / global.pick.pop .mask_attempt);
+                os << "Push   Pick % : " << push_sur << "(" << global.pick.push.success << " / " << global.pick.push.attempt << ")\n";
+                os << "Pop    Pick % : " << pop_sur  << "(" << global.pick.pop .success << " / " << global.pick.pop .attempt << ")\n";
+                os << "TryPop Pick % : " << mpop_sur << "(" << global.pick.pop .success << " / " << global.pick.pop .mask_attempt << ")\n";
+                double rpop_sur = (100.0 * double(global.pick.pop .success) / global.pick.pop .mask_reset);
+                double push_len = double(global.pick.push.attempt     ) / global.pick.push.success;
+                double pop_len  = double(global.pick.pop .attempt     ) / global.pick.pop .success;
+                double mpop_len = double(global.pick.pop .mask_attempt) / global.pick.pop .success;
+                double rpop_len = double(global.pick.pop .mask_reset  ) / global.pick.pop .success;
+                os << "Push   Pick   : " << push_sur << " %, len " << push_len << " (" << global.pick.push.attempt      << " / " << global.pick.push.success << ")\n";
+                os << "Pop    Pick   : " << pop_sur  << " %, len " << pop_len  << " (" << global.pick.pop .attempt      << " / " << global.pick.pop .success << ")\n";
+                os << "TryPop Pick   : " << mpop_sur << " %, len " << mpop_len << " (" << global.pick.pop .mask_attempt << " / " << global.pick.pop .success << ")\n";
+                os << "Pop M Reset   : " << rpop_sur << " %, len " << rpop_len << " (" << global.pick.pop .mask_reset   << " / " << global.pick.pop .success << ")\n";
                 double avgQ_push = double(global.qstat.push.value) / global.qstat.push.count;

doc/theses/thierry_delisle_PhD/code/utils.hpp

-              ree06db5c
+              r97392b69
+}
+static inline unsigned rand_bit(unsigned rnum, size_t mask) __attribute__((artificial));
 static inline unsigned rand_bit(unsigned rnum, size_t mask) {
         unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
 …
 #endif
+}
// Minimal spinlock built on a single atomic flag.
//   lock()     : spins until the flag could be flipped from false to true
//   try_lock() : one attempt, returns true when the lock was obtained
//   unlock()   : clears the flag with release ordering
//   bool(lock) : relaxed snapshot of the flag (true == currently held)
struct spinlock_t {
        std::atomic_bool ll = { false };

        inline void lock() {
                // Attempt the (expensive) exchange first; on failure wait on plain
                // relaxed loads until the flag reads false, then try again.
                while( __builtin_expect(ll.exchange(true), false) ) {
                        while(ll.load(std::memory_order_relaxed))
                                spin_hint();
                }
        }

        inline bool try_lock() {
                // The previous value tells us whether we were the one to set the flag.
                return false == ll.exchange(true);
        }

        inline void unlock() {
                ll.store(false, std::memory_order_release);
        }

        // const-qualified so that the state can also be queried through a const reference.
        inline explicit operator bool() const {
                return ll.load(std::memory_order_relaxed);
        }

private:
        // Busy-wait hint for the CPU.
        // The original code emitted the x86 `pause` instruction unconditionally,
        // which does not assemble on any other architecture.
        static inline void spin_hint() {
                #if defined(__i386__) || defined(__x86_64__)
                        asm volatile("pause");
                #elif defined(__aarch64__)
                        asm volatile("yield");
                #endif
        }
};
// Atomic bit-test-and-set.
// Sets bit number `bit` of `target` and returns true when that bit was
// already set beforehand. `bit` must be smaller than the width of size_t.
static inline bool bts(std::atomic_size_t & target, size_t bit ) {
        #if defined(__x86_64__) && defined(__GCC_ASM_FLAG_OUTPUTS__)
                int result = 0;
                asm volatile(
                        "LOCK btsq %[bit], %[target]\n\t"
                        // carry flag == previous value of the bit
                        // target is read AND written by the instruction, hence "+m";
                        // listing it as an input-only operand would let the compiler
                        // assume its contents are unchanged by the statement.
                        : "=@ccc" (result), [target] "+m" (target)
                        : [bit] "r" (bit)
                );
                return result != 0;
        #else
                // Portable fallback for targets/compilers without btsq + flag outputs.
                const size_t mask = size_t(1) << bit;
                const size_t ret = target.fetch_or(mask, std::memory_order_relaxed);
                return (ret & mask) != 0;
        #endif
}
// Atomic bit-test-and-reset.
// Clears bit number `bit` of `target` and returns true when that bit was
// set beforehand. `bit` must be smaller than the width of size_t.
static inline bool btr(std::atomic_size_t & target, size_t bit ) {
        #if defined(__x86_64__) && defined(__GCC_ASM_FLAG_OUTPUTS__)
                int result = 0;
                asm volatile(
                        "LOCK btrq %[bit], %[target]\n\t"
                        // carry flag == previous value of the bit
                        // target is read AND written by the instruction, hence "+m";
                        // listing it as an input-only operand would let the compiler
                        // assume its contents are unchanged by the statement.
                        : "=@ccc" (result), [target] "+m" (target)
                        : [bit] "r" (bit)
                );
                return result != 0;
        #else
                // Portable fallback for targets/compilers without btrq + flag outputs.
                const size_t mask = size_t(1) << bit;
                const size_t ret = target.fetch_and(~mask, std::memory_order_relaxed);
                return (ret & mask) != 0;
        #endif
}

libcfa/src/Makefile.am

ree06db5c	r97392b69
50	50	thread_headers_nosrc = concurrency/invoke.h
51	51	thread_headers = concurrency/coroutine.hfa concurrency/thread.hfa concurrency/kernel.hfa concurrency/monitor.hfa concurrency/mutex.hfa
52		thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/io.cfa concurrency/preemption.cfa ${thread_headers:.hfa=.cfa}
	52	thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/io.cfa concurrency/preemption.cfa concurrency/ready_queue.cfa ${thread_headers:.hfa=.cfa}
53	53	else
54	54	headers =

libcfa/src/Makefile.in

-              ree06db5c
+              r97392b69
         concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa \
         concurrency/invoke.c concurrency/io.cfa \
+        concurrency/preemption.cfa concurrency/coroutine.cfa \
+        concurrency/thread.cfa concurrency/kernel.cfa \
+        concurrency/monitor.cfa concurrency/mutex.cfa
+        concurrency/preemption.cfa concurrency/ready_queue.cfa \
+        concurrency/coroutine.cfa concurrency/thread.cfa \
+        concurrency/kernel.cfa concurrency/monitor.cfa \
+        concurrency/mutex.cfa
 @BUILDLIB_TRUE@am__objects_3 = concurrency/coroutine.lo \
 @BUILDLIB_TRUE@ concurrency/thread.lo concurrency/kernel.lo \
 …
 @BUILDLIB_TRUE@ concurrency/alarm.lo concurrency/invoke.lo \
 @BUILDLIB_TRUE@ concurrency/io.lo concurrency/preemption.lo \
 @BUILDLIB_TRUE@ $(am__objects_3)
+@BUILDLIB_TRUE@ concurrency/ready_queue.lo $(am__objects_3)
 am_libcfathread_la_OBJECTS = $(am__objects_4)
 libcfathread_la_OBJECTS = $(am_libcfathread_la_OBJECTS)
 …
 @BUILDLIB_FALSE@thread_headers =
 @BUILDLIB_TRUE@thread_headers = concurrency/coroutine.hfa concurrency/thread.hfa concurrency/kernel.hfa concurrency/monitor.hfa concurrency/mutex.hfa
 @BUILDLIB_TRUE@thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/io.cfa concurrency/preemption.cfa ${thread_headers:.hfa=.cfa}
+@BUILDLIB_TRUE@thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/io.cfa concurrency/preemption.cfa concurrency/ready_queue.cfa ${thread_headers:.hfa=.cfa}
 #----------------------------------------------------------------------------------------------------------------
 …
         concurrency/$(DEPDIR)/$(am__dirstamp)
 concurrency/preemption.lo: concurrency/$(am__dirstamp) \
+        concurrency/$(DEPDIR)/$(am__dirstamp)
+concurrency/ready_queue.lo: concurrency/$(am__dirstamp) \
         concurrency/$(DEPDIR)/$(am__dirstamp)
 concurrency/coroutine.lo: concurrency/$(am__dirstamp) \

libcfa/src/bits/debug.hfa

-              ree06db5c
+              r97392b69
                 || defined(__CFA_DEBUG_PRINT_IO__) || defined(__CFA_DEBUG_PRINT_IO_CORE__) \
                 || defined(__CFA_DEBUG_PRINT_MONITOR__) || defined(__CFA_DEBUG_PRINT_PREEMPTION__) \
+                || defined(__CFA_DEBUG_PRINT_RUNTIME_CORE__) || defined(__CFA_DEBUG_PRINT_EXCEPTION__)
+                || defined(__CFA_DEBUG_PRINT_RUNTIME_CORE__) || defined(__CFA_DEBUG_PRINT_EXCEPTION__) \
+                || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
         #include <stdio.h>
         #include <unistd.h>

libcfa/src/bits/defs.hfa

-              ree06db5c
+              r97392b69
     return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
+}
+// #define __CFA_NO_BIT_TEST_AND_SET__
#if defined( __i386 )
// 32-bit x86 : the word holding the bits is an unsigned long (btsl / btrl).

// Atomically set bit `bit` of *target, returns true if the bit was already set.
static inline bool __atomic_bts(volatile unsigned long int * target, unsigned long int bit ) {
        #if defined(__CFA_NO_BIT_TEST_AND_SET__)
                unsigned long int mask = 1ul << bit;
                unsigned long int ret = __atomic_fetch_or(target, mask, (int)__ATOMIC_RELAXED);
                return (ret & mask) != 0;
        #else
                int result = 0;
                asm volatile(
                        "LOCK btsl %[bit], %[target]\n\t"
                        // carry flag holds the previous value of the bit;
                        // *target is modified by the instruction so it must be an
                        // in/out ("+m") operand, not an input-only one.
                        : "=@ccc" (result), [target] "+m" (*target)
                        : [bit] "r" (bit)
                );
                return result != 0;
        #endif
}

// Atomically clear bit `bit` of *target, returns true if the bit was set.
static inline bool __atomic_btr(volatile unsigned long int * target, unsigned long int bit ) {
        #if defined(__CFA_NO_BIT_TEST_AND_SET__)
                unsigned long int mask = 1ul << bit;
                unsigned long int ret = __atomic_fetch_and(target, ~mask, (int)__ATOMIC_RELAXED);
                return (ret & mask) != 0;
        #else
                int result = 0;
                asm volatile(
                        "LOCK btrl %[bit], %[target]\n\t"
                        : "=@ccc" (result), [target] "+m" (*target)
                        : [bit] "r" (bit)
                );
                return result != 0;
        #endif
}
#elif defined( __x86_64 )
// 64-bit x86 : the word holding the bits is an unsigned long long (btsq / btrq).

// Atomically set bit `bit` of *target, returns true if the bit was already set.
static inline bool __atomic_bts(volatile unsigned long long int * target, unsigned long long int bit ) {
        #if defined(__CFA_NO_BIT_TEST_AND_SET__)
                // 1ull so the shift is done in the width of the mask itself
                unsigned long long int mask = 1ull << bit;
                unsigned long long int ret = __atomic_fetch_or(target, mask, (int)__ATOMIC_RELAXED);
                return (ret & mask) != 0;
        #else
                int result = 0;
                asm volatile(
                        "LOCK btsq %[bit], %[target]\n\t"
                        // carry flag holds the previous value of the bit;
                        // *target is modified by the instruction so it must be an
                        // in/out ("+m") operand, not an input-only one.
                        : "=@ccc" (result), [target] "+m" (*target)
                        : [bit] "r" (bit)
                );
                return result != 0;
        #endif
}

// Atomically clear bit `bit` of *target, returns true if the bit was set.
static inline bool __atomic_btr(volatile unsigned long long int * target, unsigned long long int bit ) {
        #if defined(__CFA_NO_BIT_TEST_AND_SET__)
                // 1ull so the shift is done in the width of the mask itself
                unsigned long long int mask = 1ull << bit;
                unsigned long long int ret = __atomic_fetch_and(target, ~mask, (int)__ATOMIC_RELAXED);
                return (ret & mask) != 0;
        #else
                int result = 0;
                asm volatile(
                        "LOCK btrq %[bit], %[target]\n\t"
                        : "=@ccc" (result), [target] "+m" (*target)
                        : [bit] "r" (bit)
                );
                return result != 0;
        #endif
}
#elif defined( __ARM_ARCH )
        #error __atomic_bts and __atomic_btr not implemented for arm
#else
        #error uknown hardware architecture
#endif

libcfa/src/concurrency/invoke.h

-              ree06db5c
+              r97392b69
         };
+        // Linked-list fields
+        // intrusive link fields for threads, grouped so other structures can embed the same layout
+        struct __thread_desc_link {
+                struct $thread * next;          // following thread in the list
+                struct $thread * prev;          // preceding thread in the list
+                volatile unsigned long long ts; // NOTE(review): presumably a timestamp used by the ready queue -- confirm
+        };
         struct $thread {
                 // Core threading fields
 …
                 // Link lists fields
                 // instrusive link field for threads
                 struct $thread * next;
+                struct __thread_desc_link link;
                 struct {
 …
         #ifdef __cforall
         extern "Cforall" {
                 static inline $thread *& get_next( $thread & this ) __attribute__((const)) {
                         return this.next;
+                        return this.link.next;
+                }

libcfa/src/concurrency/io.cfa

-              ree06db5c
+              r97392b69
                                         // This is the tricky case
                                         // The thread was preempted and now it is on the ready queue
+                                        /* paranoid */ verify( thrd.next == 1p );                // The thread should be the last on the list
+                                        /* paranoid */ verify( thrd.next != 0p );                // The thread should be the last on the list
                                         /* paranoid */ verify( this.ready_queue.head == &thrd ); // The thread should be the only thing on the list

libcfa/src/concurrency/kernel.cfa

-              ree06db5c
+              r97392b69
 static void __run_thread(processor * this, $thread * dst);
 static $thread * __halt(processor * this);
 static bool __wake_one(cluster * cltr, bool was_empty);
+static bool __wake_one(cluster * cltr);
 static bool __wake_proc(processor *);
 …
         self_mon.recursion = 1;
         self_mon_p = &self_mon;
+        next = 0p;
+        link.next = 0p;
+        link.prev = 0p;
         node.next = 0p;
 …
         this.name = name;
         this.cltr = &cltr;
+        id = -1u;
         terminated{ 0 };
         destroyer = 0p;
 …
         this.preemption_rate = preemption_rate;
         ready_queue{};
         ready_queue_lock{};
+        ready_lock{};
         #if !defined(__CFA_NO_STATISTICS__)
 …
         __cfadbg_print_safe(runtime_core, "Kernel : core %p starting\n", this);
+        // register the processor unless it's the main thread which is handled in the boot sequence
+        if(this != mainProcessor) {
+                this->id = doregister2(this->cltr, this);
+                ready_queue_grow( this->cltr );
+        }
         doregister(this->cltr, this);
 …
                                 /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
                                 /* paranoid */ verifyf( readyThread->state == Ready || readyThread->preempted != __NO_PREEMPTION, "state : %d, preempted %d\n", readyThread->state, readyThread->preempted);
                                 /* paranoid */ verifyf( readyThread->next == 0p, "Expected null got %p", readyThread->next );
+                                /* paranoid */ verifyf( readyThread->link.next == 0p, "Expected null got %p", readyThread->link.next );
                                 // We found a thread run it
 …
         V( this->terminated );
+        // unregister the processor unless it's the main thread which is handled in the boot sequence
+        if(this != mainProcessor) {
+                ready_queue_shrink( this->cltr );
+                unregister2(this->cltr, this);
+        }
+        else {
+                // HACK : the coroutine context switch expects this_thread to be set
+                // and it makes sense for it to be set in all other cases except here
+                // fake it
+                kernelTLS.this_thread = mainThread;
+        }
         __cfadbg_print_safe(runtime_core, "Kernel : core %p terminated\n", this);
+        // HACK : the coroutine context switch expects this_thread to be set
+        // and it makes sense for it to be set in all other cases except here
+        // fake it
+        if( this == mainProcessor ) kernelTLS.this_thread = mainThread;
+        stats_tls_tally(this->cltr);
+}
 …
 // Scheduler routines
 // KERNEL ONLY
+void __schedule_thread( $thread * thrd ) with( *thrd->curr_cluster ) {
+void __schedule_thread( $thread * thrd ) {
+        /* paranoid */ verify( thrd );
+        /* paranoid */ verify( thrd->state != Halted );
         /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
         /* paranoid */ #if defined( __CFA_WITH_VERIFY__ )
         /* paranoid */ if( thrd->state == Blocked || thrd->state == Start ) assertf( thrd->preempted == __NO_PREEMPTION,
                           "Error inactive thread marked as preempted, state %d, preemption %d\n", thrd->state, thrd->preempted );
         /* paranoid */ if( thrd->preempted != __NO_PREEMPTION ) assertf(thrd->state == Active || thrd->state == Rerun,
                           "Error preempted thread marked as not currently running, state %d, preemption %d\n", thrd->state, thrd->preempted );
+        /* paranoid */  if( thrd->state == Blocked || thrd->state == Start ) assertf( thrd->preempted == __NO_PREEMPTION,
+                                        "Error inactive thread marked as preempted, state %d, preemption %d\n", thrd->state, thrd->preempted );
+        /* paranoid */  if( thrd->preempted != __NO_PREEMPTION ) assertf(thrd->state == Active || thrd->state == Rerun,
+                                        "Error preempted thread marked as not currently running, state %d, preemption %d\n", thrd->state, thrd->preempted );
         /* paranoid */ #endif
         /* paranoid */ verifyf( thrd->next == 0p, "Expected null got %p", thrd->next );
+        /* paranoid */ verifyf( thrd->link.next == 0p, "Expected null got %p", thrd->link.next );
         if (thrd->preempted == __NO_PREEMPTION) thrd->state = Ready;
+        lock  ( ready_queue_lock __cfaabi_dbg_ctx2 );
+        bool was_empty = !(ready_queue != 0);
+        append( ready_queue, thrd );
+        unlock( ready_queue_lock );
+        __wake_one(thrd->curr_cluster, was_empty);
+        ready_schedule_lock(thrd->curr_cluster, kernelTLS.this_processor);
+                push( thrd->curr_cluster, thrd );
+                __wake_one(thrd->curr_cluster);
+        ready_schedule_unlock(thrd->curr_cluster, kernelTLS.this_processor);
         /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
 …
         /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
         lock( ready_queue_lock __cfaabi_dbg_ctx2 );
         $thread * head = pop_head( ready_queue );
         unlock( ready_queue_lock );
+        ready_schedule_lock(this, kernelTLS.this_processor);
+                $thread * head = pop( this );
+        ready_schedule_unlock(this, kernelTLS.this_processor);
         /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
 …
         // If that is the case, abandon the preemption.
         bool preempted = false;
         if(thrd->next == 0p) {
+        if(thrd->link.next == 0p) {
                 preempted = true;
                 thrd->preempted = reason;
 …
                 pending_preemption = false;
                 kernel_thread = pthread_self();
+                id = -1u;
                 runner{ &this };
 …
         mainProcessor = (processor *)&storage_mainProcessor;
         (*mainProcessor){};
+        mainProcessor->id = doregister2(mainCluster, mainProcessor);
         //initialize the global state variables
 …
         kernel_stop_preemption();
+        unregister2(mainCluster, mainProcessor);
         // Destroy the main processor and its context in reverse order of construction
         // These were manually constructed so we need manually destroy them
         void ^?{}(processor & this) with( this ){
                 /* paranoid */ verify( this.do_terminate == true );
+                __cfaabi_dbg_print_safe("Kernel : destroyed main processor context %p\n", &runner);
+        }
 …
         // Final step, destroy the main thread since it is no longer needed
         // Since we provided a stack to this task it will not destroy anything
         /* paranoid */ verify(mainThread->self_cor.stack.storage == (__stack_t*)(((uintptr_t)&storage_mainThreadCtx)| 0x1));
 …
 // Wake a thread from the front if there are any
+static bool __wake_one(cluster * this, __attribute__((unused)) bool force) {
+        // if we don't want to force check if we know it's false
+        // if( !this->idles.head && !force ) return false;
+static bool __wake_one(cluster * this) {
         // First, lock the cluster idle
         lock( this->idle_lock __cfaabi_dbg_ctx2 );

libcfa/src/concurrency/kernel.hfa

-              ree06db5c
+              r97392b69
         // Cluster from which to get threads
         struct cluster * cltr;
+        unsigned int id;
         // Name of the processor
 …
         // Link lists fields
         struct __dbg_node_proc {
                 struct processor * next;
                 struct processor * prev;
+        struct __dbg_node_cltr {
+                processor * next;
+                processor * prev;
         } node;
 …
 #define CFA_CLUSTER_IO_BUFFLEN_OFFSET        16
+//-----------------------------------------------------------------------------
+// Cluster Tools
+// Cells used by the reader-writer lock;
+// while not generic, it only relies on an opaque pointer
+struct __processor_id;
+// Reader-writer lock protecting the ready-queue.
+// While this lock is mostly generic, some aspects
+// have been hard-coded for the ready-queue for
+// simplicity and performance.
+struct __clusterRWLock_t {
+        // total number of cachelines allocated
+        unsigned int max;
+        // number of cachelines currently in use
+        volatile unsigned int alloc;
+        // number of cachelines ready to iterate over
+        // (differs from alloc while a thread is in the second half of doregister)
+        volatile unsigned int ready;
+        // writer lock
+        volatile bool lock;
+        // pointer to the cells (one __processor_id per cacheline)
+        __processor_id * data;
+};
+void  ?{}(__clusterRWLock_t & this);
+void ^?{}(__clusterRWLock_t & this);
+// Intrusive lanes which are used by the relaxed ready queue
+struct __attribute__((aligned(128))) __intrusive_lane_t {
+        // spin lock protecting the queue
+        volatile bool lock;
+        // anchors for the head and the tail of the queue
+        struct __sentinel_t {
+                // Linked-list fields
+                // intrusive link fields for threads
+                // must be laid out exactly as in $thread
+                __thread_desc_link link;
+        } before, after;
+#if defined(__CFA_WITH_VERIFY__)
+        // id of the last processor to acquire the lock
+        // needed only to check for mutual exclusion violations
+        unsigned int last_id;
+        // number of items on this list
+        // needed only to check for deadlocks
+        unsigned int count;
+#endif
+        // Optional statistics counters
+        #if !defined(__CFA_NO_SCHED_STATS__)
+                struct __attribute__((aligned(64))) {
+                        // difference between the number of pushes and pops
+                        ssize_t diff;
+                        // total number of pushes and of pops
+                        size_t  push;
+                        size_t  pop ;
+                } stat;
+        #endif
+};
+void  ?{}(__intrusive_lane_t & this);
+void ^?{}(__intrusive_lane_t & this);
+typedef unsigned long long __cfa_readyQ_mask_t;
+// enum {
+//      __cfa_ready_queue_mask_size = (64 - sizeof(size_t)) / sizeof(size_t),
+//      __cfa_max_ready_queues = __cfa_ready_queue_mask_size * 8 * sizeof(size_t)
+// };
+#define __cfa_lane_mask_size ((64 - sizeof(size_t)) / sizeof(__cfa_readyQ_mask_t))
+#define __cfa_max_lanes (__cfa_lane_mask_size * 8 * sizeof(__cfa_readyQ_mask_t))
+//TODO adjust cache size to ARCHITECTURE
+// Structure holding the relaxed ready queue
+struct __attribute__((aligned(128))) __ready_queue_t {
+        // Data tracking how many/which lanes are used
+        // Aligned to 128 for cache locality
+        struct {
+                // number of non-empty lanes
+                volatile size_t count;
+                // bit mask, set bits identify which lanes are non-empty
+                volatile __cfa_readyQ_mask_t mask[ __cfa_lane_mask_size ];
+        } used;
+        // Data tracking the actual lanes
+        // On a separate cacheline from the used struct since
+        // used can change on each push/pop but this data
+        // only changes on shrink/grow
+        struct __attribute__((aligned(64))) {
+                // Array of lanes
+                __intrusive_lane_t * volatile data;
+                // Number of lanes (empty or not)
+                volatile size_t count;
+        } lanes;
+        // Statistics
+        #if !defined(__CFA_NO_STATISTICS__)
+                __attribute__((aligned(64))) struct {
+                        struct {
+                                // Push statistics
+                                struct {
+                                        // number of attempts at pushing something
+                                        volatile size_t attempt;
+                                        // number of successes at pushing
+                                        volatile size_t success;
+                                } push;
+                                // Pop statistics
+                                struct {
+                                        // number of reads of the mask
+                                        // picking an empty __cfa_readyQ_mask_t counts here
+                                        // but not as an attempt
+                                        volatile size_t maskrds;
+                                        // number of attempts at popping something
+                                        volatile size_t attempt;
+                                        // number of successes at popping
+                                        volatile size_t success;
+                                } pop;
+                        } pick;
+                        // stats on the "used" struct of the queue
+                        // tracks the average number of queues that are not empty
+                        // when pushing / popping
+                        struct {
+                                volatile size_t value;
+                                volatile size_t count;
+                        } used;
+                } global_stats;
+        #endif
+};
+void  ?{}(__ready_queue_t & this);
+void ^?{}(__ready_queue_t & this);
 //-----------------------------------------------------------------------------
 // Cluster
 struct cluster {
         // Ready queue locks
         __spinlock_t ready_queue_lock;
+        __clusterRWLock_t ready_lock;
         // Ready queue for threads
         __queue_t($thread) ready_queue;
+        __ready_queue_t ready_queue;
         // Name of the cluster

libcfa/src/concurrency/kernel_private.hfa

-              ree06db5c
+              r97392b69
 //-----------------------------------------------------------------------------
 // Utils
 #define KERNEL_STORAGE(T,X) static char storage_##X[sizeof(T)]
+#define KERNEL_STORAGE(T,X) __attribute((aligned(__alignof__(T)))) static char storage_##X[sizeof(T)]
 static inline uint32_t __tls_rand() {
 …
 void unregister( struct cluster * cltr, struct processor * proc );
+//=======================================================================
+// Cluster lock API
+//=======================================================================
+struct __attribute__((aligned(64))) __processor_id { // one cell of the cluster RW lock, aligned to 64 bytes
+        processor * volatile handle; // processor this cell belongs to (ready_schedule_lock verifies handle == proc)
+        volatile bool lock;          // per-processor flag taken via __atomic_acquire on the reader side
+};
+// Lock-Free registering/unregistering of threads
+// Register a processor to a given cluster and get its unique id in return
+unsigned doregister2( struct cluster * cltr, struct processor * proc );
+// Unregister a processor from a given cluster using its id, getting back the original pointer
+void     unregister2( struct cluster * cltr, struct processor * proc );
+//=======================================================================
+// Reader-writer lock implementation
+// Concurrent with doregister/unregister,
+//    i.e., threads can be added at any point during or between the entry/exit
+//-----------------------------------------------------------------------
+// Simple spinlock underlying the RWLock
+// Blocking acquire: returns only once *ll has been flipped from false to true by this caller
+static inline void __atomic_acquire(volatile bool * ll) {
+        while( __builtin_expect(__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST), false) ) { // exchange returned true: someone else holds it
+                while(__atomic_load_n(ll, (int)__ATOMIC_RELAXED)) // wait on plain loads until the flag reads false, then retry the exchange
+                        asm volatile("pause");
+        }
+        /* paranoid */ verify(*ll);
+}
+// Non-blocking acquire: single attempt, returns true if the lock was obtained
+static inline bool __atomic_try_acquire(volatile bool * ll) {
+        return !__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST); // previous value false => we set it, so we own it
+}
+// Release: clears the flag with release ordering; the flag is expected to be set on entry
+static inline void __atomic_unlock(volatile bool * ll) {
+        /* paranoid */ verify(*ll);
+        __atomic_store_n(ll, (bool)false, __ATOMIC_RELEASE);
+}
+//-----------------------------------------------------------------------
+// Reader side : acquire when using the ready queue to schedule but not
+//  creating/destroying queues
+static inline void ready_schedule_lock( struct cluster * cltr, struct processor * proc) with(cltr->ready_lock) {
+        unsigned iproc = proc->id; // index of this processor's cell in data
+        /*paranoid*/ verify(data[iproc].handle == proc);
+        /*paranoid*/ verify(iproc < ready);
+        // Step 1 : make sure no writer is in the middle of the critical section
+        while(__atomic_load_n(&lock, (int)__ATOMIC_RELAXED))
+                asm volatile("pause");
+        // A fence would be needed here because we don't want to start trying
+        // to acquire the local lock before we have read a false.
+        // Not needed on x86
+        // std::atomic_thread_fence(std::memory_order_seq_cst);
+        // Step 2 : acquire our local lock
+        __atomic_acquire( &data[iproc].lock );
+        /*paranoid*/ verify(data[iproc].lock);
+}
+static inline void ready_schedule_unlock( struct cluster * cltr, struct processor * proc) with(cltr->ready_lock) { // reader side : releases the local lock taken by ready_schedule_lock
+        unsigned iproc = proc->id; // index of this processor's cell in data
+        /*paranoid*/ verify(data[iproc].handle == proc);
+        /*paranoid*/ verify(iproc < ready);
+        /*paranoid*/ verify(data[iproc].lock);
+        __atomic_unlock(&data[iproc].lock);
+}
+//-----------------------------------------------------------------------
+// Writer side : acquire when changing the ready queue, e.g. adding more
+//  queues or removing them.
+uint_fast32_t ready_mutate_lock( struct cluster & cltr );
+void ready_mutate_unlock( struct cluster & cltr, uint_fast32_t /* value returned by lock */ );
+//=======================================================================
+// Ready-Queue API
+//-----------------------------------------------------------------------
+// push thread onto a ready queue for a cluster
+// returns true if the list was previously empty, false otherwise
+__attribute__((hot)) bool push(struct cluster * cltr, struct $thread * thrd);
+//-----------------------------------------------------------------------
+// pop thread from the ready queue of a cluster
+// returns 0p if empty
+__attribute__((hot)) struct $thread * pop(struct cluster * cltr);
+//-----------------------------------------------------------------------
+// Increase the width of the ready queue (number of lanes) by 4
+void ready_queue_grow  (struct cluster * cltr);
+//-----------------------------------------------------------------------
+// Decrease the width of the ready queue (number of lanes) by 4
+void ready_queue_shrink(struct cluster * cltr);
+//-----------------------------------------------------------------------
+// Statistics call made at the end of each thread to register statistics
+#if !defined(__CFA_NO_STATISTICS__)
+void stats_tls_tally(struct cluster * cltr);
+#else
+static inline void stats_tls_tally(struct cluster * cltr) {}
+#endif
 // Local Variables: //
 // mode: c //

libcfa/src/concurrency/monitor.cfa

-              ree06db5c
+              r97392b69
                 // Some one else has the monitor, wait in line for it
                 /* paranoid */ verify( thrd->next == 0p );
+                /* paranoid */ verify( thrd->link.next == 0p );
                 append( this->entry_queue, thrd );
                 /* paranoid */ verify( thrd->next == 1p );
+                /* paranoid */ verify( thrd->link.next == 1p );
                 unlock( this->lock );
 …
                 // Someone else has the monitor, wait in line for it
                 /* paranoid */ verify( thrd->next == 0p );
+                /* paranoid */ verify( thrd->link.next == 0p );
                 append( this->entry_queue, thrd );
                 /* paranoid */ verify( thrd->next == 1p );
+                /* paranoid */ verify( thrd->link.next == 1p );
                 unlock( this->lock );
 …
         $thread * new_owner = pop_head( this->entry_queue );
         /* paranoid */ verifyf( !this->owner || kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
         /* paranoid */ verify( !new_owner || new_owner->next == 0p );
+        /* paranoid */ verify( !new_owner || new_owner->link.next == 0p );
         __set_owner( this, new_owner );
 …
+        }
         __cfaabi_dbg_print_safe( "Kernel :  Runing %i (%p)\n", ready2run, ready2run ? node->waiting_thread : 0p );
+        __cfaabi_dbg_print_safe( "Kernel :  Runing %i (%p)\n", ready2run, ready2run ? (thread*)node->waiting_thread : (thread*)0p );
         return ready2run ? node->waiting_thread : 0p;
+}
 …
         // For each thread in the entry-queue
         for(    $thread ** thrd_it = &entry_queue.head;
                 *thrd_it != 1p;
                 thrd_it = &(*thrd_it)->next
+                (*thrd_it) != 1p;
+                thrd_it = &(*thrd_it)->link.next
         ) {
                 // For each acceptable check if it matches

libcfa/src/concurrency/preemption.cfa

ree06db5c	r97392b69
121	121	// If there are still alarms pending, reset the timer
122	122	if( & (*alarms)`first ) {
123		__cfa~~abi_dbg_print_buffer_decl(~~ " KERNEL: @%ju(%ju) resetting alarm to %ju.\n", currtime.tv, __kernel_get_time().tv, (alarms->head->alarm - currtime).tv);
	123	__cfadbg_print_buffer_decl(preemption, " KERNEL: @%ju(%ju) resetting alarm to %ju.\n", currtime.tv, __kernel_get_time().tv, (alarms->head->alarm - currtime).tv);
124	124	Duration delta = (*alarms)`first.alarm - currtime;
125	125	Duration capped = max(delta, 50`us);

libcfa/src/concurrency/thread.cfa

-              ree06db5c
+              r97392b69
         self_mon_p = &self_mon;
         curr_cluster = &cl;
+        next = 0p;
+        link.next = 0p;
+        link.prev = 0p;
         node.next = 0p;

libcfa/src/stdhdr/assert.h

-              ree06db5c
+              r97392b69
         #define verify(x) assert(x)
         #define verifyf(x, ...) assertf(x, __VA_ARGS__)
+        #define verifyfail(...)
         #define __CFA_WITH_VERIFY__
 #else
         #define verify(x)
         #define verifyf(x, ...)
+        #define verifyfail(...)
 #endif

tests/concurrent/examples/datingService.cfa

-              ree06db5c
+              r97392b69
                 signal_block( Boys[ccode] );                                    // restart boy to set phone number
         } // if
         //sout | "Girl:" | PhoneNo | "is dating Boy at" | BoyPhoneNo | "with ccode" | ccode;
+        // sout | "Girl:" | PhoneNo | "is dating Boy at" | BoyPhoneNo | "with ccode" | ccode;
         return BoyPhoneNo;
 } // DatingService girl
 …
                 signal_block( Girls[ccode] );                                   // restart girl to set phone number
         } // if
         //sout | " Boy:" | PhoneNo | "is dating Girl" | GirlPhoneNo | "with ccode" | ccode;
+        // sout | " Boy:" | PhoneNo | "is dating Girl" | GirlPhoneNo | "with ccode" | ccode;
         return GirlPhoneNo;
 } // DatingService boy

tests/concurrent/waitfor/when.cfa

-              ree06db5c
+              r97392b69
 void arbiter( global_t & mutex this ) {
+        // There is a race at start where callers can get in before the arbiter.
+        // It doesn't really matter here so just restart the loop correctly and move on
+        this.last_call = 6;
         for( int i = 0; i < N; i++ ) {
                    when( this.last_call == 6 ) waitfor( call1 : this ) { if( this.last_call != 1) { serr | "Expected last_call to be 1 got" | this.last_call; } }

Context Navigation

Changes in / [ee06db5c:97392b69]

Legend:

doc/theses/thierry_delisle_PhD/code/relaxed_list.cpp

doc/theses/thierry_delisle_PhD/code/relaxed_list.hpp

doc/theses/thierry_delisle_PhD/code/utils.hpp

libcfa/src/Makefile.am

libcfa/src/Makefile.in

libcfa/src/bits/debug.hfa

libcfa/src/bits/defs.hfa

libcfa/src/concurrency/invoke.h

libcfa/src/concurrency/io.cfa

libcfa/src/concurrency/kernel.cfa

libcfa/src/concurrency/kernel.hfa

libcfa/src/concurrency/kernel_private.hfa

libcfa/src/concurrency/monitor.cfa

libcfa/src/concurrency/preemption.cfa

libcfa/src/concurrency/thread.cfa

libcfa/src/stdhdr/assert.h

tests/concurrent/examples/datingService.cfa

tests/concurrent/waitfor/when.cfa

Download in other formats: