Index: doc/theses/thierry_delisle_PhD/code/Makefile
===================================================================
--- doc/theses/thierry_delisle_PhD/code/Makefile	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,22 +1,0 @@
-
-
-CXXFLAGS = -O3 -g -Wall -Wextra -std=c++17
-LDFLAGS = -pthread -latomic
-
-push:
-	clang++ relaxed_list.cpp -g -Wall -Wextra -std=c++17 -fsyntax-only &&  rsync -av relaxed_list.cpp relaxed_list.hpp utils.hpp assert.hpp scale.sh plg7b:~/workspace/sched/.
-
-relaxed_list: $(firstword $(MAKEFILE_LIST)) | build
-	clang++ relaxed_list.cpp $(CXXFLAGS) $(LDFLAGS) -lpng -MMD -MF build/$(@).d -o $(@)
-
--include build/relaxed_list.d
-
-layout.ast: $(firstword $(MAKEFILE_LIST)) | build
-	clang++ relaxed_list_layout.cpp $(CXXFLAGS) -MMD -MF build/$(@).d -MT $(@) -E -o build/$(@).ii
-	clang++ -Xclang -fdump-record-layouts -fsyntax-only $(CXXFLAGS) build/$(@).ii > build/layout.ast.raw
-	cat build/$(@).raw > $(@)
-
--include build/layout.ast.d
-
-build:
-	mkdir -p build
Index: doc/theses/thierry_delisle_PhD/code/assert.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/assert.hpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,22 +1,0 @@
-#pragma once
-
-#ifndef NDEBUG
-#include <cassert>
-#include <cstdlib>
-
-#define sstr(s) #s
-#define xstr(s) sstr(s)
-
-extern const char * __my_progname;
-
-#define assertf(cond, ...) ({             \
-	if(!(cond)) {                       \
-		fprintf(stderr, "%s: " __FILE__ ":" xstr(__LINE__) ": %s: Assertion '" xstr(cond) "' failed.\n", __my_progname, __PRETTY_FUNCTION__); \
-		fprintf(stderr, __VA_ARGS__); \
-		fprintf(stderr, "\n"); \
-		std::abort();                 \
-	}                                   \
-})
-#else
-#define assertf(cond, ...)
-#endif
Index: doc/theses/thierry_delisle_PhD/code/bitbench/select.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/bitbench/select.cpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,186 +1,0 @@
-
-#include "../utils.hpp"
-
-void consume(int i, int j) __attribute__((noinline));
-void consume(int i, int j) {
-	asm volatile("":: "rm" (i), "rm" (i) );
-}
-
-static inline unsigned rand_bit_sw(unsigned rnum, size_t mask) {
-	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
-	uint64_t v = mask;   // Input value to find position with rank r.
-	unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
-	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
-	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
-	unsigned int t;      // Bit count temporary.
-
-	// Do a normal parallel bit count for a 64-bit integer,
-	// but store all intermediate steps.
-	a =  v - ((v >> 1) & ~0UL/3);
-	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
-	c = (b + (b >> 4)) & ~0UL/0x11;
-	d = (c + (c >> 8)) & ~0UL/0x101;
-
-
-	t = (d >> 32) + (d >> 48);
-	// Now do branchless select!
-	s  = 64;
-	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
-	t  = (d >> (s - 16)) & 0xff;
-	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
-	t  = (c >> (s - 8)) & 0xf;
-	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
-	t  = (b >> (s - 4)) & 0x7;
-	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
-	t  = (a >> (s - 2)) & 0x3;
-	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
-	t  = (v >> (s - 1)) & 0x1;
-	s -= ((t - r) & 256) >> 8;
-	return s - 1;
-}
-
-static inline unsigned rand_bit_hw(unsigned rnum, size_t mask) {
-	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
-	uint64_t picked = _pdep_u64(1ul << bit, mask);
-	return picked ? __builtin_ctzl(picked) : 0;
-}
-
-struct TLS {
-	Random rng = { 6 };
-} tls;
-
-const unsigned numLists = 64;
-
-static inline void blind() {
-	int i = tls.rng.next() % numLists;
-	int j = tls.rng.next() % numLists;
-
-	consume(i, j);
-}
-
-std::atomic_size_t list_mask[7];
-static inline void bitmask_sw() {
-	unsigned i, j;
-	{
-		// Pick two lists at random
-		unsigned num = ((numLists - 1) >> 6) + 1;
-
-		unsigned ri = tls.rng.next();
-		unsigned rj = tls.rng.next();
-
-		unsigned wdxi = (ri >> 6u) % num;
-		unsigned wdxj = (rj >> 6u) % num;
-
-		size_t maski = list_mask[wdxi].load(std::memory_order_relaxed);
-		size_t maskj = list_mask[wdxj].load(std::memory_order_relaxed);
-
-		unsigned bi = rand_bit_sw(ri, maski);
-		unsigned bj = rand_bit_sw(rj, maskj);
-
-		i = bi | (wdxi << 6);
-		j = bj | (wdxj << 6);
-	}
-
-	consume(i, j);
-}
-
-static inline void bitmask_hw() {
-	#if !defined(__BMI2__)
-		#warning NO bmi2 for pdep rand_bit
-		return;
-	#endif
-	unsigned i, j;
-	{
-		// Pick two lists at random
-		unsigned num = ((numLists - 1) >> 6) + 1;
-
-		unsigned ri = tls.rng.next();
-		unsigned rj = tls.rng.next();
-
-		unsigned wdxi = (ri >> 6u) % num;
-		unsigned wdxj = (rj >> 6u) % num;
-
-		size_t maski = list_mask[wdxi].load(std::memory_order_relaxed);
-		size_t maskj = list_mask[wdxj].load(std::memory_order_relaxed);
-
-		unsigned bi = rand_bit_hw(ri, maski);
-		unsigned bj = rand_bit_hw(rj, maskj);
-
-		i = bi | (wdxi << 6);
-		j = bj | (wdxj << 6);
-	}
-
-	consume(i, j);
-}
-
-struct {
-	const unsigned mask = 7;
-	const unsigned depth = 3;
-	const uint64_t indexes = 0x0706050403020100;
-	uint64_t masks( unsigned node ) {
-		return 0xff00ffff00ff;
-	}
-} snzm;
-static inline void sparsemask() {
-	#if !defined(__BMI2__)
-		#warning NO bmi2 for sparse mask
-		return;
-	#endif
-	unsigned i, j;
-	{
-		// Pick two random number
-		unsigned ri = tls.rng.next();
-		unsigned rj = tls.rng.next();
-
-		// Pick two nodes from it
-		unsigned wdxi = ri & snzm.mask;
-		unsigned wdxj = rj & snzm.mask;
-
-		// Get the masks from the nodes
-		size_t maski = snzm.masks(wdxi);
-		size_t maskj = snzm.masks(wdxj);
-
-		uint64_t idxsi = _pext_u64(snzm.indexes, maski);
-		uint64_t idxsj = _pext_u64(snzm.indexes, maskj);
-
-		auto pi = __builtin_popcountll(maski);
-		auto pj = __builtin_popcountll(maskj);
-
-		ri = pi ? ri & ((pi >> 3) - 1) : 0;
-		rj = pj ? rj & ((pj >> 3) - 1) : 0;
-
-		unsigned bi = (idxsi >> (ri << 3)) & 0xff;
-		unsigned bj = (idxsj >> (rj << 3)) & 0xff;
-
-		i = (bi << snzm.depth) | wdxi;
-		j = (bj << snzm.depth) | wdxj;
-	}
-
-	consume(i, j);
-}
-
-template<typename T>
-void benchmark( T func, const std::string & name ) {
-	std::cout << "Starting " << name << std::endl;
-	auto before = Clock::now();
-	const int N = 250'000'000;
-	for(int i = 0; i < N; i++) {
-		func();
-	}
-	auto after = Clock::now();
-	duration_t durr = after - before;
-	double duration = durr.count();
-	std::cout << "Duration(s) : " << duration << std::endl;
-	std::cout << "Ops/sec     : " << uint64_t(N / duration) << std::endl;
-	std::cout << "ns/Op       : " << double(duration * 1'000'000'000.0 / N) << std::endl;
-	std::cout << std::endl;
-}
-
-int main() {
-	std::cout.imbue(std::locale(""));
-
-	benchmark(blind, "Blind guess");
-	benchmark(bitmask_sw, "Dense bitmask");
-	benchmark(bitmask_hw, "Dense bitmask with Parallel Deposit");
-	benchmark(sparsemask, "Parallel Extract bitmask");
-}
Index: doc/theses/thierry_delisle_PhD/code/bts.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/bts.cpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,279 +1,0 @@
-#include <array>
-#include <iomanip>
-#include <iostream>
-#include <locale>
-#include <string>
-#include <thread>
-#include <vector>
-
-#include <getopt.h>
-#include <unistd.h>
-#include <sys/sysinfo.h>
-
-#include "utils.hpp"
-
-// ================================================================================================
-//                        UTILS
-// ================================================================================================
-
-struct local_stat_t {
-	size_t cnt = 0;
-};
-
-struct global_stat_t {
-	std::atomic_size_t cnt = { 0 };
-};
-
-void atomic_max(std::atomic_size_t & target, size_t value) {
-	for(;;) {
-		size_t expect = target.load(std::memory_order_relaxed);
-		if(value <= expect) return;
-		bool success = target.compare_exchange_strong(expect, value);
-		if(success) return;
-	}
-}
-
-void atomic_min(std::atomic_size_t & target, size_t value) {
-	for(;;) {
-		size_t expect = target.load(std::memory_order_relaxed);
-		if(value >= expect) return;
-		bool success = target.compare_exchange_strong(expect, value);
-		if(success) return;
-	}
-}
-
-void tally_stats(global_stat_t & global, local_stat_t & local) {
-	global.cnt   += local.cnt;
-}
-
-void waitfor(double & duration, barrier_t & barrier, std::atomic_bool & done) {
-	std::cout << "Starting" << std::endl;
-	auto before = Clock::now();
-	barrier.wait(0);
-
-	while(true) {
-		usleep(100000);
-		auto now = Clock::now();
-		duration_t durr = now - before;
-		if( durr.count() > duration ) {
-			done = true;
-			break;
-		}
-		std::cout << "\r" << std::setprecision(4) << durr.count();
-		std::cout.flush();
-	}
-
-	barrier.wait(0);
-	auto after = Clock::now();
-	duration_t durr = after - before;
-	duration = durr.count();
-	std::cout << "\rClosing down" << std::endl;
-}
-
-void waitfor(double & duration, barrier_t & barrier, const std::atomic_size_t & count) {
-	std::cout << "Starting" << std::endl;
-	auto before = Clock::now();
-	barrier.wait(0);
-
-	while(true) {
-		usleep(100000);
-		size_t c = count.load();
-		if( c == 0 ) {
-			break;
-		}
-		std::cout << "\r" << c;
-		std::cout.flush();
-	}
-
-	barrier.wait(0);
-	auto after = Clock::now();
-	duration_t durr = after - before;
-	duration = durr.count();
-	std::cout << "\rClosing down" << std::endl;
-}
-
-void print_stats(double duration, unsigned nthread, global_stat_t & global) {
-	std::cout << "Done" << std::endl;
-
-	size_t ops = global.cnt;
-	size_t ops_sec = size_t(double(ops) / duration);
-	size_t ops_thread = ops_sec / nthread;
-	auto dur_nano = duration_cast<std::nano>(1.0);
-
-	std::cout << "Duration      : " << duration << "s\n";
-	std::cout << "ns/Op         : " << ( dur_nano / ops_thread )<< "\n";
-	std::cout << "Ops/sec/thread: " << ops_thread << "\n";
-	std::cout << "Ops/sec       : " << ops_sec << "\n";
-	std::cout << "Total ops     : " << ops << "\n";
-}
-
-static inline bool bts(std::atomic_size_t & target, size_t bit ) {
-	/*
-	int result = 0;
-	asm volatile(
-		"LOCK btsq %[bit], %[target]\n\t"
-		:"=@ccc" (result)
-		: [target] "m" (target), [bit] "r" (bit)
-	);
- 	return result != 0;
-	/*/
-	size_t mask = 1ul << bit;
-	size_t ret = target.fetch_or(mask, std::memory_order_relaxed);
-	return (ret & mask) != 0;
-	//*/
-}
-
-static inline bool btr(std::atomic_size_t & target, size_t bit ) {
-	/*
-	int result = 0;
-	asm volatile(
-		"LOCK btrq %[bit], %[target]\n\t"
-		:"=@ccc" (result)
-		: [target] "m" (target), [bit] "r" (bit)
-	);
- 	return result != 0;
-	/*/
-	size_t mask = 1ul << bit;
-	size_t ret = target.fetch_and(~mask, std::memory_order_relaxed);
-	return (ret & mask) != 0;
-	//*/
-}
-
-// ================================================================================================
-//                        EXPERIMENTS
-// ================================================================================================
-
-// ================================================================================================
-__attribute__((noinline)) void runPingPong_body(
-	std::atomic<bool>& done,
-	local_stat_t & local,
-	std::atomic_size_t & target,
-	size_t id
-) {
-	while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) {
-
-		bool ret;
-		ret = bts(target, id);
-		assert(!ret);
-
-		// -----
-
-		ret = btr(target, id);
-		assert(ret);
-		local.cnt++;
-	}
-}
-
-void run(unsigned nthread, double duration) {
-	// Barrier for synchronization
-	barrier_t barrier(nthread + 1);
-
-	// Data to check everything is OK
-	global_stat_t global;
-
-	// Flag to signal termination
-	std::atomic_bool done  = { false };
-
-	std::cout << "Initializing ";
-	// List being tested
-	std::atomic_size_t word = { 0 };
-	{
-		std::thread * threads[nthread];
-		unsigned i = 1;
-		for(auto & t : threads) {
-			t = new std::thread([&done, &word, &barrier, &global](unsigned tid) {
-				local_stat_t local;
-
-				// affinity(tid);
-
-				barrier.wait(tid);
-
-				// EXPERIMENT START
-
-				runPingPong_body(done, local, word, tid - 1);
-
-				// EXPERIMENT END
-
-				barrier.wait(tid);
-
-				tally_stats(global, local);
-			}, i++);
-		}
-
-		waitfor(duration, barrier, done);
-
-		for(auto t : threads) {
-			t->join();
-			delete t;
-		}
-	}
-
-	print_stats(duration, nthread, global);
-}
-
-// ================================================================================================
-
-int main(int argc, char * argv[]) {
-
-	double duration   = 5.0;
-	unsigned nthreads = 2;
-
-	std::cout.imbue(std::locale(""));
-
-	for(;;) {
-		static struct option options[] = {
-			{"duration",  required_argument, 0, 'd'},
-			{"nthreads",  required_argument, 0, 't'},
-			{0, 0, 0, 0}
-		};
-
-		int idx = 0;
-		int opt = getopt_long(argc, argv, "d:t:", options, &idx);
-
-		std::string arg = optarg ? optarg : "";
-		size_t len = 0;
-		switch(opt) {
-			case -1:
-				if(optind != argc) {
-					std::cerr << "Too many arguments " << argc << " " << idx << std::endl;
-					goto usage;
-				}
-				goto run;
-			// Numeric Arguments
-			case 'd':
-				try {
-					duration = std::stod(optarg, &len);
-					if(len != arg.size()) { throw std::invalid_argument(""); }
-				} catch(std::invalid_argument &) {
-					std::cerr << "Duration must be a valid double, was " << arg << std::endl;
-					goto usage;
-				}
-				break;
-			case 't':
-				try {
-					nthreads = std::stoul(optarg, &len);
-					if(len != arg.size() || nthreads > (8 * sizeof(size_t))) { throw std::invalid_argument(""); }
-				} catch(std::invalid_argument &) {
-					std::cerr << "Number of threads must be a positive integer less than or equal to " << sizeof(size_t) * 8 << ", was " << arg << std::endl;
-					goto usage;
-				}
-				break;
-			// Other cases
-			default: /* ? */
-				std::cerr << opt << std::endl;
-			usage:
-				std::cerr << "Usage: " << argv[0] << ": [options]" << std::endl;
-				std::cerr << std::endl;
-				std::cerr << "  -d, --duration=DURATION  Duration of the experiment, in seconds" << std::endl;
-				std::cerr << "  -t, --nthreads=NTHREADS  Number of kernel threads" << std::endl;
-				std::exit(1);
-		}
-	}
-	run:
-
-	check_cache_line_size();
-
-	std::cout << "Running " << nthreads << " threads for " << duration << " seconds" << std::endl;
-	run(nthreads, duration);
-	return 0;
-}
Index: doc/theses/thierry_delisle_PhD/code/bts_test.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/bts_test.cpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,32 +1,0 @@
-#include <cassert>
-#include <iostream>
-
-bool bts(volatile size_t & target, size_t bit ) {
-	bool result = false;
-	asm volatile(
-		"LOCK btsq %[bit], %[target]\n\t"
-		:"=c" (result)
-		: [target] "m" (target), [bit] "r" (bit)
-	);
- 	return result;
-}
-
-bool btr(volatile size_t & target, size_t bit ) {
-	bool result = false;
-	asm volatile(
-		"LOCK btrq %[bit], %[target]\n\t"
-		:"=c" (result)
-		: [target] "m" (target), [bit] "r" (bit)
-	);
- 	return result;
-}
-
-int main() {
-	volatile size_t i = 0;
-	std::cout << std::hex << i << std::endl;
-	assert(bts(i, 31));
-	std::cout << std::hex << i << std::endl;
-	assert(btr(i, 31));
-	std::cout << std::hex << i << std::endl;
-	return 0;
-}
Index: doc/theses/thierry_delisle_PhD/code/links.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/links.hpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,122 +1,0 @@
-#pragma once
-
-#include "assert.hpp"
-#include "utils.hpp"
-
-template<typename node_t>
-struct _LinksFields_t {
-	node_t * prev = nullptr;
-	node_t * next = nullptr;
-	volatile unsigned long long ts = 0;
-	unsigned hint = (unsigned)-1;
-};
-
-template<typename node_t>
-class __attribute__((aligned(128))) intrusive_queue_t {
-public:
-	typedef spinlock_t lock_t;
-
-	struct stat {
-		ssize_t diff = 0;
-		size_t  push = 0;
-		size_t  pop  = 0;
-	};
-
-private:
-	struct sentinel_t {
-		_LinksFields_t<node_t> _links;
-	};
-
-public:
-	lock_t lock;
-
-private:
-	sentinel_t before;
-	sentinel_t after;
-
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Winvalid-offsetof"
-	static constexpr auto fields_offset = offsetof( node_t, _links );
-#pragma GCC diagnostic pop
-public:
-	intrusive_queue_t()
-		: before{{ nullptr, tail() }}
-		, after {{ head(), nullptr }}
-	{
-		/* paranoid */ assert((reinterpret_cast<uintptr_t>( head() ) + fields_offset) == reinterpret_cast<uintptr_t>(&before));
-		/* paranoid */ assert((reinterpret_cast<uintptr_t>( tail() ) + fields_offset) == reinterpret_cast<uintptr_t>(&after ));
-		/* paranoid */ assert(head()->_links.prev == nullptr);
-		/* paranoid */ assert(head()->_links.next == tail() );
-		/* paranoid */ assert(tail()->_links.next == nullptr);
-		/* paranoid */ assert(tail()->_links.prev == head() );
-		/* paranoid */ assert(sizeof(*this) == 128);
-		/* paranoid */ assert((intptr_t(this) % 128) == 0);
-	}
-
-	~intrusive_queue_t() = default;
-
-	inline node_t * head() const {
-		node_t * rhead = reinterpret_cast<node_t *>(
-			reinterpret_cast<uintptr_t>( &before ) - fields_offset
-		);
-		assert(rhead);
-		return rhead;
-	}
-
-	inline node_t * tail() const {
-		node_t * rtail = reinterpret_cast<node_t *>(
-			reinterpret_cast<uintptr_t>( &after ) - fields_offset
-		);
-		assert(rtail);
-		return rtail;
-	}
-
-	inline bool push(node_t * node) {
-		assert(lock);
-		assert(node->_links.ts != 0);
-		node_t * tail = this->tail();
-
-		node_t * prev = tail->_links.prev;
-		// assertf(node->_links.ts >= prev->_links.ts,
-		// 	"New node has smaller timestamp: %llu < %llu", node->_links.ts, prev->_links.ts);
-		node->_links.next = tail;
-		node->_links.prev = prev;
-		prev->_links.next = node;
-		tail->_links.prev = node;
-
-		if(before._links.ts == 0l) {
-			before._links.ts = node->_links.ts;
-			assert(node->_links.prev == this->head());
-			return true;
-		}
-		return false;
-	}
-
-	inline std::pair<node_t *, bool> pop() {
-		assert(lock);
-		node_t * head = this->head();
-		node_t * tail = this->tail();
-
-		node_t * node = head->_links.next;
-		node_t * next = node->_links.next;
-		if(node == tail) return {nullptr, false};
-
-		head->_links.next = next;
-		next->_links.prev = head;
-
-		if(next == tail) {
-			before._links.ts = 0l;
-			return {node, true};
-		}
-		else {
-			assert(next->_links.ts != 0);
-			before._links.ts = next->_links.ts;
-			assert(before._links.ts != 0);
-			return {node, false};
-		}
-	}
-
-	long long ts() const {
-		return before._links.ts;
-	}
-};
Index: doc/theses/thierry_delisle_PhD/code/prefetch.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/prefetch.cpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,106 +1,0 @@
-#include <algorithm>
-#include <array>
-#include <chrono>
-#include <iostream>
-#include <locale>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include <cassert>
-
-struct __attribute__((aligned(64))) element {
-	size_t value;
-};
-
-using block = std::array<element, 100>;
-
-block * create() {
-	block * b = new block();
-	for(auto & e : *b) {
-		e.value = rand();
-	}
-	b->back().value = b->size();
-
-	return b;
-}
-
-static inline size_t find(const block & b) {
-	size_t r = 0;
-	for(; r < b.size(); r++) {
-		if(__builtin_expect(b[r].value == b.size(), false)) break;
-	}
-
-	return r;
-}
-
-void usage(char * argv[]) {
-	std::cerr << argv[0] << ": [DURATION (FLOAT:SEC)] [NBLOCKS]" << std::endl;;
-	std::exit(1);
-}
-
-int main(int argc, char * argv[]) {
-	size_t nblocks = 1000;
-	double duration = 5;
-
-	std::cout.imbue(std::locale(""));
-
-	switch (argc)
-	{
-	case 3:
-		nblocks = std::stoul(argv[2]);
-		[[fallthrough]];
-	case 2:
-		duration = std::stod(argv[1]);
-		if( duration <= 0.0 ) {
-			std::cerr << "Duration must be positive, was " << argv[1] << "(" << duration << ")" << std::endl;
-			usage(argv);
-		}
-		[[fallthrough]];
-	case 1:
-		break;
-	default:
-		usage(argv);
-		break;
-	}
-
-	std::vector<std::unique_ptr<block>> blocks;
-	for(size_t i = 0; i < nblocks; i++) {
-		blocks.emplace_back( create() );
-	}
-	std::random_shuffle(blocks.begin(), blocks.end());
-
-	size_t CRC = 0;
-	size_t count = 0;
-
-	using clock = std::chrono::high_resolution_clock;
-	auto before = clock::now();
-
-	while(true) {
-		for(const auto & b : blocks) {
-			CRC += find(*b);
-			count++;
-		}
-		auto now = clock::now();
-		std::chrono::duration<double> durr = now - before;
-		if( durr.count() > duration ) {
-			break;
-		}
-	}
-
-	auto after = clock::now();
-	std::chrono::duration<double> durr = after - before;
-	duration = durr.count();
-
-	using std::chrono::duration_cast;
-	using std::chrono::nanoseconds;
-
-	size_t ops_sec = size_t(double(count) / duration);
-	auto dur_nano = duration_cast<nanoseconds>(std::chrono::duration<double>(1.0)).count();
-
-	std::cout << "CRC           : " << CRC << "\n";
-	std::cout << "Duration      : " << duration << "s\n";
-	std::cout << "Total ops     : " << count << "\n";
-	std::cout << "Ops/sec       : " << ops_sec << "\n";
-	std::cout << "ns/Op         : " << ( dur_nano / ops_sec )<< "\n";
-}
Index: doc/theses/thierry_delisle_PhD/code/process.sh
===================================================================
--- doc/theses/thierry_delisle_PhD/code/process.sh	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,42 +1,0 @@
-#!/bin/bash
-
-NAME=$1
-
-if [ ! -f "raw/${NAME}.out" ]; then
-    echo "Not output for ${NAME}"
-    exit 1
-fi
-
-if [ ! -f "raw/${NAME}.data" ]; then
-    echo "Not perf record for ${NAME}"
-    exit 1
-fi
-
-echo "Processing perf data for ${NAME}"
-
-OPS=$(grep -e 'Total ops' raw/${NAME}.out)
-CPOP=$( echo "Hello $OPS" | \grep -oP ", \K[0-9,]+(?=o)" --color | tr -d ',')
-CPUSH=$(echo "Hello $OPS" | \grep -oP "\(\K[0-9,]+(?=i)" --color | tr -d ',')
-
-REPORT=''
-perf report -n --percent-limit 5 --stdio --no-children -i raw/${NAME}.data > raw/.temp
-EVENT=$(cat raw/.temp | grep -e '^# Samples'| cut -d ' ' -f 6)
-SPOP=$( cat raw/.temp | grep -e '] relaxed_list<Node>::pop'  | tr -s ' ' | cut -d ' ' -f 3)
-SPUSH=$(cat raw/.temp | grep -e '] relaxed_list<Node>::push' | tr -s ' ' | cut -d ' ' -f 3)
-SARR=$( cat raw/.temp | grep -e '] snz[i|m]_t::node::arrive_h'   | tr -s ' ' | cut -d ' ' -f 3)
-
-echo "$OPS"
-echo "Push count: $CPUSH"
-echo "Pop  count: $CPOP"
-
-echo "Pop    samples: $SPOP"
-echo "Push   samples: $SPUSH"
-echo "Arrive samples: $SARR"
-
-SpPUSH=$(bc -l <<< "scale=9; $SPUSH / $CPUSH")
-SpPOP=$( bc -l <<< "scale=9; $SPOP  / $CPOP" )
-SpARR=$( bc -l <<< "scale=9; $SARR  / $CPUSH")
-
-printf "%s per push()  : %.9f\n" $EVENT $SpPUSH | sed ':a;s/\B[0-9]\{3\}\>/,&/;ta'
-printf "%s per pop()   : %.9f\n" $EVENT $SpPOP  | sed ':a;s/\B[0-9]\{3\}\>/,&/;ta'
-printf "%s per arrive(): %.9f\n" $EVENT $SpARR  | sed ':a;s/\B[0-9]\{3\}\>/,&/;ta'
Index: doc/theses/thierry_delisle_PhD/code/processor.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/processor.hpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,53 +1,0 @@
-#include <atomic>
-
-struct thread {};
-
-struct cluster {
-	void add();
-	void remove();
-	thread * next();
-};
-
-struct processor {
-
-	cluster cluster;
-	std::atomic<bool> stop;
-	volatile bool idle;
-};
-
-
-void run(thread * ) {
-	// verify preemption
-
-	// run Thread
-
-	// verify preemption
-
-	// finish Running
-}
-
-void main(processor & self) {
-
-	self.cluster.add();
-
-	while(!self.stop) {
-		if(thread * t = self.cluster.next()) {
-			run(t);
-			continue;
-		}
-
-		self.set_idle();
-		std::atomic_thread_fence();
-
-		if(thread * t = self.cluster.next()) {
-			self.idle = false;
-			run(t);
-			continue;
-		}
-
-		halt();
-	}
-
-	self.cluster.remove();
-
-}
Index: doc/theses/thierry_delisle_PhD/code/processor_list.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/processor_list.hpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,215 +1,0 @@
-#include <cassert>
-
-#include <atomic>
-#include <new>
-#include <type_traits>
-
-struct processor;
-
-struct __attribute__((aligned(64))) processor_id {
-	std::atomic<processor *> handle;
-	std::atomic<bool> lock;
-
-	processor_id() = default;
-	processor_id(processor * proc) : handle(proc), lock() {
-		/*paranoid*/ assert(std::atomic_is_lock_free(&lock));
-	}
-};
-
-extern unsigned num();
-
-#define ERROR throw 1
-
-class processor_list {
-private:
-
-	static const constexpr std::size_t cache_line_size = 64;
-
-	static_assert(sizeof (processor_id) <= cache_line_size, "ERROR: Instances must fit in one cache line" );
-	static_assert(alignof(processor_id) == cache_line_size, "ERROR: Instances must aligned to one cache line" );
-
-	const unsigned max;     // total cachelines allocated
-	std::atomic_uint alloc; // cachelines currently in use
-	std::atomic_uint ready; // cachelines ready to iterate over (!= to alloc when thread is in second half of doregister)
-	std::atomic<bool> lock; // writerlock
-	processor_id * data;    // data pointer
-
-private:
-	inline void acquire(std::atomic<bool> & ll) {
-		while( __builtin_expect(ll.exchange(true),false) ) {
-			while(ll.load(std::memory_order_relaxed))
-				asm volatile("pause");
-		}
-		/* paranoid */ assert(ll);
-	}
-
-public:
-	processor_list()
-		: max(num())
-		, alloc(0)
-		, ready(0)
-		, lock{false}
-		, data( new processor_id[max] )
-	{
-		/*paranoid*/ assert(num() == max);
-		/*paranoid*/ assert(std::atomic_is_lock_free(&alloc));
-		/*paranoid*/ assert(std::atomic_is_lock_free(&ready));
-	}
-
-	~processor_list() {
-		delete[] data;
-	}
-
-	//=======================================================================
-	// Lock-Free registering/unregistering of threads
-	unsigned doregister(processor * proc) {
-		// Step - 1 : check if there is already space in the data
-		uint_fast32_t s = ready;
-
-		// Check among all the ready
-		for(uint_fast32_t i = 0; i < s; i++) {
-			processor * null = nullptr; // Re-write every loop since compare thrashes it
-			if( data[i].handle.load(std::memory_order_relaxed) == null
-			 && data[i].handle.compare_exchange_strong(null, proc)) {
-				/*paranoid*/ assert(i < ready);
-				/*paranoid*/ assert(alignof(decltype(data[i])) == cache_line_size);
-				/*paranoid*/ assert((uintptr_t(&data[i]) % cache_line_size) == 0);
-				return i;
-			}
-		}
-
-		if(max <= alloc) ERROR;
-
-		// Step - 2 : F&A to get a new spot in the array.
-		uint_fast32_t n = alloc++;
-		if(max <= n) ERROR;
-
-		// Step - 3 : Mark space as used and then publish it.
-		void * storage = &data[n];
-		new (storage) processor_id( proc );
-		while(true) {
-			unsigned copy = n;
-			if( ready.load(std::memory_order_relaxed) == n
-			 && ready.compare_exchange_weak(copy, n + 1) )
-			 	break;
-			asm volatile("pause");
-		}
-
-		// Return new spot.
-		/*paranoid*/ assert(n < ready);
-		/*paranoid*/ assert(alignof(decltype(data[n])) == cache_line_size);
-		/*paranoid*/ assert((uintptr_t(&data[n]) % cache_line_size) == 0);
-		return n;
-	}
-
-	processor * unregister(unsigned iproc) {
-		/*paranoid*/ assert(iproc < ready);
-		auto ret = data[iproc].handle.load(std::memory_order_relaxed);
-		data[iproc].handle = nullptr;
-		return ret;
-	}
-
-	// Reset all registration
-	// Unsafe in most cases, use for testing only.
-	void reset() {
-		alloc = 0;
-		ready = 0;
-	}
-
-	processor * get(unsigned iproc) {
-		return data[iproc].handle.load(std::memory_order_relaxed);
-	}
-
-	//=======================================================================
-	// Reader-writer lock implementation
-	// Concurrent with doregister/unregister,
-	//    i.e., threads can be added at any point during or between the entry/exit
-
-	//-----------------------------------------------------------------------
-	// Reader side
-	void read_lock(unsigned iproc) {
-		/*paranoid*/ assert(iproc < ready);
-
-		// Step 1 : make sure no writer are in the middle of the critical section
-		while(lock.load(std::memory_order_relaxed))
-			asm volatile("pause");
-
-		// Fence needed because we don't want to start trying to acquire the lock
-		// before we read a false.
-		// Not needed on x86
-		// std::atomic_thread_fence(std::memory_order_seq_cst);
-
-		// Step 2 : acquire our local lock
-		acquire( data[iproc].lock );
-		/*paranoid*/ assert(data[iproc].lock);
-	}
-
-	void read_unlock(unsigned iproc) {
-		/*paranoid*/ assert(iproc < ready);
-		/*paranoid*/ assert(data[iproc].lock);
-		data[iproc].lock.store(false, std::memory_order_release);
-	}
-
-	//-----------------------------------------------------------------------
-	// Writer side
-	uint_fast32_t write_lock() {
-		// Step 1 : lock global lock
-		// It is needed to avoid processors that register mid Critical-Section
-		//   to simply lock their own lock and enter.
-		acquire(lock);
-
-		// Step 2 : lock per-proc lock
-		// Processors that are currently being registered aren't counted
-		//   but can't be in read_lock or in the critical section.
-		// All other processors are counted
-		uint_fast32_t s = ready;
-		for(uint_fast32_t i = 0; i < s; i++) {
-			acquire( data[i].lock );
-		}
-
-		return s;
-	}
-
-	void write_unlock(uint_fast32_t last_s) {
-		// Step 1 : release local locks
-		// This must be done while the global lock is held to avoid
-		//   threads that where created mid critical section
-		//   to race to lock their local locks and have the writer
-		//   immidiately unlock them
-		// Alternative solution : return s in write_lock and pass it to write_unlock
-		for(uint_fast32_t i = 0; i < last_s; i++) {
-			assert(data[i].lock);
-			data[i].lock.store(false, std::memory_order_release);
-		}
-
-		// Step 2 : release global lock
-		/*paranoid*/ assert(true == lock);
-		lock.store(false, std::memory_order_release);
-	}
-
-	//-----------------------------------------------------------------------
-	// Checking support
-	uint_fast32_t epoch_check() {
-		// Step 1 : lock global lock
-		// It is needed to avoid processors that register mid Critical-Section
-		//   to simply lock their own lock and enter.
-		while(lock.load(std::memory_order_relaxed))
-			asm volatile("pause");
-
-		// Step 2 : lock per-proc lock
-		// Processors that are currently being registered aren't counted
-		//   but can't be in read_lock or in the critical section.
-		// All other processors are counted
-		uint_fast32_t s = ready;
-		for(uint_fast32_t i = 0; i < s; i++) {
-			while(data[i].lock.load(std::memory_order_relaxed))
-				asm volatile("pause");
-		}
-
-		return s;
-	}
-
-public:
-};
-
-#undef ERROR
Index: doc/theses/thierry_delisle_PhD/code/processor_list_fast.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/processor_list_fast.cpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,173 +1,0 @@
-#include "processor_list.hpp"
-
-#include <array>
-#include <iomanip>
-#include <iostream>
-#include <locale>
-#include <string>
-#include <thread>
-
-#include "utils.hpp"
-
-unsigned num() {
-	return 0x1000000;
-}
-
-//-------------------
-
-struct processor {
-	unsigned id;
-};
-void run(unsigned nthread, double duration, unsigned writes, unsigned epochs) {
-	assert(writes < 100);
-
-	// List being tested
-	processor_list list = {};
-
-	// Barrier for synchronization
-	barrier_t barrier(nthread + 1);
-
-	// Data to check everything is OK
-	size_t write_committed = 0ul;
-	struct {
-		std::atomic_size_t write = { 0ul };
-		std::atomic_size_t read  = { 0ul };
-		std::atomic_size_t epoch = { 0ul };
-	} lock_cnt;
-
-	// Flag to signal termination
-	std::atomic_bool done = { false };
-
-	std::thread * threads[nthread];
-	unsigned i = 1;
-	for(auto & t : threads) {
-		t = new std::thread([&done, &list, &barrier, &write_committed, &lock_cnt, writes, epochs](unsigned tid) {
-			Random rand(tid + rdtscl());
-			processor proc;
-			proc.id = list.doregister(&proc);
-			size_t writes_cnt = 0;
-			size_t reads_cnt = 0;
-			size_t epoch_cnt = 0;
-
-			affinity(tid);
-
-			barrier.wait(tid);
-
-			while(__builtin_expect(!done, true)) {
-				auto r = rand.next() % 100;
-				if (r < writes) {
-					auto n = list.write_lock();
-					write_committed++;
-					writes_cnt++;
-					assert(writes_cnt < -2ul);
-					list.write_unlock(n);
-				}
-				else if(r < epochs) {
-					list.epoch_check();
-					epoch_cnt++;
-				}
-				else {
-					list.read_lock(proc.id);
-					reads_cnt++;
-					assert(reads_cnt < -2ul);
-					list.read_unlock(proc.id);
-				}
-			}
-
-			barrier.wait(tid);
-
-			auto p = list.unregister(proc.id);
-			assert(&proc == p);
-			lock_cnt.write += writes_cnt;
-			lock_cnt.read  += reads_cnt;
-			lock_cnt.epoch += epoch_cnt;
-		}, i++);
-	}
-
-	auto before = Clock::now();
-	barrier.wait(0);
-
-	while(true) {
-		usleep(1000);
-		auto now = Clock::now();
-		duration_t durr = now - before;
-		if( durr.count() > duration ) {
-			done = true;
-			break;
-		}
-	}
-
-	barrier.wait(0);
-	auto after = Clock::now();
-	duration_t durr = after - before;
-	duration = durr.count();
-
-	for(auto t : threads) {
-		t->join();
-		delete t;
-	}
-
-	assert(write_committed == lock_cnt.write);
-
-	size_t totalop = lock_cnt.read + lock_cnt.write + lock_cnt.epoch;
-	size_t ops_sec = size_t(double(totalop) / duration);
-	size_t ops_thread = ops_sec / nthread;
-	double dur_nano = duration_cast<std::nano>(1.0);
-
-	std::cout << "Duration      : " << duration << "s\n";
-	std::cout << "Total ops     : " << totalop << "(" << lock_cnt.read << "r, " << lock_cnt.write << "w, " << lock_cnt.epoch << "e)\n";
-	std::cout << "Ops/sec       : " << ops_sec << "\n";
-	std::cout << "Ops/sec/thread: " << ops_thread << "\n";
-	std::cout << "ns/Op         : " << ( dur_nano / ops_thread )<< "\n";
-}
-
-void usage(char * argv[]) {
-	std::cerr << argv[0] << ": [DURATION (FLOAT:SEC)] [NTHREADS] [%WRITES]" << std::endl;;
-	std::exit(1);
-}
-
-int main(int argc, char * argv[]) {
-
-	double duration   = 5.0;
-	unsigned nthreads = 2;
-	unsigned writes   = 0;
-	unsigned epochs   = 0;
-
-	std::cout.imbue(std::locale(""));
-
-	switch (argc)
-	{
-	case 5:
-		epochs = std::stoul(argv[4]);
-		[[fallthrough]];
-	case 4:
-		writes = std::stoul(argv[3]);
-		if( (writes + epochs) > 100 ) {
-			std::cerr << "Writes + Epochs must be valid percentage, was " << argv[3] << " + " << argv[4] << "(" << writes << " + " << epochs << ")" << std::endl;
-			usage(argv);
-		}
-		[[fallthrough]];
-	case 3:
-		nthreads = std::stoul(argv[2]);
-		[[fallthrough]];
-	case 2:
-		duration = std::stod(argv[1]);
-		if( duration <= 0.0 ) {
-			std::cerr << "Duration must be positive, was " << argv[1] << "(" << duration << ")" << std::endl;
-			usage(argv);
-		}
-		[[fallthrough]];
-	case 1:
-		break;
-	default:
-		usage(argv);
-		break;
-	}
-
-	check_cache_line_size();
-
-	std::cout << "Running " << nthreads << " threads for " << duration << " seconds with " << writes << "% writes and " << epochs << "% epochs" << std::endl;
-	run(nthreads, duration, writes, epochs + writes);
-
-	return 0;
-}
Index: doc/theses/thierry_delisle_PhD/code/processor_list_good.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/processor_list_good.cpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,269 +1,0 @@
-#include "processor_list.hpp"
-
-#include <iostream>
-#include <string>
-#include <thread>
-
-unsigned num() {
-	return 0x1000000;
-}
-
-// Barrier from
-class barrier_t {
-public:
-	barrier_t(size_t total)
-		: waiting(0)
-		, total(total)
-	{}
-
-	void wait(unsigned) {
-		size_t target = waiting++;
-		target = (target - (target % total)) + total;
-		while(waiting < target)
-			asm volatile("pause");
-
-		assert(waiting < (1ul << 60));
-    	}
-
-private:
-	std::atomic<size_t> waiting;
-	size_t total;
-};
-
-class Random {
-private:
-	unsigned int seed;
-public:
-	Random(int seed) {
-		this->seed = seed;
-	}
-
-	/** returns pseudorandom x satisfying 0 <= x < n. **/
-	unsigned int next() {
-		seed ^= seed << 6;
-		seed ^= seed >> 21;
-		seed ^= seed << 7;
-		return seed;
-    	}
-};
-
-//-------------------
-
-struct processor {
-	unsigned id;
-};
-
-// Stage 1
-// Make sure that the early registration works correctly
-// Registration uses a different process if the act of
-// registering the processor makes it the highest processor count
-// seen yet.
-void stage1(unsigned nthread, unsigned repeats) {
-	const int n = repeats;
-	const int nproc = 10;
-
-	// List being tested
-	processor_list list;
-
-	// Barrier for synchronization
-	barrier_t barrier(nthread + 1);
-
-	// Seen values to detect duplicattion
-	std::atomic<processor *> ids[nthread * nproc];
-	for(auto & i : ids) {
-		i = nullptr;
-	}
-
-	// Can't pass VLA to lambda
-	std::atomic<processor *> * idsp = ids;
-
-	// Threads which will run the code
-	std::thread * threads[nthread];
-	unsigned i = 1;
-	for(auto & t : threads) {
-		// Each thread will try to register a processor then add it to the
-		// list of registerd processor
-		t = new std::thread([&list, &barrier, idsp, n](unsigned tid){
-			processor proc[nproc];
-			for(int i = 0; i < n; i++) {
-				for(auto & p : proc) {
-					// Register the thread
-					p.id = list.doregister(&p);
-				}
-
-				for(auto & p : proc) {
-					// Make sure no one got this id before
-					processor * prev = idsp[p.id].exchange(&p);
-					assert(nullptr == prev);
-
-					// Make sure id is still consistend
-					assert(&p == list.get(p.id));
-				}
-
-				// wait for round to finish
-				barrier.wait(tid);
-
-				// wait for reset
-				barrier.wait(tid);
-			}
-		}, i++);
-	}
-
-	for(int i = 0; i < n; i++) {
-		//Wait for round to finish
-		barrier.wait(0);
-
-		// Reset list
-		list.reset();
-
-		std::cout << i << "\r";
-
-		// Reset seen values
-		for(auto & i : ids) {
-			i = nullptr;
-		}
-
-		// Start next round
-		barrier.wait(0);
-	}
-
-	for(auto t : threads) {
-		t->join();
-		delete t;
-	}
-}
-
-// Stage 2
-// Check that once churning starts, registration is still consistent.
-void stage2(unsigned nthread, unsigned repeats) {
-	// List being tested
-	processor_list list;
-
-	// Threads which will run the code
-	std::thread * threads[nthread];
-	unsigned i = 1;
-	for(auto & t : threads) {
-		// Each thread will try to register a few processors and
-		// unregister them, making sure that the registration is
-		// consistent
-		t = new std::thread([&list, repeats](unsigned tid){
-			processor procs[10];
-			for(unsigned i = 0; i < repeats; i++) {
-				// register the procs and note the id
-				for(auto & p : procs) {
-					p.id = list.doregister(&p);
-				}
-
-				if(1 == tid) std::cout << i << "\r";
-
-				// check the id is still consistent
-				for(const auto & p : procs) {
-					assert(&p == list.get(p.id));
-				}
-
-				// unregister and check the id is consistent
-				for(const auto & p : procs) {
-					assert(&p == list.unregister(p.id));
-				}
-			}
-		}, i++);
-	}
-
-	for(auto t : threads) {
-		t->join();
-		delete t;
-	}
-}
-
-bool is_writer();
-
-// Stage 3
-// Check that the reader writer lock works.
-void stage3(unsigned nthread, unsigned repeats) {
-	// List being tested
-	processor_list list;
-
-	size_t before = 0;
-
-	std::unique_ptr<size_t> after( new size_t(0) );
-
-	std::atomic<bool> done ( false );
-
-	// Threads which will run the code
-	std::thread * threads[nthread];
-	unsigned i = 1;
-	for(auto & t : threads) {
-		// Each thread will try to register a few processors and
-		// unregister them, making sure that the registration is
-		// consistent
-		t = new std::thread([&list, repeats, &before, &after, &done](unsigned tid){
-			Random rng(tid);
-			processor proc;
-			proc.id = list.doregister(&proc);
-			while(!done) {
-
-				if( (rng.next() % 100) == 0 ) {
-					auto r = list.write_lock();
-
-					auto b = before++;
-
-					std::cout << b << "\r";
-
-					(*after)++;
-
-					if(b >= repeats) done = true;
-
-					list.write_unlock(r);
-				}
-				else {
-					list.read_lock(proc.id);
-					assert(before == *after);
-					list.read_unlock(proc.id);
-				}
-
-			}
-
-			list.unregister(proc.id);
-		}, i++);
-	}
-
-	for(auto t : threads) {
-		t->join();
-		delete t;
-	}
-}
-
-int main(int argc, char * argv[]) {
-
-	unsigned nthreads = 1;
-	if( argc >= 3 ) {
-		size_t idx;
-		nthreads = std::stoul(argv[2], &idx);
-		assert('\0' == argv[2][idx]);
-	}
-
-	unsigned repeats = 100;
-	if( argc >= 2 ) {
-		size_t idx;
-		repeats = std::stoul(argv[1], &idx);
-		assert('\0' == argv[1][idx]);
-	}
-
-	processor_list::check_cache_line_size();
-
-	std::cout << "Running " << repeats << " repetitions on " << nthreads << " threads" << std::endl;
-	std::cout << "Checking registration - early" << std::endl;
-	stage1(nthreads, repeats);
-	std::cout << "Done                         " << std::endl;
-
-	std::cout << "Checking registration - churn" << std::endl;
-	stage2(nthreads, repeats);
-	std::cout << "Done                         " << std::endl;
-
-	std::cout << "Checking RW lock             " << std::endl;
-	stage3(nthreads, repeats);
-	std::cout << "Done                         " << std::endl;
-
-
-	return 0;
-}
Index: doc/theses/thierry_delisle_PhD/code/randbit.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/randbit.cpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,236 +1,0 @@
-#include <cstddef>
-#include <cstdint>
-#include <x86intrin.h>
-
-__attribute__((noinline)) unsigned nthSetBit(size_t mask, unsigned bit) {
-	uint64_t v = mask;   // Input value to find position with rank r.
-	unsigned int r = bit;// Input: bit's desired rank [1-64].
-	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
-	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
-	unsigned int t;      // Bit count temporary.
-
-	// Do a normal parallel bit count for a 64-bit integer,
-	// but store all intermediate steps.
-	// a = (v & 0x5555...) + ((v >> 1) & 0x5555...);
-	a =  v - ((v >> 1) & ~0UL/3);
-	// b = (a & 0x3333...) + ((a >> 2) & 0x3333...);
-	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
-	// c = (b & 0x0f0f...) + ((b >> 4) & 0x0f0f...);
-	c = (b + (b >> 4)) & ~0UL/0x11;
-	// d = (c & 0x00ff...) + ((c >> 8) & 0x00ff...);
-	d = (c + (c >> 8)) & ~0UL/0x101;
-
-
-	t = (d >> 32) + (d >> 48);
-	// Now do branchless select!
-	s  = 64;
-	// if (r > t) {s -= 32; r -= t;}
-	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
-	t  = (d >> (s - 16)) & 0xff;
-	// if (r > t) {s -= 16; r -= t;}
-	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
-	t  = (c >> (s - 8)) & 0xf;
-	// if (r > t) {s -= 8; r -= t;}
-	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
-	t  = (b >> (s - 4)) & 0x7;
-	// if (r > t) {s -= 4; r -= t;}
-	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
-	t  = (a >> (s - 2)) & 0x3;
-	// if (r > t) {s -= 2; r -= t;}
-	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
-	t  = (v >> (s - 1)) & 0x1;
-	// if (r > t) s--;
-	s -= ((t - r) & 256) >> 8;
-	// s = 65 - s;
-	return s;
-}
-
-unsigned rand_bit(unsigned rnum, uint64_t mask) {
-	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
-#if defined(BRANCHLESS)
-	uint64_t v = mask;   // Input value to find position with rank r.
-	unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
-	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
-	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
-	unsigned int t;      // Bit count temporary.
-
-	// Do a normal parallel bit count for a 64-bit integer,
-	// but store all intermediate steps.
-	// a = (v & 0x5555...) + ((v >> 1) & 0x5555...);
-	a =  v - ((v >> 1) & ~0UL/3);
-	// b = (a & 0x3333...) + ((a >> 2) & 0x3333...);
-	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
-	// c = (b & 0x0f0f...) + ((b >> 4) & 0x0f0f...);
-	c = (b + (b >> 4)) & ~0UL/0x11;
-	// d = (c & 0x00ff...) + ((c >> 8) & 0x00ff...);
-	d = (c + (c >> 8)) & ~0UL/0x101;
-
-
-	t = (d >> 32) + (d >> 48);
-	// Now do branchless select!
-	s  = 64;
-	// if (r > t) {s -= 32; r -= t;}
-	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
-	t  = (d >> (s - 16)) & 0xff;
-	// if (r > t) {s -= 16; r -= t;}
-	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
-	t  = (c >> (s - 8)) & 0xf;
-	// if (r > t) {s -= 8; r -= t;}
-	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
-	t  = (b >> (s - 4)) & 0x7;
-	// if (r > t) {s -= 4; r -= t;}
-	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
-	t  = (a >> (s - 2)) & 0x3;
-	// if (r > t) {s -= 2; r -= t;}
-	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
-	t  = (v >> (s - 1)) & 0x1;
-	// if (r > t) s--;
-	s -= ((t - r) & 256) >> 8;
-	// s = 65 - s;
-	return s - 1;
-#elif defined(LOOP)
-	for(unsigned i = 0; i < bit; i++) {
-		mask ^= (1ul << (__builtin_ffsl(mask) - 1ul));
-	}
-	return __builtin_ffsl(mask) - 1ul;
-#elif defined(PDEP)
-	uint64_t picked = _pdep_u64(1ul << bit, mask);
-	return __builtin_ffsl(picked) - 1ul;
-#else
-#error must define LOOP, PDEP or BRANCHLESS
-#endif
-}
-
-#include <cassert>
-#include <atomic>
-#include <chrono>
-#include <iomanip>
-#include <iostream>
-#include <locale>
-#include <thread>
-
-#include <unistd.h>
-
-class barrier_t {
-public:
-	barrier_t(size_t total)
-		: waiting(0)
-		, total(total)
-	{}
-
-	void wait(unsigned) {
-		size_t target = waiting++;
-		target = (target - (target % total)) + total;
-		while(waiting < target)
-			asm volatile("pause");
-
-		assert(waiting < (1ul << 60));
-    	}
-
-private:
-	std::atomic<size_t> waiting;
-	size_t total;
-};
-
-class Random {
-private:
-	unsigned int seed;
-public:
-	Random(int seed) {
-		this->seed = seed;
-	}
-
-	/** returns pseudorandom x satisfying 0 <= x < n. **/
-	unsigned int next() {
-		seed ^= seed << 6;
-		seed ^= seed >> 21;
-		seed ^= seed << 7;
-		return seed;
-    	}
-};
-
-using Clock = std::chrono::high_resolution_clock;
-using duration_t = std::chrono::duration<double>;
-using std::chrono::nanoseconds;
-
-template<typename Ratio, typename T>
-T duration_cast(T seconds) {
-	return std::chrono::duration_cast<std::chrono::duration<T, Ratio>>(std::chrono::duration<T>(seconds)).count();
-}
-
-void waitfor(double & duration, barrier_t & barrier, std::atomic_bool & done) {
-
-
-	std::cout << "Starting" << std::endl;
-	auto before = Clock::now();
-	barrier.wait(0);
-
-	while(true) {
-		usleep(100000);
-		auto now = Clock::now();
-		duration_t durr = now - before;
-		if( durr.count() > duration ) {
-			done = true;
-			break;
-		}
-		std::cout << "\r" << std::setprecision(4) << durr.count();
-		std::cout.flush();
-	}
-
-	barrier.wait(0);
-	auto after = Clock::now();
-	duration_t durr = after - before;
-	duration = durr.count();
-	std::cout << "\rClosing down" << std::endl;
-}
-
-__attribute__((noinline)) void body(Random & rand) {
-	uint64_t mask = (uint64_t(rand.next()) << 32ul) | uint64_t(rand.next());
-	unsigned idx = rand.next();
-
-	unsigned bit = rand_bit(idx, mask);
-
-	if(__builtin_expect(((1ul << bit) & mask) == 0, false)) {
-		std::cerr << std::hex <<  "Rand " << idx << " from " << mask;
-		std::cerr << " gave " << (1ul << bit) << "(" << std::dec << bit << ")" << std::endl;
-		std::abort();
-	}
-}
-
-void runRandBit(double duration) {
-
-	std::atomic_bool done  = { false };
-	barrier_t barrier(2);
-
-	size_t count = 0;
-	std::thread thread([&done, &barrier, &count]() {
-
-		Random rand(22);
-
-		barrier.wait(1);
-
-		for(;!done; count++) {
-			body(rand);
-		}
-
-		barrier.wait(1);
-	});
-
-	waitfor(duration, barrier, done);
-	thread.join();
-
-	size_t ops = count;
-	size_t ops_sec = size_t(double(ops) / duration);
-	auto dur_nano = duration_cast<std::nano>(1.0);
-
-	std::cout << "Duration      : " << duration << "s\n";
-	std::cout << "ns/Op         : " << ( dur_nano / ops )<< "\n";
-	std::cout << "Ops/sec       : " << ops_sec << "\n";
-	std::cout << "Total ops     : " << ops << std::endl;
-
-}
-
-int main() {
-	std::cout.imbue(std::locale(""));
-	runRandBit(5);
-}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/Makefile
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/Makefile	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/Makefile	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,22 @@
+
+
+CXXFLAGS = -O3 -g -Wall -Wextra -std=c++17
+LDFLAGS = -pthread -latomic
+
+push:
+	clang++ relaxed_list.cpp -g -Wall -Wextra -std=c++17 -fsyntax-only &&  rsync -av relaxed_list.cpp relaxed_list.hpp utils.hpp assert.hpp scale.sh plg7b:~/workspace/sched/.
+
+relaxed_list: $(firstword $(MAKEFILE_LIST)) | build
+	clang++ relaxed_list.cpp $(CXXFLAGS) $(LDFLAGS) -lpng -MMD -MF build/$(@).d -o $(@)
+
+-include build/relaxed_list.d
+
+layout.ast: $(firstword $(MAKEFILE_LIST)) | build
+	clang++ relaxed_list_layout.cpp $(CXXFLAGS) -MMD -MF build/$(@).d -MT $(@) -E -o build/$(@).ii
+	clang++ -Xclang -fdump-record-layouts -fsyntax-only $(CXXFLAGS) build/$(@).ii > build/layout.ast.raw
+	cat build/$(@).raw > $(@)
+
+-include build/layout.ast.d
+
+build:
+	mkdir -p build
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/assert.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/assert.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/assert.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,22 @@
+#pragma once
+
+#ifndef NDEBUG
+#include <cassert>
+#include <cstdlib>
+
+#define sstr(s) #s
+#define xstr(s) sstr(s)
+
+extern const char * __my_progname;
+
+#define assertf(cond, ...) ({             \
+	if(!(cond)) {                       \
+		fprintf(stderr, "%s: " __FILE__ ":" xstr(__LINE__) ": %s: Assertion '" xstr(cond) "' failed.\n", __my_progname, __PRETTY_FUNCTION__); \
+		fprintf(stderr, __VA_ARGS__); \
+		fprintf(stderr, "\n"); \
+		std::abort();                 \
+	}                                   \
+})
+#else
+#define assertf(cond, ...)
+#endif
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/bitbench/select.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/bitbench/select.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/bitbench/select.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,186 @@
+
+#include "../utils.hpp"
+
+void consume(int i, int j) __attribute__((noinline));
+void consume(int i, int j) {
+	asm volatile("":: "rm" (i), "rm" (j) );
+}
+
+static inline unsigned rand_bit_sw(unsigned rnum, size_t mask) {
+	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
+	uint64_t v = mask;   // Input value to find position with rank r.
+	unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
+	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
+	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
+	unsigned int t;      // Bit count temporary.
+
+	// Do a normal parallel bit count for a 64-bit integer,
+	// but store all intermediate steps.
+	a =  v - ((v >> 1) & ~0UL/3);
+	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
+	c = (b + (b >> 4)) & ~0UL/0x11;
+	d = (c + (c >> 8)) & ~0UL/0x101;
+
+
+	t = (d >> 32) + (d >> 48);
+	// Now do branchless select!
+	s  = 64;
+	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
+	t  = (d >> (s - 16)) & 0xff;
+	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
+	t  = (c >> (s - 8)) & 0xf;
+	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
+	t  = (b >> (s - 4)) & 0x7;
+	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
+	t  = (a >> (s - 2)) & 0x3;
+	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
+	t  = (v >> (s - 1)) & 0x1;
+	s -= ((t - r) & 256) >> 8;
+	return s - 1;
+}
+
+static inline unsigned rand_bit_hw(unsigned rnum, size_t mask) {
+	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
+	uint64_t picked = _pdep_u64(1ul << bit, mask);
+	return picked ? __builtin_ctzl(picked) : 0;
+}
+
+struct TLS {
+	Random rng = { 6 };
+} tls;
+
+const unsigned numLists = 64;
+
+static inline void blind() {
+	int i = tls.rng.next() % numLists;
+	int j = tls.rng.next() % numLists;
+
+	consume(i, j);
+}
+
+std::atomic_size_t list_mask[7];
+static inline void bitmask_sw() {
+	unsigned i, j;
+	{
+		// Pick two lists at random
+		unsigned num = ((numLists - 1) >> 6) + 1;
+
+		unsigned ri = tls.rng.next();
+		unsigned rj = tls.rng.next();
+
+		unsigned wdxi = (ri >> 6u) % num;
+		unsigned wdxj = (rj >> 6u) % num;
+
+		size_t maski = list_mask[wdxi].load(std::memory_order_relaxed);
+		size_t maskj = list_mask[wdxj].load(std::memory_order_relaxed);
+
+		unsigned bi = rand_bit_sw(ri, maski);
+		unsigned bj = rand_bit_sw(rj, maskj);
+
+		i = bi | (wdxi << 6);
+		j = bj | (wdxj << 6);
+	}
+
+	consume(i, j);
+}
+
+static inline void bitmask_hw() {
+	#if !defined(__BMI2__)
+		#warning NO bmi2 for pdep rand_bit
+		return;
+	#endif
+	unsigned i, j;
+	{
+		// Pick two lists at random
+		unsigned num = ((numLists - 1) >> 6) + 1;
+
+		unsigned ri = tls.rng.next();
+		unsigned rj = tls.rng.next();
+
+		unsigned wdxi = (ri >> 6u) % num;
+		unsigned wdxj = (rj >> 6u) % num;
+
+		size_t maski = list_mask[wdxi].load(std::memory_order_relaxed);
+		size_t maskj = list_mask[wdxj].load(std::memory_order_relaxed);
+
+		unsigned bi = rand_bit_hw(ri, maski);
+		unsigned bj = rand_bit_hw(rj, maskj);
+
+		i = bi | (wdxi << 6);
+		j = bj | (wdxj << 6);
+	}
+
+	consume(i, j);
+}
+
+struct {
+	const unsigned mask = 7;
+	const unsigned depth = 3;
+	const uint64_t indexes = 0x0706050403020100;
+	uint64_t masks( unsigned node ) {
+		return 0xff00ffff00ff;
+	}
+} snzm;
+static inline void sparsemask() {
+	#if !defined(__BMI2__)
+		#warning NO bmi2 for sparse mask
+		return;
+	#endif
+	unsigned i, j;
+	{
+		// Pick two random numbers
+		unsigned ri = tls.rng.next();
+		unsigned rj = tls.rng.next();
+
+		// Pick two nodes from it
+		unsigned wdxi = ri & snzm.mask;
+		unsigned wdxj = rj & snzm.mask;
+
+		// Get the masks from the nodes
+		size_t maski = snzm.masks(wdxi);
+		size_t maskj = snzm.masks(wdxj);
+
+		uint64_t idxsi = _pext_u64(snzm.indexes, maski);
+		uint64_t idxsj = _pext_u64(snzm.indexes, maskj);
+
+		auto pi = __builtin_popcountll(maski);
+		auto pj = __builtin_popcountll(maskj);
+
+		ri = pi ? ri & ((pi >> 3) - 1) : 0;
+		rj = pj ? rj & ((pj >> 3) - 1) : 0;
+
+		unsigned bi = (idxsi >> (ri << 3)) & 0xff;
+		unsigned bj = (idxsj >> (rj << 3)) & 0xff;
+
+		i = (bi << snzm.depth) | wdxi;
+		j = (bj << snzm.depth) | wdxj;
+	}
+
+	consume(i, j);
+}
+
+template<typename T>
+void benchmark( T func, const std::string & name ) {
+	std::cout << "Starting " << name << std::endl;
+	auto before = Clock::now();
+	const int N = 250'000'000;
+	for(int i = 0; i < N; i++) {
+		func();
+	}
+	auto after = Clock::now();
+	duration_t durr = after - before;
+	double duration = durr.count();
+	std::cout << "Duration(s) : " << duration << std::endl;
+	std::cout << "Ops/sec     : " << uint64_t(N / duration) << std::endl;
+	std::cout << "ns/Op       : " << double(duration * 1'000'000'000.0 / N) << std::endl;
+	std::cout << std::endl;
+}
+
+int main() {
+	std::cout.imbue(std::locale(""));
+
+	benchmark(blind, "Blind guess");
+	benchmark(bitmask_sw, "Dense bitmask");
+	benchmark(bitmask_hw, "Dense bitmask with Parallel Deposit");
+	benchmark(sparsemask, "Parallel Extract bitmask");
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/bts.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/bts.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/bts.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,279 @@
+#include <array>
+#include <iomanip>
+#include <iostream>
+#include <locale>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include <getopt.h>
+#include <unistd.h>
+#include <sys/sysinfo.h>
+
+#include "utils.hpp"
+
+// ================================================================================================
+//                        UTILS
+// ================================================================================================
+
+struct local_stat_t {
+	size_t cnt = 0;
+};
+
+struct global_stat_t {
+	std::atomic_size_t cnt = { 0 };
+};
+
+void atomic_max(std::atomic_size_t & target, size_t value) {
+	for(;;) {
+		size_t expect = target.load(std::memory_order_relaxed);
+		if(value <= expect) return;
+		bool success = target.compare_exchange_strong(expect, value);
+		if(success) return;
+	}
+}
+
+void atomic_min(std::atomic_size_t & target, size_t value) {
+	for(;;) {
+		size_t expect = target.load(std::memory_order_relaxed);
+		if(value >= expect) return;
+		bool success = target.compare_exchange_strong(expect, value);
+		if(success) return;
+	}
+}
+
+void tally_stats(global_stat_t & global, local_stat_t & local) {
+	global.cnt   += local.cnt;
+}
+
+void waitfor(double & duration, barrier_t & barrier, std::atomic_bool & done) {
+	std::cout << "Starting" << std::endl;
+	auto before = Clock::now();
+	barrier.wait(0);
+
+	while(true) {
+		usleep(100000);
+		auto now = Clock::now();
+		duration_t durr = now - before;
+		if( durr.count() > duration ) {
+			done = true;
+			break;
+		}
+		std::cout << "\r" << std::setprecision(4) << durr.count();
+		std::cout.flush();
+	}
+
+	barrier.wait(0);
+	auto after = Clock::now();
+	duration_t durr = after - before;
+	duration = durr.count();
+	std::cout << "\rClosing down" << std::endl;
+}
+
+void waitfor(double & duration, barrier_t & barrier, const std::atomic_size_t & count) {
+	std::cout << "Starting" << std::endl;
+	auto before = Clock::now();
+	barrier.wait(0);
+
+	while(true) {
+		usleep(100000);
+		size_t c = count.load();
+		if( c == 0 ) {
+			break;
+		}
+		std::cout << "\r" << c;
+		std::cout.flush();
+	}
+
+	barrier.wait(0);
+	auto after = Clock::now();
+	duration_t durr = after - before;
+	duration = durr.count();
+	std::cout << "\rClosing down" << std::endl;
+}
+
+void print_stats(double duration, unsigned nthread, global_stat_t & global) {
+	std::cout << "Done" << std::endl;
+
+	size_t ops = global.cnt;
+	size_t ops_sec = size_t(double(ops) / duration);
+	size_t ops_thread = ops_sec / nthread;
+	auto dur_nano = duration_cast<std::nano>(1.0);
+
+	std::cout << "Duration      : " << duration << "s\n";
+	std::cout << "ns/Op         : " << ( dur_nano / ops_thread )<< "\n";
+	std::cout << "Ops/sec/thread: " << ops_thread << "\n";
+	std::cout << "Ops/sec       : " << ops_sec << "\n";
+	std::cout << "Total ops     : " << ops << "\n";
+}
+
+static inline bool bts(std::atomic_size_t & target, size_t bit ) {
+	/*
+	int result = 0;
+	asm volatile(
+		"LOCK btsq %[bit], %[target]\n\t"
+		:"=@ccc" (result)
+		: [target] "m" (target), [bit] "r" (bit)
+	);
+ 	return result != 0;
+	/*/
+	size_t mask = 1ul << bit;
+	size_t ret = target.fetch_or(mask, std::memory_order_relaxed);
+	return (ret & mask) != 0;
+	//*/
+}
+
+static inline bool btr(std::atomic_size_t & target, size_t bit ) {
+	/*
+	int result = 0;
+	asm volatile(
+		"LOCK btrq %[bit], %[target]\n\t"
+		:"=@ccc" (result)
+		: [target] "m" (target), [bit] "r" (bit)
+	);
+ 	return result != 0;
+	/*/
+	size_t mask = 1ul << bit;
+	size_t ret = target.fetch_and(~mask, std::memory_order_relaxed);
+	return (ret & mask) != 0;
+	//*/
+}
+
+// ================================================================================================
+//                        EXPERIMENTS
+// ================================================================================================
+
+// ================================================================================================
+__attribute__((noinline)) void runPingPong_body(
+	std::atomic<bool>& done,
+	local_stat_t & local,
+	std::atomic_size_t & target,
+	size_t id
+) {
+	while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) {
+
+		bool ret;
+		ret = bts(target, id);
+		assert(!ret);
+
+		// -----
+
+		ret = btr(target, id);
+		assert(ret);
+		local.cnt++;
+	}
+}
+
+void run(unsigned nthread, double duration) {
+	// Barrier for synchronization
+	barrier_t barrier(nthread + 1);
+
+	// Data to check everything is OK
+	global_stat_t global;
+
+	// Flag to signal termination
+	std::atomic_bool done  = { false };
+
+	std::cout << "Initializing ";
+	// List being tested
+	std::atomic_size_t word = { 0 };
+	{
+		std::thread * threads[nthread];
+		unsigned i = 1;
+		for(auto & t : threads) {
+			t = new std::thread([&done, &word, &barrier, &global](unsigned tid) {
+				local_stat_t local;
+
+				// affinity(tid);
+
+				barrier.wait(tid);
+
+				// EXPERIMENT START
+
+				runPingPong_body(done, local, word, tid - 1);
+
+				// EXPERIMENT END
+
+				barrier.wait(tid);
+
+				tally_stats(global, local);
+			}, i++);
+		}
+
+		waitfor(duration, barrier, done);
+
+		for(auto t : threads) {
+			t->join();
+			delete t;
+		}
+	}
+
+	print_stats(duration, nthread, global);
+}
+
+// ================================================================================================
+
+int main(int argc, char * argv[]) {
+
+	double duration   = 5.0;
+	unsigned nthreads = 2;
+
+	std::cout.imbue(std::locale(""));
+
+	for(;;) {
+		static struct option options[] = {
+			{"duration",  required_argument, 0, 'd'},
+			{"nthreads",  required_argument, 0, 't'},
+			{0, 0, 0, 0}
+		};
+
+		int idx = 0;
+		int opt = getopt_long(argc, argv, "d:t:", options, &idx);
+
+		std::string arg = optarg ? optarg : "";
+		size_t len = 0;
+		switch(opt) {
+			case -1:
+				if(optind != argc) {
+					std::cerr << "Too many arguments " << argc << " " << idx << std::endl;
+					goto usage;
+				}
+				goto run;
+			// Numeric Arguments
+			case 'd':
+				try {
+					duration = std::stod(optarg, &len);
+					if(len != arg.size()) { throw std::invalid_argument(""); }
+				} catch(std::invalid_argument &) {
+					std::cerr << "Duration must be a valid double, was " << arg << std::endl;
+					goto usage;
+				}
+				break;
+			case 't':
+				try {
+					nthreads = std::stoul(optarg, &len);
+					if(len != arg.size() || nthreads > (8 * sizeof(size_t))) { throw std::invalid_argument(""); }
+				} catch(std::invalid_argument &) {
+					std::cerr << "Number of threads must be a positive integer less than or equal to " << sizeof(size_t) * 8 << ", was " << arg << std::endl;
+					goto usage;
+				}
+				break;
+			// Other cases
+			default: /* ? */
+				std::cerr << opt << std::endl;
+			usage:
+				std::cerr << "Usage: " << argv[0] << ": [options]" << std::endl;
+				std::cerr << std::endl;
+				std::cerr << "  -d, --duration=DURATION  Duration of the experiment, in seconds" << std::endl;
+				std::cerr << "  -t, --nthreads=NTHREADS  Number of kernel threads" << std::endl;
+				std::exit(1);
+		}
+	}
+	run:
+
+	check_cache_line_size();
+
+	std::cout << "Running " << nthreads << " threads for " << duration << " seconds" << std::endl;
+	run(nthreads, duration);
+	return 0;
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/bts_test.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/bts_test.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/bts_test.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,32 @@
+#include <cassert>
+#include <iostream>
+
// Atomically set bit `bit` of `target` (x86-64 LOCK BTS).
// Returns the PREVIOUS value of that bit (the carry flag after BTS).
// BUG FIX: the original used "=c"(result), which returns whatever happens
// to be in %cl rather than the carry flag, and declared `target` as an
// input-only operand even though BTS writes it.
bool bts(volatile size_t & target, size_t bit ) {
	bool result;
	asm volatile(
		"LOCK btsq %[bit], %[target]\n\t"
		"setc %[result]\n\t"
		: [result] "=r" (result), [target] "+m" (target)
		: [bit] "r" (bit)
		: "cc"
	);
	return result;
}
+
// Atomically clear bit `bit` of `target` (x86-64 LOCK BTR).
// Returns the PREVIOUS value of that bit (the carry flag after BTR).
// BUG FIX: the original used "=c"(result), which returns whatever happens
// to be in %cl rather than the carry flag, and declared `target` as an
// input-only operand even though BTR writes it.
bool btr(volatile size_t & target, size_t bit ) {
	bool result;
	asm volatile(
		"LOCK btrq %[bit], %[target]\n\t"
		"setc %[result]\n\t"
		: [result] "=r" (result), [target] "+m" (target)
		: [bit] "r" (bit)
		: "cc"
	);
	return result;
}
+
+int main() {
+	volatile size_t i = 0;
+	std::cout << std::hex << i << std::endl;
+	assert(bts(i, 31));
+	std::cout << std::hex << i << std::endl;
+	assert(btr(i, 31));
+	std::cout << std::hex << i << std::endl;
+	return 0;
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/links.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/links.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/links.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,122 @@
+#pragma once
+
+#include "assert.hpp"
+#include "utils.hpp"
+
// Intrusive list hooks that a node type embeds as a member named `_links`
// (intrusive_queue_t locates them via offsetof(node_t, _links)).
template<typename node_t>
struct _LinksFields_t {
	node_t * prev = nullptr;                // previous node (towards the head sentinel)
	node_t * next = nullptr;                // next node (towards the tail sentinel)
	volatile unsigned long long ts = 0;     // timestamp; 0 doubles as an "unset/empty" sentinel value
	unsigned hint = (unsigned)-1;           // presumably a queue-index hint for the scheduler — TODO confirm with callers
};
+
// Intrusive doubly-linked FIFO guarded by an external spinlock, padded and
// aligned to exactly 128 bytes (two cache lines: lock + sentinels).
// The two sentinels store only link fields; head()/tail() synthesize fake
// node_t pointers to them so push/pop need no nullptr special cases.
template<typename node_t>
class __attribute__((aligned(128))) intrusive_queue_t {
public:
	typedef spinlock_t lock_t;

	// Per-queue statistics snapshot.
	struct stat {
		ssize_t diff = 0;
		size_t  push = 0;
		size_t  pop  = 0;
	};

private:
	// A sentinel carries only the link fields of a node.
	struct sentinel_t {
		_LinksFields_t<node_t> _links;
	};

public:
	lock_t lock;  // must be held by callers of push()/pop()

private:
	sentinel_t before;  // head sentinel; its ts caches the head element's timestamp (0 == empty)
	sentinel_t after;   // tail sentinel

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Winvalid-offsetof"
	// Offset of the _links member inside node_t, used to address the
	// sentinels as if they were whole nodes. offsetof on non-standard-layout
	// types is conditionally-supported, hence the diagnostic suppression.
	static constexpr auto fields_offset = offsetof( node_t, _links );
#pragma GCC diagnostic pop
public:
	intrusive_queue_t()
		: before{{ nullptr, tail() }}
		, after {{ head(), nullptr }}
	{
		/* paranoid */ assert((reinterpret_cast<uintptr_t>( head() ) + fields_offset) == reinterpret_cast<uintptr_t>(&before));
		/* paranoid */ assert((reinterpret_cast<uintptr_t>( tail() ) + fields_offset) == reinterpret_cast<uintptr_t>(&after ));
		/* paranoid */ assert(head()->_links.prev == nullptr);
		/* paranoid */ assert(head()->_links.next == tail() );
		/* paranoid */ assert(tail()->_links.next == nullptr);
		/* paranoid */ assert(tail()->_links.prev == head() );
		/* paranoid */ assert(sizeof(*this) == 128);
		/* paranoid */ assert((intptr_t(this) % 128) == 0);
	}

	~intrusive_queue_t() = default;

	// Fake node_t* for the head sentinel: only its _links member may be
	// accessed through this pointer.
	inline node_t * head() const {
		node_t * rhead = reinterpret_cast<node_t *>(
			reinterpret_cast<uintptr_t>( &before ) - fields_offset
		);
		assert(rhead);
		return rhead;
	}

	// Fake node_t* for the tail sentinel (same caveat as head()).
	inline node_t * tail() const {
		node_t * rtail = reinterpret_cast<node_t *>(
			reinterpret_cast<uintptr_t>( &after ) - fields_offset
		);
		assert(rtail);
		return rtail;
	}

	// Append `node` just before the tail sentinel. Requires the lock to be
	// held and node->_links.ts to be non-zero. Returns true iff the queue
	// was empty (the node became the new head).
	inline bool push(node_t * node) {
		assert(lock);
		assert(node->_links.ts != 0);
		node_t * tail = this->tail();

		node_t * prev = tail->_links.prev;
		// assertf(node->_links.ts >= prev->_links.ts,
		// 	"New node has smaller timestamp: %llu < %llu", node->_links.ts, prev->_links.ts);
		node->_links.next = tail;
		node->_links.prev = prev;
		prev->_links.next = node;
		tail->_links.prev = node;

		// Transition empty -> non-empty: cache the head timestamp.
		if(before._links.ts == 0l) {
			before._links.ts = node->_links.ts;
			assert(node->_links.prev == this->head());
			return true;
		}
		return false;
	}

	// Remove and return the node right after the head sentinel. Requires
	// the lock to be held. Returns {node, true} if the queue became empty,
	// {node, false} otherwise, and {nullptr, false} if it was already empty.
	inline std::pair<node_t *, bool> pop() {
		assert(lock);
		node_t * head = this->head();
		node_t * tail = this->tail();

		node_t * node = head->_links.next;
		node_t * next = node->_links.next;
		if(node == tail) return {nullptr, false};

		head->_links.next = next;
		next->_links.prev = head;

		if(next == tail) {
			// Queue is now empty; clear the cached head timestamp.
			before._links.ts = 0l;
			return {node, true};
		}
		else {
			// Cache the new head element's timestamp.
			assert(next->_links.ts != 0);
			before._links.ts = next->_links.ts;
			assert(before._links.ts != 0);
			return {node, false};
		}
	}

	// Timestamp of the head element, or 0 if the queue is empty.
	// NOTE(review): appears intended to be readable without the lock (ts is
	// volatile) — confirm the intended memory-ordering guarantees.
	long long ts() const {
		return before._links.ts;
	}
};
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/prefetch.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/prefetch.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/prefetch.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,106 @@
#include <algorithm>
#include <array>
#include <chrono>
#include <iostream>
#include <locale>
#include <memory>
#include <random>
#include <string>
#include <vector>

#include <cassert>
+
// One element, aligned to 64 bytes so each element occupies its own cache
// line; scanning a block therefore touches a new line at every step.
struct __attribute__((aligned(64))) element {
	size_t value;
};

// A block is a fixed array of 100 such cache-line-sized elements.
using block = std::array<element, 100>;
+
+block * create() {
+	block * b = new block();
+	for(auto & e : *b) {
+		e.value = rand();
+	}
+	b->back().value = b->size();
+
+	return b;
+}
+
+static inline size_t find(const block & b) {
+	size_t r = 0;
+	for(; r < b.size(); r++) {
+		if(__builtin_expect(b[r].value == b.size(), false)) break;
+	}
+
+	return r;
+}
+
// Print the command-line synopsis to stderr and abort with status 1.
void usage(char * argv[]) {
	std::cerr << argv[0] << ": [DURATION (FLOAT:SEC)] [NBLOCKS]" << std::endl;
	std::exit(1);
}
+
+int main(int argc, char * argv[]) {
+	size_t nblocks = 1000;
+	double duration = 5;
+
+	std::cout.imbue(std::locale(""));
+
+	switch (argc)
+	{
+	case 3:
+		nblocks = std::stoul(argv[2]);
+		[[fallthrough]];
+	case 2:
+		duration = std::stod(argv[1]);
+		if( duration <= 0.0 ) {
+			std::cerr << "Duration must be positive, was " << argv[1] << "(" << duration << ")" << std::endl;
+			usage(argv);
+		}
+		[[fallthrough]];
+	case 1:
+		break;
+	default:
+		usage(argv);
+		break;
+	}
+
+	std::vector<std::unique_ptr<block>> blocks;
+	for(size_t i = 0; i < nblocks; i++) {
+		blocks.emplace_back( create() );
+	}
+	std::random_shuffle(blocks.begin(), blocks.end());
+
+	size_t CRC = 0;
+	size_t count = 0;
+
+	using clock = std::chrono::high_resolution_clock;
+	auto before = clock::now();
+
+	while(true) {
+		for(const auto & b : blocks) {
+			CRC += find(*b);
+			count++;
+		}
+		auto now = clock::now();
+		std::chrono::duration<double> durr = now - before;
+		if( durr.count() > duration ) {
+			break;
+		}
+	}
+
+	auto after = clock::now();
+	std::chrono::duration<double> durr = after - before;
+	duration = durr.count();
+
+	using std::chrono::duration_cast;
+	using std::chrono::nanoseconds;
+
+	size_t ops_sec = size_t(double(count) / duration);
+	auto dur_nano = duration_cast<nanoseconds>(std::chrono::duration<double>(1.0)).count();
+
+	std::cout << "CRC           : " << CRC << "\n";
+	std::cout << "Duration      : " << duration << "s\n";
+	std::cout << "Total ops     : " << count << "\n";
+	std::cout << "Ops/sec       : " << ops_sec << "\n";
+	std::cout << "ns/Op         : " << ( dur_nano / ops_sec )<< "\n";
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/process.sh
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/process.sh	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/process.sh	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,42 @@
#!/bin/bash
# Post-process one benchmark run: extract push/pop operation counts from the
# captured stdout (raw/NAME.out) and per-function sample counts from the
# perf record (raw/NAME.data), then print samples-per-operation ratios.

NAME=$1

if [ ! -f "raw/${NAME}.out" ]; then
    echo "No output for ${NAME}"
    exit 1
fi

if [ ! -f "raw/${NAME}.data" ]; then
    echo "No perf record for ${NAME}"
    exit 1
fi

echo "Processing perf data for ${NAME}"

# "Total ops : N(Xr, Yw, ...)"-style line from the benchmark output.
OPS=$(grep -e 'Total ops' "raw/${NAME}.out")
CPOP=$( echo "Hello $OPS" | \grep -oP ", \K[0-9,]+(?=o)" --color | tr -d ',')
CPUSH=$(echo "Hello $OPS" | \grep -oP "\(\K[0-9,]+(?=i)" --color | tr -d ',')

perf report -n --percent-limit 5 --stdio --no-children -i "raw/${NAME}.data" > raw/.temp
EVENT=$(grep -e '^# Samples' raw/.temp | cut -d ' ' -f 6)
SPOP=$( grep -e '] relaxed_list<Node>::pop'  raw/.temp | tr -s ' ' | cut -d ' ' -f 3)
SPUSH=$(grep -e '] relaxed_list<Node>::push' raw/.temp | tr -s ' ' | cut -d ' ' -f 3)
# BUG FIX: the original bracket expression [i|m] also matched a literal
# '|'; [im] is what was intended (snzi_t or snzm_t).
SARR=$( grep -e '] snz[im]_t::node::arrive_h' raw/.temp | tr -s ' ' | cut -d ' ' -f 3)

echo "$OPS"
echo "Push count: $CPUSH"
echo "Pop  count: $CPOP"

echo "Pop    samples: $SPOP"
echo "Push   samples: $SPUSH"
echo "Arrive samples: $SARR"

SpPUSH=$(bc -l <<< "scale=9; $SPUSH / $CPUSH")
SpPOP=$( bc -l <<< "scale=9; $SPOP  / $CPOP" )
SpARR=$( bc -l <<< "scale=9; $SARR  / $CPUSH")

# The trailing sed re-inserts thousands separators into the numbers.
printf "%s per push()  : %.9f\n" "$EVENT" "$SpPUSH" | sed ':a;s/\B[0-9]\{3\}\>/,&/;ta'
printf "%s per pop()   : %.9f\n" "$EVENT" "$SpPOP"  | sed ':a;s/\B[0-9]\{3\}\>/,&/;ta'
printf "%s per arrive(): %.9f\n" "$EVENT" "$SpARR"  | sed ':a;s/\B[0-9]\{3\}\>/,&/;ta'
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,53 @@
+#include <atomic>
+
// Placeholder thread type for this scheduler sketch.
struct thread {};

// Minimal cluster interface: processors add/remove themselves and pull the
// next ready thread from it.
struct cluster {
	void add();
	void remove();
	thread * next();
};

// A kernel-thread worker. NOTE(review): the member `cluster` shadows the
// type name inside this struct.
struct processor {

	cluster cluster;
	std::atomic<bool> stop;   // set externally to ask the processor to stop
	volatile bool idle;       // published before halting; see main() below
};
+
+
// Execute one thread on this processor. The body is an outline only — the
// intended steps are sketched in the comments.
void run(thread * ) {
	// verify preemption

	// run Thread

	// verify preemption

	// finish Running
}
+
+void main(processor & self) {
+
+	self.cluster.add();
+
+	while(!self.stop) {
+		if(thread * t = self.cluster.next()) {
+			run(t);
+			continue;
+		}
+
+		self.set_idle();
+		std::atomic_thread_fence();
+
+		if(thread * t = self.cluster.next()) {
+			self.idle = false;
+			run(t);
+			continue;
+		}
+
+		halt();
+	}
+
+	self.cluster.remove();
+
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor_list.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor_list.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor_list.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,215 @@
+#include <cassert>
+
+#include <atomic>
+#include <new>
+#include <type_traits>
+
+struct processor;
+
// One registration slot, aligned to a full cache line so neighbouring slots
// never share a line (avoids false sharing between processors).
struct __attribute__((aligned(64))) processor_id {
	std::atomic<processor *> handle;  // owning processor; nullptr when the slot is free
	std::atomic<bool> lock;           // per-slot reader lock used by processor_list's RW lock

	processor_id() = default;
	processor_id(processor * proc) : handle(proc), lock() {
		/*paranoid*/ assert(std::atomic_is_lock_free(&lock));
	}
};
+
+extern unsigned num();
+
+#define ERROR throw 1
+
// Registry of active processors plus a reader-writer lock over them.
// - doregister/unregister are lock-free with respect to readers/writers.
// - Readers take only their own cache-line-local slot lock (uncontended in
//   the common case); writers take the global lock plus every slot lock.
class processor_list {
private:

	static const constexpr std::size_t cache_line_size = 64;

	static_assert(sizeof (processor_id) <= cache_line_size, "ERROR: Instances must fit in one cache line" );
	static_assert(alignof(processor_id) == cache_line_size, "ERROR: Instances must aligned to one cache line" );

	const unsigned max;     // total cachelines allocated
	std::atomic_uint alloc; // cachelines currently in use
	std::atomic_uint ready; // cachelines ready to iterate over (!= to alloc when thread is in second half of doregister)
	std::atomic<bool> lock; // writerlock
	processor_id * data;    // data pointer

private:
	// Test-and-test-and-set spin acquire of a single atomic flag.
	inline void acquire(std::atomic<bool> & ll) {
		while( __builtin_expect(ll.exchange(true),false) ) {
			while(ll.load(std::memory_order_relaxed))
				asm volatile("pause");
		}
		/* paranoid */ assert(ll);
	}

public:
	// Allocates num() slots up front; the array is never reallocated, so
	// slot indices handed out by doregister stay valid for the list's life.
	processor_list()
		: max(num())
		, alloc(0)
		, ready(0)
		, lock{false}
		, data( new processor_id[max] )
	{
		/*paranoid*/ assert(num() == max);
		/*paranoid*/ assert(std::atomic_is_lock_free(&alloc));
		/*paranoid*/ assert(std::atomic_is_lock_free(&ready));
	}

	~processor_list() {
		delete[] data;
	}

	//=======================================================================
	// Lock-Free registering/unregistering of threads
	// Returns the slot index assigned to `proc`.
	unsigned doregister(processor * proc) {
		// Step - 1 : check if there is already space in the data
		uint_fast32_t s = ready;

		// Check among all the ready
		for(uint_fast32_t i = 0; i < s; i++) {
			processor * null = nullptr; // Re-write every loop since compare thrashes it
			if( data[i].handle.load(std::memory_order_relaxed) == null
			 && data[i].handle.compare_exchange_strong(null, proc)) {
				/*paranoid*/ assert(i < ready);
				/*paranoid*/ assert(alignof(decltype(data[i])) == cache_line_size);
				/*paranoid*/ assert((uintptr_t(&data[i]) % cache_line_size) == 0);
				return i;
			}
		}

		if(max <= alloc) ERROR;

		// Step - 2 : F&A to get a new spot in the array.
		uint_fast32_t n = alloc++;
		if(max <= n) ERROR;

		// Step - 3 : Mark space as used and then publish it.
		void * storage = &data[n];
		new (storage) processor_id( proc );
		// Spin until every earlier allocation has been published, then
		// publish ours: keeps `ready` a contiguous prefix of valid slots.
		while(true) {
			unsigned copy = n;
			if( ready.load(std::memory_order_relaxed) == n
			 && ready.compare_exchange_weak(copy, n + 1) )
			 	break;
			asm volatile("pause");
		}

		// Return new spot.
		/*paranoid*/ assert(n < ready);
		/*paranoid*/ assert(alignof(decltype(data[n])) == cache_line_size);
		/*paranoid*/ assert((uintptr_t(&data[n]) % cache_line_size) == 0);
		return n;
	}

	// Free slot `iproc` for reuse and return the processor it held.
	processor * unregister(unsigned iproc) {
		/*paranoid*/ assert(iproc < ready);
		auto ret = data[iproc].handle.load(std::memory_order_relaxed);
		data[iproc].handle = nullptr;
		return ret;
	}

	// Reset all registration
	// Unsafe in most cases, use for testing only.
	void reset() {
		alloc = 0;
		ready = 0;
	}

	// Processor currently registered in slot `iproc` (may be nullptr).
	processor * get(unsigned iproc) {
		return data[iproc].handle.load(std::memory_order_relaxed);
	}

	//=======================================================================
	// Reader-writer lock implementation
	// Concurrent with doregister/unregister,
	//    i.e., threads can be added at any point during or between the entry/exit

	//-----------------------------------------------------------------------
	// Reader side
	// Acquire the reader lock for slot `iproc` (the caller's own slot).
	void read_lock(unsigned iproc) {
		/*paranoid*/ assert(iproc < ready);

		// Step 1 : make sure no writer are in the middle of the critical section
		while(lock.load(std::memory_order_relaxed))
			asm volatile("pause");

		// Fence needed because we don't want to start trying to acquire the lock
		// before we read a false.
		// Not needed on x86
		// std::atomic_thread_fence(std::memory_order_seq_cst);

		// Step 2 : acquire our local lock
		acquire( data[iproc].lock );
		/*paranoid*/ assert(data[iproc].lock);
	}

	void read_unlock(unsigned iproc) {
		/*paranoid*/ assert(iproc < ready);
		/*paranoid*/ assert(data[iproc].lock);
		data[iproc].lock.store(false, std::memory_order_release);
	}

	//-----------------------------------------------------------------------
	// Writer side
	// Returns the number of slot locks taken; pass it to write_unlock.
	uint_fast32_t write_lock() {
		// Step 1 : lock global lock
		// It is needed to avoid processors that register mid Critical-Section
		//   to simply lock their own lock and enter.
		acquire(lock);

		// Step 2 : lock per-proc lock
		// Processors that are currently being registered aren't counted
		//   but can't be in read_lock or in the critical section.
		// All other processors are counted
		uint_fast32_t s = ready;
		for(uint_fast32_t i = 0; i < s; i++) {
			acquire( data[i].lock );
		}

		return s;
	}

	void write_unlock(uint_fast32_t last_s) {
		// Step 1 : release local locks
		// This must be done while the global lock is held to avoid
		//   threads that where created mid critical section
		//   to race to lock their local locks and have the writer
		//   immidiately unlock them
		// Alternative solution : return s in write_lock and pass it to write_unlock
		for(uint_fast32_t i = 0; i < last_s; i++) {
			assert(data[i].lock);
			data[i].lock.store(false, std::memory_order_release);
		}

		// Step 2 : release global lock
		/*paranoid*/ assert(true == lock);
		lock.store(false, std::memory_order_release);
	}

	//-----------------------------------------------------------------------
	// Checking support
	// Wait (without acquiring anything) until no writer and no reader lock
	// appears held; returns the number of slots observed.
	uint_fast32_t epoch_check() {
		// Step 1 : lock global lock
		// It is needed to avoid processors that register mid Critical-Section
		//   to simply lock their own lock and enter.
		while(lock.load(std::memory_order_relaxed))
			asm volatile("pause");

		// Step 2 : lock per-proc lock
		// Processors that are currently being registered aren't counted
		//   but can't be in read_lock or in the critical section.
		// All other processors are counted
		uint_fast32_t s = ready;
		for(uint_fast32_t i = 0; i < s; i++) {
			while(data[i].lock.load(std::memory_order_relaxed))
				asm volatile("pause");
		}

		return s;
	}

public:
};
+
+#undef ERROR
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor_list_fast.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor_list_fast.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor_list_fast.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,173 @@
+#include "processor_list.hpp"
+
#include <array>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <locale>
#include <string>
#include <thread>
#include <vector>
+
+#include "utils.hpp"
+
// Capacity hint for processor_list: 16M (0x1000000) slots.
unsigned num() {
	return 16777216u;
}
+
+//-------------------
+
// Test stand-in for a real processor: only remembers its slot index.
struct processor {
	unsigned id;
};
+void run(unsigned nthread, double duration, unsigned writes, unsigned epochs) {
+	assert(writes < 100);
+
+	// List being tested
+	processor_list list = {};
+
+	// Barrier for synchronization
+	barrier_t barrier(nthread + 1);
+
+	// Data to check everything is OK
+	size_t write_committed = 0ul;
+	struct {
+		std::atomic_size_t write = { 0ul };
+		std::atomic_size_t read  = { 0ul };
+		std::atomic_size_t epoch = { 0ul };
+	} lock_cnt;
+
+	// Flag to signal termination
+	std::atomic_bool done = { false };
+
+	std::thread * threads[nthread];
+	unsigned i = 1;
+	for(auto & t : threads) {
+		t = new std::thread([&done, &list, &barrier, &write_committed, &lock_cnt, writes, epochs](unsigned tid) {
+			Random rand(tid + rdtscl());
+			processor proc;
+			proc.id = list.doregister(&proc);
+			size_t writes_cnt = 0;
+			size_t reads_cnt = 0;
+			size_t epoch_cnt = 0;
+
+			affinity(tid);
+
+			barrier.wait(tid);
+
+			while(__builtin_expect(!done, true)) {
+				auto r = rand.next() % 100;
+				if (r < writes) {
+					auto n = list.write_lock();
+					write_committed++;
+					writes_cnt++;
+					assert(writes_cnt < -2ul);
+					list.write_unlock(n);
+				}
+				else if(r < epochs) {
+					list.epoch_check();
+					epoch_cnt++;
+				}
+				else {
+					list.read_lock(proc.id);
+					reads_cnt++;
+					assert(reads_cnt < -2ul);
+					list.read_unlock(proc.id);
+				}
+			}
+
+			barrier.wait(tid);
+
+			auto p = list.unregister(proc.id);
+			assert(&proc == p);
+			lock_cnt.write += writes_cnt;
+			lock_cnt.read  += reads_cnt;
+			lock_cnt.epoch += epoch_cnt;
+		}, i++);
+	}
+
+	auto before = Clock::now();
+	barrier.wait(0);
+
+	while(true) {
+		usleep(1000);
+		auto now = Clock::now();
+		duration_t durr = now - before;
+		if( durr.count() > duration ) {
+			done = true;
+			break;
+		}
+	}
+
+	barrier.wait(0);
+	auto after = Clock::now();
+	duration_t durr = after - before;
+	duration = durr.count();
+
+	for(auto t : threads) {
+		t->join();
+		delete t;
+	}
+
+	assert(write_committed == lock_cnt.write);
+
+	size_t totalop = lock_cnt.read + lock_cnt.write + lock_cnt.epoch;
+	size_t ops_sec = size_t(double(totalop) / duration);
+	size_t ops_thread = ops_sec / nthread;
+	double dur_nano = duration_cast<std::nano>(1.0);
+
+	std::cout << "Duration      : " << duration << "s\n";
+	std::cout << "Total ops     : " << totalop << "(" << lock_cnt.read << "r, " << lock_cnt.write << "w, " << lock_cnt.epoch << "e)\n";
+	std::cout << "Ops/sec       : " << ops_sec << "\n";
+	std::cout << "Ops/sec/thread: " << ops_thread << "\n";
+	std::cout << "ns/Op         : " << ( dur_nano / ops_thread )<< "\n";
+}
+
// Print the command-line synopsis to stderr and abort with status 1.
void usage(char * argv[]) {
	std::cerr << argv[0] << ": [DURATION (FLOAT:SEC)] [NTHREADS] [%WRITES]" << std::endl;
	std::exit(1);
}
+
+int main(int argc, char * argv[]) {
+
+	double duration   = 5.0;
+	unsigned nthreads = 2;
+	unsigned writes   = 0;
+	unsigned epochs   = 0;
+
+	std::cout.imbue(std::locale(""));
+
+	switch (argc)
+	{
+	case 5:
+		epochs = std::stoul(argv[4]);
+		[[fallthrough]];
+	case 4:
+		writes = std::stoul(argv[3]);
+		if( (writes + epochs) > 100 ) {
+			std::cerr << "Writes + Epochs must be valid percentage, was " << argv[3] << " + " << argv[4] << "(" << writes << " + " << epochs << ")" << std::endl;
+			usage(argv);
+		}
+		[[fallthrough]];
+	case 3:
+		nthreads = std::stoul(argv[2]);
+		[[fallthrough]];
+	case 2:
+		duration = std::stod(argv[1]);
+		if( duration <= 0.0 ) {
+			std::cerr << "Duration must be positive, was " << argv[1] << "(" << duration << ")" << std::endl;
+			usage(argv);
+		}
+		[[fallthrough]];
+	case 1:
+		break;
+	default:
+		usage(argv);
+		break;
+	}
+
+	check_cache_line_size();
+
+	std::cout << "Running " << nthreads << " threads for " << duration << " seconds with " << writes << "% writes and " << epochs << "% epochs" << std::endl;
+	run(nthreads, duration, writes, epochs + writes);
+
+	return 0;
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor_list_good.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor_list_good.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor_list_good.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,269 @@
+#include "processor_list.hpp"
+
#include <atomic>
#include <cassert>
#include <iostream>
#include <memory>
#include <string>
#include <thread>
#include <vector>
+
// Capacity hint for processor_list: 2^24 (0x1000000) slots.
unsigned num() {
	return 1u << 24;
}
+
// Simple spinning barrier (source attribution left incomplete in the
// original: "Barrier from"). The Nth arrival of a round releases everyone.
class barrier_t {
public:
	// total: number of participants that must arrive before any proceeds.
	barrier_t(size_t total)
		: waiting(0)
		, total(total)
	{}

	// Spin until all `total` participants have arrived at this round.
	// `waiting` only ever grows; each waiter computes the arrival count at
	// which its round completes (the next multiple of `total`) and spins
	// until the counter reaches it.
	void wait(unsigned) {
		size_t target = waiting++;
		target = (target - (target % total)) + total;
		while(waiting < target)
			asm volatile("pause");

		// Canary: the monotonically growing counter must stay far from wrap.
		assert(waiting < (1ul << 60));
    	}

private:
	std::atomic<size_t> waiting;  // total arrivals across all rounds
	size_t total;                 // participants per round
};
+
// Xorshift pseudo-random generator (shift triple 6/21/7); output is fully
// determined by the seed.
class Random {
private:
	unsigned int state;
public:
	Random(int seed) : state(seed) {}

	/** returns pseudorandom x satisfying 0 <= x < n. **/
	unsigned int next() {
		state ^= state << 6;
		state ^= state >> 21;
		state ^= state << 7;
		return state;
	}
};
+
+//-------------------
+
// Test stand-in for a processor: only records its registration slot.
struct processor {
	unsigned id;
};
+
+// Stage 1
+// Make sure that the early registration works correctly
+// Registration uses a different process if the act of
+// registering the processor makes it the highest processor count
+// seen yet.
+void stage1(unsigned nthread, unsigned repeats) {
+	const int n = repeats;
+	const int nproc = 10;
+
+	// List being tested
+	processor_list list;
+
+	// Barrier for synchronization
+	barrier_t barrier(nthread + 1);
+
+	// Seen values to detect duplicattion
+	std::atomic<processor *> ids[nthread * nproc];
+	for(auto & i : ids) {
+		i = nullptr;
+	}
+
+	// Can't pass VLA to lambda
+	std::atomic<processor *> * idsp = ids;
+
+	// Threads which will run the code
+	std::thread * threads[nthread];
+	unsigned i = 1;
+	for(auto & t : threads) {
+		// Each thread will try to register a processor then add it to the
+		// list of registerd processor
+		t = new std::thread([&list, &barrier, idsp, n](unsigned tid){
+			processor proc[nproc];
+			for(int i = 0; i < n; i++) {
+				for(auto & p : proc) {
+					// Register the thread
+					p.id = list.doregister(&p);
+				}
+
+				for(auto & p : proc) {
+					// Make sure no one got this id before
+					processor * prev = idsp[p.id].exchange(&p);
+					assert(nullptr == prev);
+
+					// Make sure id is still consistend
+					assert(&p == list.get(p.id));
+				}
+
+				// wait for round to finish
+				barrier.wait(tid);
+
+				// wait for reset
+				barrier.wait(tid);
+			}
+		}, i++);
+	}
+
+	for(int i = 0; i < n; i++) {
+		//Wait for round to finish
+		barrier.wait(0);
+
+		// Reset list
+		list.reset();
+
+		std::cout << i << "\r";
+
+		// Reset seen values
+		for(auto & i : ids) {
+			i = nullptr;
+		}
+
+		// Start next round
+		barrier.wait(0);
+	}
+
+	for(auto t : threads) {
+		t->join();
+		delete t;
+	}
+}
+
+// Stage 2
+// Check that once churning starts, registration is still consistent.
+void stage2(unsigned nthread, unsigned repeats) {
+	// List being tested
+	processor_list list;
+
+	// Threads which will run the code
+	std::thread * threads[nthread];
+	unsigned i = 1;
+	for(auto & t : threads) {
+		// Each thread will try to register a few processors and
+		// unregister them, making sure that the registration is
+		// consistent
+		t = new std::thread([&list, repeats](unsigned tid){
+			processor procs[10];
+			for(unsigned i = 0; i < repeats; i++) {
+				// register the procs and note the id
+				for(auto & p : procs) {
+					p.id = list.doregister(&p);
+				}
+
+				if(1 == tid) std::cout << i << "\r";
+
+				// check the id is still consistent
+				for(const auto & p : procs) {
+					assert(&p == list.get(p.id));
+				}
+
+				// unregister and check the id is consistent
+				for(const auto & p : procs) {
+					assert(&p == list.unregister(p.id));
+				}
+			}
+		}, i++);
+	}
+
+	for(auto t : threads) {
+		t->join();
+		delete t;
+	}
+}
+
+bool is_writer();
+
+// Stage 3
+// Check that the reader writer lock works.
+void stage3(unsigned nthread, unsigned repeats) {
+	// List being tested
+	processor_list list;
+
+	size_t before = 0;
+
+	std::unique_ptr<size_t> after( new size_t(0) );
+
+	std::atomic<bool> done ( false );
+
+	// Threads which will run the code
+	std::thread * threads[nthread];
+	unsigned i = 1;
+	for(auto & t : threads) {
+		// Each thread will try to register a few processors and
+		// unregister them, making sure that the registration is
+		// consistent
+		t = new std::thread([&list, repeats, &before, &after, &done](unsigned tid){
+			Random rng(tid);
+			processor proc;
+			proc.id = list.doregister(&proc);
+			while(!done) {
+
+				if( (rng.next() % 100) == 0 ) {
+					auto r = list.write_lock();
+
+					auto b = before++;
+
+					std::cout << b << "\r";
+
+					(*after)++;
+
+					if(b >= repeats) done = true;
+
+					list.write_unlock(r);
+				}
+				else {
+					list.read_lock(proc.id);
+					assert(before == *after);
+					list.read_unlock(proc.id);
+				}
+
+			}
+
+			list.unregister(proc.id);
+		}, i++);
+	}
+
+	for(auto t : threads) {
+		t->join();
+		delete t;
+	}
+}
+
+int main(int argc, char * argv[]) {
+
+	unsigned nthreads = 1;
+	if( argc >= 3 ) {
+		size_t idx;
+		nthreads = std::stoul(argv[2], &idx);
+		assert('\0' == argv[2][idx]);
+	}
+
+	unsigned repeats = 100;
+	if( argc >= 2 ) {
+		size_t idx;
+		repeats = std::stoul(argv[1], &idx);
+		assert('\0' == argv[1][idx]);
+	}
+
+	processor_list::check_cache_line_size();
+
+	std::cout << "Running " << repeats << " repetitions on " << nthreads << " threads" << std::endl;
+	std::cout << "Checking registration - early" << std::endl;
+	stage1(nthreads, repeats);
+	std::cout << "Done                         " << std::endl;
+
+	std::cout << "Checking registration - churn" << std::endl;
+	stage2(nthreads, repeats);
+	std::cout << "Done                         " << std::endl;
+
+	std::cout << "Checking RW lock             " << std::endl;
+	stage3(nthreads, repeats);
+	std::cout << "Done                         " << std::endl;
+
+
+	return 0;
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/randbit.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/randbit.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/randbit.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,236 @@
+#include <cstddef>
+#include <cstdint>
+#include <x86intrin.h>
+
+__attribute__((noinline)) unsigned nthSetBit(size_t mask, unsigned bit) {
+	uint64_t v = mask;   // Input value to find position with rank r.
+	unsigned int r = bit;// Input: bit's desired rank [1-64].
+	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
+	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
+	unsigned int t;      // Bit count temporary.
+
+	// Do a normal parallel bit count for a 64-bit integer,
+	// but store all intermediate steps.
+	// a = (v & 0x5555...) + ((v >> 1) & 0x5555...);
+	a =  v - ((v >> 1) & ~0UL/3);
+	// b = (a & 0x3333...) + ((a >> 2) & 0x3333...);
+	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
+	// c = (b & 0x0f0f...) + ((b >> 4) & 0x0f0f...);
+	c = (b + (b >> 4)) & ~0UL/0x11;
+	// d = (c & 0x00ff...) + ((c >> 8) & 0x00ff...);
+	d = (c + (c >> 8)) & ~0UL/0x101;
+
+
+	t = (d >> 32) + (d >> 48);
+	// Now do branchless select!
+	s  = 64;
+	// if (r > t) {s -= 32; r -= t;}
+	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
+	t  = (d >> (s - 16)) & 0xff;
+	// if (r > t) {s -= 16; r -= t;}
+	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
+	t  = (c >> (s - 8)) & 0xf;
+	// if (r > t) {s -= 8; r -= t;}
+	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
+	t  = (b >> (s - 4)) & 0x7;
+	// if (r > t) {s -= 4; r -= t;}
+	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
+	t  = (a >> (s - 2)) & 0x3;
+	// if (r > t) {s -= 2; r -= t;}
+	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
+	t  = (v >> (s - 1)) & 0x1;
+	// if (r > t) s--;
+	s -= ((t - r) & 256) >> 8;
+	// s = 65 - s;
+	return s;
+}
+
+unsigned rand_bit(unsigned rnum, uint64_t mask) {
+	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
+#if defined(BRANCHLESS)
+	uint64_t v = mask;   // Input value to find position with rank r.
+	unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
+	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
+	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
+	unsigned int t;      // Bit count temporary.
+
+	// Do a normal parallel bit count for a 64-bit integer,
+	// but store all intermediate steps.
+	// a = (v & 0x5555...) + ((v >> 1) & 0x5555...);
+	a =  v - ((v >> 1) & ~0UL/3);
+	// b = (a & 0x3333...) + ((a >> 2) & 0x3333...);
+	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
+	// c = (b & 0x0f0f...) + ((b >> 4) & 0x0f0f...);
+	c = (b + (b >> 4)) & ~0UL/0x11;
+	// d = (c & 0x00ff...) + ((c >> 8) & 0x00ff...);
+	d = (c + (c >> 8)) & ~0UL/0x101;
+
+
+	t = (d >> 32) + (d >> 48);
+	// Now do branchless select!
+	s  = 64;
+	// if (r > t) {s -= 32; r -= t;}
+	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
+	t  = (d >> (s - 16)) & 0xff;
+	// if (r > t) {s -= 16; r -= t;}
+	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
+	t  = (c >> (s - 8)) & 0xf;
+	// if (r > t) {s -= 8; r -= t;}
+	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
+	t  = (b >> (s - 4)) & 0x7;
+	// if (r > t) {s -= 4; r -= t;}
+	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
+	t  = (a >> (s - 2)) & 0x3;
+	// if (r > t) {s -= 2; r -= t;}
+	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
+	t  = (v >> (s - 1)) & 0x1;
+	// if (r > t) s--;
+	s -= ((t - r) & 256) >> 8;
+	// s = 65 - s;
+	return s - 1;
+#elif defined(LOOP)
+	for(unsigned i = 0; i < bit; i++) {
+		mask ^= (1ul << (__builtin_ffsl(mask) - 1ul));
+	}
+	return __builtin_ffsl(mask) - 1ul;
+#elif defined(PDEP)
+	uint64_t picked = _pdep_u64(1ul << bit, mask);
+	return __builtin_ffsl(picked) - 1ul;
+#else
+#error must define LOOP, PDEP or BRANCHLESS
+#endif
+}
+
+#include <cassert>
+#include <atomic>
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <locale>
+#include <thread>
+
+#include <unistd.h>
+
+class barrier_t {
+public:
+	barrier_t(size_t total)
+		: waiting(0)
+		, total(total)
+	{}
+
+	void wait(unsigned) {
+		size_t target = waiting++;
+		target = (target - (target % total)) + total;
+		while(waiting < target)
+			asm volatile("pause");
+
+		assert(waiting < (1ul << 60));
+	}
+
+private:
+	std::atomic<size_t> waiting;
+	size_t total;
+};
+
+class Random {
+private:
+	unsigned int seed;
+public:
+	Random(int seed) {
+		this->seed = seed;
+	}
+
+	/** returns a pseudorandom unsigned integer (xorshift generator; full unsigned range). **/
+	unsigned int next() {
+		seed ^= seed << 6;
+		seed ^= seed >> 21;
+		seed ^= seed << 7;
+		return seed;
+	}
+};
+
+using Clock = std::chrono::high_resolution_clock;
+using duration_t = std::chrono::duration<double>;
+using std::chrono::nanoseconds;
+
+template<typename Ratio, typename T>
+T duration_cast(T seconds) {
+	return std::chrono::duration_cast<std::chrono::duration<T, Ratio>>(std::chrono::duration<T>(seconds)).count();
+}
+
+void waitfor(double & duration, barrier_t & barrier, std::atomic_bool & done) {
+
+
+	std::cout << "Starting" << std::endl;
+	auto before = Clock::now();
+	barrier.wait(0);
+
+	while(true) {
+		usleep(100000);
+		auto now = Clock::now();
+		duration_t durr = now - before;
+		if( durr.count() > duration ) {
+			done = true;
+			break;
+		}
+		std::cout << "\r" << std::setprecision(4) << durr.count();
+		std::cout.flush();
+	}
+
+	barrier.wait(0);
+	auto after = Clock::now();
+	duration_t durr = after - before;
+	duration = durr.count();
+	std::cout << "\rClosing down" << std::endl;
+}
+
+__attribute__((noinline)) void body(Random & rand) {
+	uint64_t mask = (uint64_t(rand.next()) << 32ul) | uint64_t(rand.next());
+	unsigned idx = rand.next();
+
+	unsigned bit = rand_bit(idx, mask);
+
+	if(__builtin_expect(((1ul << bit) & mask) == 0, false)) {
+		std::cerr << std::hex <<  "Rand " << idx << " from " << mask;
+		std::cerr << " gave " << (1ul << bit) << "(" << std::dec << bit << ")" << std::endl;
+		std::abort();
+	}
+}
+
+void runRandBit(double duration) {
+
+	std::atomic_bool done  = { false };
+	barrier_t barrier(2);
+
+	size_t count = 0;
+	std::thread thread([&done, &barrier, &count]() {
+
+		Random rand(22);
+
+		barrier.wait(1);
+
+		for(;!done; count++) {
+			body(rand);
+		}
+
+		barrier.wait(1);
+	});
+
+	waitfor(duration, barrier, done);
+	thread.join();
+
+	size_t ops = count;
+	size_t ops_sec = size_t(double(ops) / duration);
+	auto dur_nano = duration_cast<std::nano>(1.0);
+
+	std::cout << "Duration      : " << duration << "s\n";
+	std::cout << "ns/Op         : " << ( dur_nano / ops )<< "\n";
+	std::cout << "Ops/sec       : " << ops_sec << "\n";
+	std::cout << "Total ops     : " << ops << std::endl;
+
+}
+
+int main() {
+	std::cout.imbue(std::locale(""));
+	runRandBit(5);
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/relaxed_list.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/relaxed_list.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/relaxed_list.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,1141 @@
+#if !defined(LIST_VARIANT_HPP)
+#define LIST_VARIANT_HPP "relaxed_list.hpp"
+#endif
+
+#include LIST_VARIANT_HPP
+#if !defined(LIST_VARIANT)
+#error no variant selected
+#endif
+
+#include <array>
+#include <iomanip>
+#include <iostream>
+#include <locale>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include <getopt.h>
+#include <unistd.h>
+#include <sys/sysinfo.h>
+
+#include "utils.hpp"
+
+struct __attribute__((aligned(64))) Node {
+	static std::atomic_size_t creates;
+	static std::atomic_size_t destroys;
+
+	_LinksFields_t<Node> _links;
+
+	int value;
+	int id;
+
+	Node() { creates++; }
+	Node(int value): value(value) { creates++; }
+	~Node() { destroys++; }
+};
+
+std::atomic_size_t Node::creates  = { 0 };
+std::atomic_size_t Node::destroys = { 0 };
+
+bool enable_stats = false;
+
+template<>
+thread_local LIST_VARIANT<Node>::TLS LIST_VARIANT<Node>::tls = {};
+
+template<>
+std::atomic_uint32_t LIST_VARIANT<Node>::ticket = { 0 };
+
+#ifndef NO_STATS
+template<>
+LIST_VARIANT<Node>::GlobalStats LIST_VARIANT<Node>::global_stats = {};
+#endif
+
+// ================================================================================================
+//                        UTILS
+// ================================================================================================
+
+struct local_stat_t {
+	size_t in  = 0;
+	size_t out = 0;
+	size_t empty = 0;
+	size_t crc_in  = 0;
+	size_t crc_out = 0;
+	size_t valmax = 0;
+	size_t valmin = 100000000ul;
+	struct {
+		size_t val = 0;
+		size_t cnt = 0;
+	} comp;
+	struct {
+		size_t val = 0;
+		size_t cnt = 0;
+	} subm;
+};
+
+struct global_stat_t {
+	std::atomic_size_t in  = { 0 };
+	std::atomic_size_t out = { 0 };
+	std::atomic_size_t empty = { 0 };
+	std::atomic_size_t crc_in  = { 0 };
+	std::atomic_size_t crc_out = { 0 };
+	std::atomic_size_t valmax = { 0 };
+	std::atomic_size_t valmin = { 100000000ul };
+	struct {
+		std::atomic_size_t val = { 0 };
+		std::atomic_size_t cnt = { 0 };
+	} comp;
+	struct {
+		std::atomic_size_t val = { 0 };
+		std::atomic_size_t cnt = { 0 };
+	} subm;
+};
+
+void atomic_max(std::atomic_size_t & target, size_t value) {
+	for(;;) {
+		size_t expect = target.load(std::memory_order_relaxed);
+		if(value <= expect) return;
+		bool success = target.compare_exchange_strong(expect, value);
+		if(success) return;
+	}
+}
+
+void atomic_min(std::atomic_size_t & target, size_t value) {
+	for(;;) {
+		size_t expect = target.load(std::memory_order_relaxed);
+		if(value >= expect) return;
+		bool success = target.compare_exchange_strong(expect, value);
+		if(success) return;
+	}
+}
+
+void tally_stats(global_stat_t & global, local_stat_t & local) {
+
+	global.in    += local.in;
+	global.out   += local.out;
+	global.empty += local.empty;
+
+	global.crc_in  += local.crc_in;
+	global.crc_out += local.crc_out;
+
+	global.comp.val += local.comp.val;
+	global.comp.cnt += local.comp.cnt;
+	global.subm.val += local.subm.val;
+	global.subm.cnt += local.subm.cnt;
+
+	atomic_max(global.valmax, local.valmax);
+	atomic_min(global.valmin, local.valmin);
+
+	LIST_VARIANT<Node>::stats_tls_tally();
+}
+
+void waitfor(double & duration, barrier_t & barrier, std::atomic_bool & done) {
+	std::cout << "Starting" << std::endl;
+	auto before = Clock::now();
+	barrier.wait(0);
+	bool is_tty = isatty(STDOUT_FILENO);
+
+	while(true) {
+		usleep(100000);
+		auto now = Clock::now();
+		duration_t durr = now - before;
+		if( durr.count() > duration ) {
+			done = true;
+			break;
+		}
+		if(is_tty) {
+			std::cout << "\r" << std::setprecision(4) << durr.count();
+			std::cout.flush();
+		}
+	}
+
+	barrier.wait(0);
+	auto after = Clock::now();
+	duration_t durr = after - before;
+	duration = durr.count();
+	std::cout << "\rClosing down" << std::endl;
+}
+
+void waitfor(double & duration, barrier_t & barrier, const std::atomic_size_t & count) {
+	std::cout << "Starting" << std::endl;
+	auto before = Clock::now();
+	barrier.wait(0);
+
+	while(true) {
+		usleep(100000);
+		size_t c = count.load();
+		if( c == 0 ) {
+			break;
+		}
+		std::cout << "\r" << c;
+		std::cout.flush();
+	}
+
+	barrier.wait(0);
+	auto after = Clock::now();
+	duration_t durr = after - before;
+	duration = durr.count();
+	std::cout << "\rClosing down" << std::endl;
+}
+
+void print_stats(double duration, unsigned nthread, global_stat_t & global) {
+	assert(Node::creates == Node::destroys);
+	assert(global.crc_in == global.crc_out);
+
+	std::cout << "Done" << std::endl;
+
+	size_t ops = global.in + global.out;
+	size_t ops_sec = size_t(double(ops) / duration);
+	size_t ops_thread = ops_sec / nthread;
+	auto dur_nano = duration_cast<std::nano>(1.0);
+
+	if(global.valmax != 0) {
+		std::cout << "Max runs      : " << global.valmax << "\n";
+		std::cout << "Min runs      : " << global.valmin << "\n";
+	}
+	if(global.comp.cnt != 0) {
+		std::cout << "Submit count  : " << global.subm.cnt << "\n";
+		std::cout << "Submit average: " << ((double(global.subm.val)) / global.subm.cnt) << "\n";
+		std::cout << "Complete count: " << global.comp.cnt << "\n";
+		std::cout << "Complete avg  : " << ((double(global.comp.val)) / global.comp.cnt) << "\n";
+	}
+	std::cout << "Duration      : " << duration << "s\n";
+	std::cout << "ns/Op         : " << ( dur_nano / ops_thread )<< "\n";
+	std::cout << "Ops/sec/thread: " << ops_thread << "\n";
+	std::cout << "Ops/sec       : " << ops_sec << "\n";
+	std::cout << "Total ops     : " << ops << "(" << global.in << "i, " << global.out << "o, " << global.empty << "e)\n";
+	#ifndef NO_STATS
+		LIST_VARIANT<Node>::stats_print(std::cout);
+	#endif
+}
+
+void save_fairness(const int data[], int factor, unsigned nthreads, size_t columns, size_t rows, const std::string & output);
+
+// ================================================================================================
+//                        EXPERIMENTS
+// ================================================================================================
+
+// ================================================================================================
+__attribute__((noinline)) void runChurn_body(
+	std::atomic<bool>& done,
+	Random & rand,
+	Node * my_nodes[],
+	unsigned nslots,
+	local_stat_t & local,
+	LIST_VARIANT<Node> & list
+) {
+	while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) {
+		int idx = rand.next() % nslots;
+		if (auto node = my_nodes[idx]) {
+			local.crc_in += node->value;
+			list.push(node);
+			my_nodes[idx] = nullptr;
+			local.in++;
+		}
+		else if(auto node = list.pop()) {
+			local.crc_out += node->value;
+			my_nodes[idx] = node;
+			local.out++;
+		}
+		else {
+			local.empty++;
+		}
+	}
+}
+
+void runChurn(unsigned nthread, unsigned nqueues, double duration, unsigned nnodes, const unsigned nslots) {
+	std::cout << "Churn Benchmark" << std::endl;
+	assert(nnodes <= nslots);
+	// List being tested
+
+	// Barrier for synchronization
+	barrier_t barrier(nthread + 1);
+
+	// Data to check everything is OK
+	global_stat_t global;
+
+	// Flag to signal termination
+	std::atomic_bool done  = { false };
+
+	// Prep nodes
+	std::cout << "Initializing ";
+	size_t npushed = 0;
+	LIST_VARIANT<Node> list = { nthread, nqueues };
+	{
+		Node** all_nodes[nthread];
+		for(auto & nodes : all_nodes) {
+			nodes = new __attribute__((aligned(64))) Node*[nslots + 8];
+			Random rand(rdtscl());
+			for(unsigned i = 0; i < nnodes; i++) {
+				nodes[i] = new Node(rand.next() % 100);
+			}
+
+			for(unsigned i = nnodes; i < nslots; i++) {
+				nodes[i] = nullptr;
+			}
+
+			for(int i = 0; i < 10 && i < (int)nslots; i++) {
+				int idx = rand.next() % nslots;
+				if (auto node = nodes[idx]) {
+					global.crc_in += node->value;
+					list.push(node);
+					npushed++;
+					nodes[idx] = nullptr;
+				}
+			}
+		}
+
+		std::cout << nnodes << " nodes (" << nslots << " slots)" << std::endl;
+
+		enable_stats = true;
+
+		std::thread * threads[nthread];
+		unsigned i = 1;
+		for(auto & t : threads) {
+			auto & my_nodes = all_nodes[i - 1];
+			t = new std::thread([&done, &list, &barrier, &global, &my_nodes, nslots](unsigned tid) {
+				Random rand(tid + rdtscl());
+
+				local_stat_t local;
+
+				// affinity(tid);
+
+				barrier.wait(tid);
+
+				// EXPERIMENT START
+
+				runChurn_body(done, rand, my_nodes, nslots, local, list);
+
+				// EXPERIMENT END
+
+				barrier.wait(tid);
+
+				tally_stats(global, local);
+
+				for(unsigned i = 0; i < nslots; i++) {
+					delete my_nodes[i];
+				}
+			}, i++);
+		}
+
+		waitfor(duration, barrier, done);
+
+		for(auto t : threads) {
+			t->join();
+			delete t;
+		}
+
+		enable_stats = false;
+
+		while(auto node = list.pop()) {
+			global.crc_out += node->value;
+			delete node;
+		}
+
+		for(auto nodes : all_nodes) {
+			delete[] nodes;
+		}
+	}
+
+	print_stats(duration, nthread, global);
+}
+
+// ================================================================================================
+__attribute__((noinline)) void runPingPong_body(
+	std::atomic<bool>& done,
+	Node initial_nodes[],
+	unsigned nnodes,
+	local_stat_t & local,
+	LIST_VARIANT<Node> & list
+) {
+	Node * nodes[nnodes];
+	{
+		unsigned i = 0;
+		for(auto & n : nodes) {
+			n = &initial_nodes[i++];
+		}
+	}
+
+	while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) {
+
+		for(Node * & node : nodes) {
+			local.crc_in += node->value;
+			list.push(node);
+			local.in++;
+		}
+
+		// -----
+
+		for(Node * & node : nodes) {
+			node = list.pop();
+			assert(node);
+			local.crc_out += node->value;
+			local.out++;
+		}
+	}
+}
+
+void runPingPong(unsigned nthread, unsigned nqueues, double duration, unsigned nnodes) {
+	std::cout << "PingPong Benchmark" << std::endl;
+
+
+	// Barrier for synchronization
+	barrier_t barrier(nthread + 1);
+
+	// Data to check everything is OK
+	global_stat_t global;
+
+	// Flag to signal termination
+	std::atomic_bool done  = { false };
+
+	std::cout << "Initializing ";
+	// List being tested
+	LIST_VARIANT<Node> list = { nthread, nqueues };
+	{
+		enable_stats = true;
+
+		std::thread * threads[nthread];
+		unsigned i = 1;
+		for(auto & t : threads) {
+			t = new std::thread([&done, &list, &barrier, &global, nnodes](unsigned tid) {
+				Random rand(tid + rdtscl());
+
+				Node nodes[nnodes];
+				for(auto & n : nodes) {
+					n.value = (int)rand.next() % 100;
+				}
+
+				local_stat_t local;
+
+				// affinity(tid);
+
+				barrier.wait(tid);
+
+				// EXPERIMENT START
+
+				runPingPong_body(done, nodes, nnodes, local, list);
+
+				// EXPERIMENT END
+
+				barrier.wait(tid);
+
+				tally_stats(global, local);
+			}, i++);
+		}
+
+		waitfor(duration, barrier, done);
+
+		for(auto t : threads) {
+			t->join();
+			delete t;
+		}
+
+		enable_stats = false;
+	}
+
+	print_stats(duration, nthread, global);
+}
+
+// ================================================================================================
+struct __attribute__((aligned(64))) Slot {
+	Node * volatile node;
+};
+
+__attribute__((noinline)) void runProducer_body(
+	std::atomic<bool>& done,
+	Random & rand,
+	Slot * slots,
+	int nslots,
+	local_stat_t & local,
+	LIST_VARIANT<Node> & list
+) {
+	while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) {
+
+		Node * node = list.pop();
+		if(!node) {
+			local.empty ++;
+			continue;
+		}
+
+		local.crc_out += node->value;
+		local.out++;
+
+		if(node->id == 0) {
+			unsigned cnt = 0;
+			for(int i = 0; i < nslots; i++) {
+				Node * found = __atomic_exchange_n( &slots[i].node, nullptr, __ATOMIC_SEQ_CST );
+				if( found ) {
+					local.crc_in += found->value;
+					local.in++;
+					cnt++;
+					list.push( found );
+				}
+			}
+
+			local.crc_in += node->value;
+			local.in++;
+			list.push( node );
+
+			local.comp.cnt++;
+			local.comp.val += cnt;
+		}
+		else {
+			unsigned len = 0;
+			while(true) {
+				auto off = rand.next();
+				for(int i = 0; i < nslots; i++) {
+					Node * expected = nullptr;
+					int idx = (i + off) % nslots;
+					Slot & slot = slots[ idx ];
+					if(
+						slot.node == nullptr &&
+						__atomic_compare_exchange_n( &slot.node, &expected, node, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST )
+					) {
+						local.subm.cnt++;
+						local.subm.val += len;
+						goto LOOP;
+					}
+					assert( expected != node );
+					len++;
+				}
+			}
+		}
+
+		LOOP:;
+	}
+}
+
+void runProducer(unsigned nthread, unsigned nqueues, double duration, unsigned nnodes) {
+	std::cout << "Producer Benchmark" << std::endl;
+
+	// Barrier for synchronization
+	barrier_t barrier(nthread + 1);
+
+	// Data to check everything is OK
+	global_stat_t global;
+
+	// Flag to signal termination
+	std::atomic_bool done  = { false };
+
+	std::cout << "Initializing ";
+
+	int nslots = nnodes * 4;
+	Slot * slots = new Slot[nslots];
+	std::cout << nnodes << " nodes (" << nslots << " slots)" << std::endl;
+
+	// List being tested
+	LIST_VARIANT<Node> list = { nthread, nqueues };
+	{
+		Random rand(rdtscl());
+		for(unsigned i = 0; i < nnodes; i++) {
+			Node * node = new Node(rand.next() % 100);
+			node->id = i;
+			global.crc_in += node->value;
+			list.push(node);
+		}
+
+		for(int i = 0; i < nslots; i++) {
+			slots[i].node = nullptr;
+		}
+	}
+
+	{
+		enable_stats = true;
+
+		std::thread * threads[nthread];
+		unsigned i = 1;
+		for(auto & t : threads) {
+			t = new std::thread([&done, &list, &barrier, &global, slots, nslots](unsigned tid) {
+				Random rand(tid + rdtscl());
+
+				local_stat_t local;
+				barrier.wait(tid);
+
+				// EXPERIMENT START
+
+				runProducer_body(done, rand, slots, nslots, local, list);
+
+				// EXPERIMENT END
+
+				barrier.wait(tid);
+
+				tally_stats(global, local);
+			}, i++);
+		}
+
+		waitfor(duration, barrier, done);
+
+		for(auto t : threads) {
+			t->join();
+			delete t;
+		}
+
+		enable_stats = false;
+	}
+
+	{
+		while(Node * node = list.pop()) {
+			global.crc_out += node->value;
+			delete node;
+		}
+
+		for(int i = 0; i < nslots; i++) {
+			delete slots[i].node;
+		}
+
+		delete [] slots;
+	}
+
+	print_stats(duration, nthread, global);
+}
+
+// ================================================================================================
+__attribute__((noinline)) void runFairness_body(
+	unsigned tid,
+	size_t width,
+	size_t length,
+	int output[],
+	std::atomic_size_t & count,
+	Node initial_nodes[],
+	unsigned nnodes,
+	local_stat_t & local,
+	LIST_VARIANT<Node> & list
+) {
+	Node * nodes[nnodes];
+	{
+		unsigned i = 0;
+		for(auto & n : nodes) {
+			n = &initial_nodes[i++];
+		}
+	}
+
+	while(__builtin_expect(0 != count.load(std::memory_order_relaxed), true)) {
+
+		for(Node * & node : nodes) {
+			local.crc_in += node->id;
+			list.push(node);
+			local.in++;
+		}
+
+		// -----
+
+		for(Node * & node : nodes) {
+			node = list.pop();
+			assert(node);
+
+			if (unsigned(node->value) < length) {
+				size_t idx = (node->value * width) + node->id;
+				assert(idx < (width * length));
+				output[idx] = tid;
+			}
+
+			node->value++;
+			if(unsigned(node->value) == length) count--;
+
+			local.crc_out += node->id;
+			local.out++;
+		}
+	}
+}
+
+void runFairness(unsigned nthread, unsigned nqueues, double duration, unsigned nnodes, const std::string & output) {
+	std::cout << "Fairness Benchmark, outputting to : " << output << std::endl;
+
+	// Barrier for synchronization
+	barrier_t barrier(nthread + 1);
+
+	// Data to check everything is OK
+	global_stat_t global;
+
+	std::cout << "Initializing ";
+
+	// Check fairness by creating a png of where the threads ran
+	size_t width = nthread * nnodes;
+	size_t length = 100000;
+
+	std::unique_ptr<int[]> data_out { new int[width * length] };
+
+	// Flag to signal termination
+	std::atomic_size_t count = width;
+
+	// List being tested
+	LIST_VARIANT<Node> list = { nthread, nqueues };
+	{
+		enable_stats = true;
+
+		std::thread * threads[nthread];
+		unsigned i = 1;
+		for(auto & t : threads) {
+			t = new std::thread([&count, &list, &barrier, &global, nnodes, width, length, data_out = data_out.get()](unsigned tid) {
+				unsigned int start = (tid - 1) * nnodes;
+				Node nodes[nnodes];
+				for(auto & n : nodes) {
+					n.id = start;
+					n.value = 0;
+					start++;
+				}
+
+				local_stat_t local;
+
+				// affinity(tid);
+
+				barrier.wait(tid);
+
+				// EXPERIMENT START
+
+				runFairness_body(tid, width, length, data_out, count, nodes, nnodes, local, list);
+
+				// EXPERIMENT END
+
+				barrier.wait(tid);
+
+				for(const auto & n : nodes) {
+					local.valmax = max(local.valmax, size_t(n.value));
+					local.valmin = min(local.valmin, size_t(n.value));
+				}
+
+				tally_stats(global, local);
+			}, i++);
+		}
+
+		waitfor(duration, barrier, count);
+
+		for(auto t : threads) {
+			t->join();
+			delete t;
+		}
+
+		enable_stats = false;
+	}
+
+	print_stats(duration, nthread, global);
+
+	// save_fairness(data_out.get(), 100, nthread, width, length, output);
+}
+
+// ================================================================================================
+
+bool iequals(const std::string& a, const std::string& b)
+{
+    return std::equal(a.begin(), a.end(),
+                      b.begin(), b.end(),
+                      [](char a, char b) {
+                          return std::tolower(a) == std::tolower(b);
+                      });
+}
+
+int main(int argc, char * argv[]) {
+
+	double duration   = 5.0;
+	unsigned nthreads = 2;
+	unsigned nqueues  = 4;
+	unsigned nnodes   = 100;
+	unsigned nslots   = 100;
+	std::string out   = "fairness.png";
+
+	enum {
+		Churn,
+		PingPong,
+		Producer,
+		Fairness,
+		NONE
+	} benchmark = NONE;
+
+	std::cout.imbue(std::locale(""));
+
+	for(;;) {
+		static struct option options[] = {
+			{"duration",  required_argument, 0, 'd'},
+			{"nthreads",  required_argument, 0, 't'},
+			{"nqueues",   required_argument, 0, 'q'},
+			{"benchmark", required_argument, 0, 'b'},
+			{0, 0, 0, 0}
+		};
+
+		int idx = 0;
+		int opt = getopt_long(argc, argv, "d:t:q:b:", options, &idx);
+
+		std::string arg = optarg ? optarg : "";
+		size_t len = 0;
+		switch(opt) {
+			// Exit Case
+			case -1:
+				/* paranoid */ assert(optind <= argc);
+				switch(benchmark) {
+				case NONE:
+					std::cerr << "Must specify a benchmark" << std::endl;
+					goto usage;
+				case PingPong:
+					nnodes = 1;
+					switch(argc - optind) {
+					case 0: break;
+					case 1:
+						try {
+							arg = optarg = argv[optind];
+							nnodes = stoul(optarg, &len);
+							if(len != arg.size()) { throw std::invalid_argument(""); }
+						} catch(std::invalid_argument &) {
+							std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl;
+							goto usage;
+						}
+						break;
+					default:
+						std::cerr << "'PingPong' benchmark doesn't accept more than 1 extra argument" << std::endl;
+						goto usage;
+					}
+					break;
+				case Producer:
+					nnodes = 32;
+					switch(argc - optind) {
+					case 0: break;
+					case 1:
+						try {
+							arg = optarg = argv[optind];
+							nnodes = stoul(optarg, &len);
+							if(len != arg.size()) { throw std::invalid_argument(""); }
+						} catch(std::invalid_argument &) {
+							std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl;
+							goto usage;
+						}
+						break;
+					default:
+						std::cerr << "'Producer' benchmark doesn't accept more than 1 extra argument" << std::endl;
+						goto usage;
+					}
+					break;
+				case Churn:
+					nnodes = 100;
+					nslots = 100;
+					switch(argc - optind) {
+					case 0: break;
+					case 1:
+						try {
+							arg = optarg = argv[optind];
+							nnodes = stoul(optarg, &len);
+							if(len != arg.size()) { throw std::invalid_argument(""); }
+							nslots = nnodes;
+						} catch(std::invalid_argument &) {
+							std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl;
+							goto usage;
+						}
+						break;
+					case 2:
+						try {
+							arg = optarg = argv[optind];
+							nnodes = stoul(optarg, &len);
+							if(len != arg.size()) { throw std::invalid_argument(""); }
+						} catch(std::invalid_argument &) {
+							std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl;
+							goto usage;
+						}
+						try {
+							arg = optarg = argv[optind + 1];
+							nslots = stoul(optarg, &len);
+							if(len != arg.size()) { throw std::invalid_argument(""); }
+						} catch(std::invalid_argument &) {
+							std::cerr << "Number of slots must be a positive integer, was " << arg << std::endl;
+							goto usage;
+						}
+						break;
+					default:
+						std::cerr << "'Churn' benchmark doesn't accept more than 2 extra arguments" << std::endl;
+						goto usage;
+					}
+					break;
+				case Fairness:
+					nnodes = 1;
+					switch(argc - optind) {
+					case 0: break;
+					case 1:
+						arg = optarg = argv[optind];
+						out = arg;
+						break;
+					default:
+						std::cerr << "'Fairness' benchmark doesn't accept more than 1 extra argument" << std::endl;
+						goto usage;
+					}
+				}
+				goto run;
+			// Benchmarks
+			case 'b':
+				if(benchmark != NONE) {
+					std::cerr << "Only one benchmark can be run" << std::endl;
+					goto usage;
+				}
+				if(iequals(arg, "churn")) {
+					benchmark = Churn;
+					break;
+				}
+				if(iequals(arg, "pingpong")) {
+					benchmark = PingPong;
+					break;
+				}
+				if(iequals(arg, "producer")) {
+					benchmark = Producer;
+					break;
+				}
+				if(iequals(arg, "fairness")) {
+					benchmark = Fairness;
+					break;
+				}
+				std::cerr << "Unknown benchmark " << arg << std::endl;
+				goto usage;
+			// Numeric Arguments
+			case 'd':
+				try {
+					duration = stod(optarg, &len);
+					if(len != arg.size()) { throw std::invalid_argument(""); }
+				} catch(std::invalid_argument &) {
+					std::cerr << "Duration must be a valid double, was " << arg << std::endl;
+					goto usage;
+				}
+				break;
+			case 't':
+				try {
+					nthreads = stoul(optarg, &len);
+					if(len != arg.size()) { throw std::invalid_argument(""); }
+				} catch(std::invalid_argument &) {
+					std::cerr << "Number of threads must be a positive integer, was " << arg << std::endl;
+					goto usage;
+				}
+				break;
+			case 'q':
+				try {
+					nqueues = stoul(optarg, &len);
+					if(len != arg.size()) { throw std::invalid_argument(""); }
+				} catch(std::invalid_argument &) {
+					std::cerr << "Number of queues must be a positive integer, was " << arg << std::endl;
+					goto usage;
+				}
+				break;
+			// Other cases
+			default: /* ? */
+				std::cerr << opt << std::endl;
+			usage:
+				std::cerr << "Usage: " << argv[0] << ": [options] -b churn [NNODES] [NSLOTS = NNODES]" << std::endl;
+				std::cerr << "  or:  " << argv[0] << ": [options] -b pingpong [NNODES]" << std::endl;
+				std::cerr << "  or:  " << argv[0] << ": [options] -b producer [NNODES]" << std::endl;
+				std::cerr << std::endl;
+				std::cerr << "  -d, --duration=DURATION  Duration of the experiment, in seconds" << std::endl;
+				std::cerr << "  -t, --nthreads=NTHREADS  Number of kernel threads" << std::endl;
+				std::cerr << "  -q, --nqueues=NQUEUES    Number of queues per threads" << std::endl;
+				std::exit(1);
+		}
+	}
+	run:
+
+	check_cache_line_size();
+
+	std::cout << "Running " << nthreads << " threads (" << (nthreads * nqueues) << " queues) for " << duration << " seconds" << std::endl;
+	std::cout << "Relaxed list variant: " << LIST_VARIANT<Node>::name() << std::endl;
+	switch(benchmark) {
+		case Churn:
+			runChurn(nthreads, nqueues, duration, nnodes, nslots);
+			break;
+		case PingPong:
+			runPingPong(nthreads, nqueues, duration, nnodes);
+			break;
+		case Producer:
+			runProducer(nthreads, nqueues, duration, nnodes);
+			break;
+		case Fairness:
+			runFairness(nthreads, nqueues, duration, nnodes, out);
+			break;
+		default:
+			abort();
+	}
+	return 0;
+}
+
+const char * __my_progname = "Relaxed List";
+
+struct rgb_t {
+    double r;       // a fraction between 0 and 1
+    double g;       // a fraction between 0 and 1
+    double b;       // a fraction between 0 and 1
+};
+
+struct hsv_t {
+    double h;       // angle in degrees
+    double s;       // a fraction between 0 and 1
+    double v;       // a fraction between 0 and 1
+};
+
+rgb_t hsv2rgb(hsv_t in) {
+	double hh, p, q, t, ff;
+	long   i;
+	rgb_t  out;
+
+	if(in.s <= 0.0) {       // < is bogus, just shuts up warnings
+		out.r = in.v;
+		out.g = in.v;
+		out.b = in.v;
+		return out;
+	}
+	hh = in.h;
+	if(hh >= 360.0) hh = 0.0;
+	hh /= 60.0;
+	i = (long)hh;
+	ff = hh - i;
+	p = in.v * (1.0 - in.s);
+	q = in.v * (1.0 - (in.s * ff));
+	t = in.v * (1.0 - (in.s * (1.0 - ff)));
+
+	switch(i) {
+	case 0:
+		out.r = in.v;
+		out.g = t;
+		out.b = p;
+		break;
+	case 1:
+		out.r = q;
+		out.g = in.v;
+		out.b = p;
+		break;
+	case 2:
+		out.r = p;
+		out.g = in.v;
+		out.b = t;
+		break;
+
+	case 3:
+		out.r = p;
+		out.g = q;
+		out.b = in.v;
+		break;
+	case 4:
+		out.r = t;
+		out.g = p;
+		out.b = in.v;
+		break;
+	case 5:
+	default:
+		out.r = in.v;
+		out.g = p;
+		out.b = q;
+		break;
+	}
+	return out;
+}
+
+// void save_fairness(const int data[], int factor, unsigned nthreads, size_t columns, size_t rows, const std::string & output) {
+// 	std::ofstream os(output);
+// 	os << "<html>\n";
+// 	os << "<head>\n";
+// 	os << "<style>\n";
+// 	os << "</style>\n";
+// 	os << "</head>\n";
+// 	os << "<body>\n";
+// 	os << "<table style=\"width=100%\">\n";
+
+// 	size_t idx = 0;
+// 	for(size_t r = 0ul; r < rows; r++) {
+// 		os << "<tr>\n";
+// 		for(size_t c = 0ul; c < columns; c++) {
+// 			os << "<td class=\"custom custom" << data[idx] << "\"></td>\n";
+// 			idx++;
+// 		}
+// 		os << "</tr>\n";
+// 	}
+
+// 	os << "</table>\n";
+// 	os << "</body>\n";
+// 	os << "</html>\n";
+// 	os << std::endl;
+// }
+
+// #include <png.h>
+// #include <setjmp.h>
+
+/*
+void save_fairness(const int data[], int factor, unsigned nthreads, size_t columns, size_t rows, const std::string & output) {
+	int width  = columns * factor;
+	int height = rows / factor;
+
+	int code = 0;
+	int idx = 0;
+	FILE *fp = NULL;
+	png_structp png_ptr = NULL;
+	png_infop info_ptr = NULL;
+	png_bytep row = NULL;
+
+	// Open file for writing (binary mode)
+	fp = fopen(output.c_str(), "wb");
+	if (fp == NULL) {
+		fprintf(stderr, "Could not open file %s for writing\n", output.c_str());
+		code = 1;
+		goto finalise;
+	}
+
+	   // Initialize write structure
+	png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
+	if (png_ptr == NULL) {
+		fprintf(stderr, "Could not allocate write struct\n");
+		code = 1;
+		goto finalise;
+	}
+
+	// Initialize info structure
+	info_ptr = png_create_info_struct(png_ptr);
+	if (info_ptr == NULL) {
+		fprintf(stderr, "Could not allocate info struct\n");
+		code = 1;
+		goto finalise;
+	}
+
+	// Setup Exception handling
+	if (setjmp(png_jmpbuf(png_ptr))) {
+		fprintf(stderr, "Error during png creation\n");
+		code = 1;
+		goto finalise;
+	}
+
+	png_init_io(png_ptr, fp);
+
+	// Write header (8 bit colour depth)
+	png_set_IHDR(png_ptr, info_ptr, width, height,
+		8, PNG_COLOR_TYPE_RGB, PNG_INTERLACE_NONE,
+		PNG_COMPRESSION_TYPE_BASE, PNG_FILTER_TYPE_BASE);
+
+	png_write_info(png_ptr, info_ptr);
+
+	// Allocate memory for one row (3 bytes per pixel - RGB)
+	row = (png_bytep) malloc(3 * width * sizeof(png_byte));
+
+	// Write image data
+	int x, y;
+	for (y=0 ; y<height ; y++) {
+		for (x=0 ; x<width ; x++) {
+			auto & r = row[(x * 3) + 0];
+			auto & g = row[(x * 3) + 1];
+			auto & b = row[(x * 3) + 2];
+			assert(idx < (rows * columns));
+			int color = data[idx] - 1;
+			assert(color < nthreads);
+			assert(color >= 0);
+			idx++;
+
+			double angle = double(color) / double(nthreads);
+
+			auto c = hsv2rgb({ 360.0 * angle, 0.8, 0.8 });
+
+			r = char(c.r * 255.0);
+			g = char(c.g * 255.0);
+			b = char(c.b * 255.0);
+
+		}
+		png_write_row(png_ptr, row);
+	}
+
+	assert(idx == (rows * columns));
+
+	// End write
+	png_write_end(png_ptr, NULL);
+
+	finalise:
+	if (fp != NULL) fclose(fp);
+	if (info_ptr != NULL) png_free_data(png_ptr, info_ptr, PNG_FREE_ALL, -1);
+	if (png_ptr != NULL) png_destroy_write_struct(&png_ptr, (png_infopp)NULL);
+	if (row != NULL) free(row);
+}
+*/
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/relaxed_list.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/relaxed_list.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/relaxed_list.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,555 @@
+#pragma once
+#define LIST_VARIANT relaxed_list
+
+#define VANILLA 0
+#define SNZI 1
+#define BITMASK 2
+#define DISCOVER 3
+#define SNZM 4
+#define BIAS 5
+#define BACK 6
+#define BACKBIAS 7
+
+#ifndef VARIANT
+#define VARIANT VANILLA
+#endif
+
+#ifndef NO_STATS
+#include <iostream>
+#endif
+
+#include <cmath>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <thread>
+#include <type_traits>
+
+#include "assert.hpp"
+#include "utils.hpp"
+#include "links.hpp"
+#include "snzi.hpp"
+#include "snzi-packed.hpp"
+#include "snzm.hpp"
+
+using namespace std;
+
+struct pick_stat {
+	struct {
+		size_t attempt = 0;
+		size_t success = 0;
+		size_t local = 0;
+	} push;
+	struct {
+		size_t attempt = 0;
+		size_t success = 0;
+		size_t mask_attempt = 0;
+		size_t mask_reset = 0;
+		size_t local = 0;
+	} pop;
+};
+
+struct empty_stat {
+	struct {
+		size_t value = 0;
+		size_t count = 0;
+	} push;
+	struct {
+		size_t value = 0;
+		size_t count = 0;
+	} pop;
+};
+
+template<typename node_t>
+class __attribute__((aligned(128))) relaxed_list {
+	static_assert(std::is_same<decltype(node_t::_links), _LinksFields_t<node_t>>::value, "Node must have a links field");
+
+public:
+	static const char * name() {
+		const char * names[] = {
+			"RELAXED: VANILLA",
+			"RELAXED: SNZI",
+			"RELAXED: BITMASK",
+			"RELAXED: SNZI + DISCOVERED MASK",
+			"RELAXED: SNZI + MASK",
+			"RELAXED: SNZI + LOCAL BIAS",
+			"RELAXED: SNZI + REVERSE RNG",
+			"RELAXED: SNZI + LOCAL BIAS + REVERSE RNG"
+		};
+		return names[VARIANT];
+	}
+
+	relaxed_list(unsigned numThreads, unsigned numQueues)
+		: numLists(numThreads * numQueues)
+	  	, lists(new intrusive_queue_t<node_t>[numLists])
+		#if VARIANT == SNZI || VARIANT == BACK
+			, snzi( std::log2( numLists / (2 * numQueues) ), 2 )
+		#elif VARIANT == BIAS || VARIANT == BACKBIAS
+			#ifdef SNZI_PACKED
+				, snzi( std::ceil( std::log2(numLists) ) )
+			#else
+				, snzi( std::log2( numLists / (2 * numQueues) ), 2 )
+			#endif
+		#elif VARIANT == SNZM || VARIANT == DISCOVER
+			, snzm( numLists )
+		#endif
+	{
+		assertf(7 * 8 * 8 >= numLists, "List currently only supports 448 sublists");
+		std::cout << "Constructing Relaxed List with " << numLists << std::endl;
+	}
+
+	~relaxed_list() {
+		std::cout << "Destroying Relaxed List" << std::endl;
+		lists.reset();
+	}
+
+    	__attribute__((noinline, hot)) void push(node_t * node) {
+		node->_links.ts = rdtscl();
+
+		while(true) {
+			// Pick a random list
+			unsigned i = idx_from_r(tls.rng1.next(), VARIANT == BIAS || VARIANT == BACKBIAS);
+
+			#ifndef NO_STATS
+				tls.pick.push.attempt++;
+			#endif
+
+			// If we can't lock it retry
+			if( !lists[i].lock.try_lock() ) continue;
+
+			#if VARIANT == VANILLA || VARIANT == BITMASK
+				__attribute__((unused)) int num = numNonEmpty;
+			#endif
+
+			// Actually push it
+			if(lists[i].push(node)) {
+				#if VARIANT == DISCOVER
+					size_t qword = i >> 6ull;
+					size_t bit   = i & 63ull;
+					assert(qword == 0);
+					bts(tls.mask, bit);
+					snzm.arrive(i);
+				#elif VARIANT == SNZI || VARIANT == BIAS
+					snzi.arrive(i);
+				#elif VARIANT == BACK || VARIANT == BACKBIAS
+					snzi.arrive(i);
+					tls.rng2.set_raw_state( tls.rng1.get_raw_state());
+				#elif VARIANT == SNZM
+					snzm.arrive(i);
+				#elif VARIANT == BITMASK
+					numNonEmpty++;
+					size_t qword = i >> 6ull;
+					size_t bit   = i & 63ull;
+					assertf((list_mask[qword] & (1ul << bit)) == 0, "Before set %zu:%zu (%u), %zx & %zx", qword, bit, i, list_mask[qword].load(), (1ul << bit));
+					__attribute__((unused)) bool ret = bts(list_mask[qword], bit);
+					assert(!ret);
+					assertf((list_mask[qword] & (1ul << bit)) != 0, "After set %zu:%zu (%u), %zx & %zx", qword, bit, i, list_mask[qword].load(), (1ul << bit));
+				#else
+					numNonEmpty++;
+				#endif
+			}
+			#if VARIANT == VANILLA || VARIANT == BITMASK
+				assert(numNonEmpty <= (int)numLists);
+			#endif
+
+			// Unlock and return
+			lists[i].lock.unlock();
+
+			#ifndef NO_STATS
+				tls.pick.push.success++;
+				#if VARIANT == VANILLA || VARIANT == BITMASK
+					tls.empty.push.value += num;
+					tls.empty.push.count += 1;
+				#endif
+			#endif
+			return;
+		}
+    	}
+
+	__attribute__((noinline, hot)) node_t * pop() {
+		#if VARIANT == DISCOVER
+			assert(numLists <= 64);
+			while(snzm.query()) {
+				tls.pick.pop.mask_attempt++;
+				unsigned i, j;
+				{
+					// Pick first list totally randomly
+					i = tls.rng1.next() % numLists;
+
+					// Pick the other according to the bitmask
+					unsigned r = tls.rng1.next();
+
+					size_t mask = tls.mask.load(std::memory_order_relaxed);
+					if(mask == 0) {
+						tls.pick.pop.mask_reset++;
+						mask = (1ull << numLists) - 1;
+						tls.mask.store(mask, std::memory_order_relaxed);
+					}
+
+					unsigned b = rand_bit(r, mask);
+
+					assertf(b < 64, "%zu %u", mask, b);
+
+					j = b;
+
+					assert(j < numLists);
+				}
+
+				if(auto node = try_pop(i, j)) return node;
+			}
+		#elif VARIANT == SNZI
+			while(snzi.query()) {
+				// Pick two lists at random
+				int i = tls.rng1.next() % numLists;
+				int j = tls.rng1.next() % numLists;
+
+				if(auto node = try_pop(i, j)) return node;
+			}
+
+		#elif VARIANT == BACK
+			while(snzi.query()) {
+				// Pick two lists at random
+				int i = tls.rng2.prev() % numLists;
+				int j = tls.rng2.prev() % numLists;
+
+				if(auto node = try_pop(i, j)) return node;
+			}
+
+		#elif VARIANT == BACKBIAS
+			while(snzi.query()) {
+				// Pick two lists at random
+				int i = idx_from_r(tls.rng2.prev(), true);
+				int j = idx_from_r(tls.rng2.prev(), true);
+
+				if(auto node = try_pop(i, j)) return node;
+			}
+
+		#elif VARIANT == BIAS
+			while(snzi.query()) {
+				// Pick two lists at random
+				unsigned ri = tls.rng1.next();
+				unsigned i;
+				unsigned j = tls.rng1.next();
+				if(0 == (ri & 0xF)) {
+					i = (ri >> 4) % numLists;
+				} else {
+					i = tls.my_queue + ((ri >> 4) % 4);
+					j = tls.my_queue + ((j >> 4) % 4);
+					tls.pick.pop.local++;
+				}
+				i %= numLists;
+				j %= numLists;
+
+				if(auto node = try_pop(i, j)) return node;
+			}
+		#elif VARIANT == SNZM
+			//*
+			while(snzm.query()) {
+				tls.pick.pop.mask_attempt++;
+				unsigned i, j;
+				{
+					// Pick two random number
+					unsigned ri = tls.rng1.next();
+					unsigned rj = tls.rng1.next();
+
+					// Pick two nodes from it
+					unsigned wdxi = ri & snzm.mask;
+					unsigned wdxj = rj & snzm.mask;
+
+					// Get the masks from the nodes
+					size_t maski = snzm.masks(wdxi);
+					size_t maskj = snzm.masks(wdxj);
+
+					if(maski == 0 && maskj == 0) continue;
+
+					#if defined(__BMI2__)
+						uint64_t idxsi = _pext_u64(snzm.indexes, maski);
+						uint64_t idxsj = _pext_u64(snzm.indexes, maskj);
+
+						auto pi = __builtin_popcountll(maski);
+						auto pj = __builtin_popcountll(maskj);
+
+						ri = pi ? ri & ((pi >> 3) - 1) : 0;
+						rj = pj ? rj & ((pj >> 3) - 1) : 0;
+
+						unsigned bi = (idxsi >> (ri << 3)) & 0xff;
+						unsigned bj = (idxsj >> (rj << 3)) & 0xff;
+					#else
+						unsigned bi = rand_bit(ri >> snzm.depth, maski);
+						unsigned bj = rand_bit(rj >> snzm.depth, maskj);
+					#endif
+
+					i = (bi << snzm.depth) | wdxi;
+					j = (bj << snzm.depth) | wdxj;
+
+					/* paranoid */ assertf(i < numLists, "%u %u", bj, wdxi);
+					/* paranoid */ assertf(j < numLists, "%u %u", bj, wdxj);
+				}
+
+				if(auto node = try_pop(i, j)) return node;
+			}
+			/*/
+			while(snzm.query()) {
+				// Pick two lists at random
+				int i = tls.rng1.next() % numLists;
+				int j = tls.rng1.next() % numLists;
+
+				if(auto node = try_pop(i, j)) return node;
+			}
+			//*/
+		#elif VARIANT == BITMASK
+			int nnempty;
+			while(0 != (nnempty = numNonEmpty)) {
+				tls.pick.pop.mask_attempt++;
+				unsigned i, j;
+				{
+					// Pick two lists at random
+					unsigned num = ((numLists - 1) >> 6) + 1;
+
+					unsigned ri = tls.rng1.next();
+					unsigned rj = tls.rng1.next();
+
+					unsigned wdxi = (ri >> 6u) % num;
+					unsigned wdxj = (rj >> 6u) % num;
+
+					size_t maski = list_mask[wdxi].load(std::memory_order_relaxed);
+					size_t maskj = list_mask[wdxj].load(std::memory_order_relaxed);
+
+					if(maski == 0 && maskj == 0) continue;
+
+					unsigned bi = rand_bit(ri, maski);
+					unsigned bj = rand_bit(rj, maskj);
+
+					assertf(bi < 64, "%zu %u", maski, bi);
+					assertf(bj < 64, "%zu %u", maskj, bj);
+
+					i = bi | (wdxi << 6);
+					j = bj | (wdxj << 6);
+
+					assertf(i < numLists, "%u", wdxi << 6);
+					assertf(j < numLists, "%u", wdxj << 6);
+				}
+
+				if(auto node = try_pop(i, j)) return node;
+			}
+		#else
+			while(numNonEmpty != 0) {
+				// Pick two lists at random
+				int i = tls.rng1.next() % numLists;
+				int j = tls.rng1.next() % numLists;
+
+				if(auto node = try_pop(i, j)) return node;
+			}
+		#endif
+
+		return nullptr;
+    	}
+
+private:
+	node_t * try_pop(unsigned i, unsigned j) {
+		#ifndef NO_STATS
+			tls.pick.pop.attempt++;
+		#endif
+
+		#if VARIANT == DISCOVER
+			if(lists[i].ts() > 0) bts(tls.mask, i); else btr(tls.mask, i);
+			if(lists[j].ts() > 0) bts(tls.mask, j); else btr(tls.mask, j);
+		#endif
+
+		// Pick the best list
+		int w = i;
+		if( __builtin_expect(lists[j].ts() != 0, true) ) {
+			w = (lists[i].ts() < lists[j].ts()) ? i : j;
+		}
+
+		auto & list = lists[w];
+		// If list looks empty retry
+		if( list.ts() == 0 ) return nullptr;
+
+		// If we can't get the lock retry
+		if( !list.lock.try_lock() ) return nullptr;
+
+		#if VARIANT == VANILLA || VARIANT == BITMASK
+			__attribute__((unused)) int num = numNonEmpty;
+		#endif
+
+		// If list is empty, unlock and retry
+		if( list.ts() == 0 ) {
+			list.lock.unlock();
+			return nullptr;
+		}
+
+		// Actually pop the list
+		node_t * node;
+		bool emptied;
+		std::tie(node, emptied) = list.pop();
+		assert(node);
+
+		if(emptied) {
+			#if VARIANT == DISCOVER
+				size_t qword = w >> 6ull;
+				size_t bit   = w & 63ull;
+				assert(qword == 0);
+				__attribute__((unused)) bool ret = btr(tls.mask, bit);
+				snzm.depart(w);
+			#elif VARIANT == SNZI || VARIANT == BIAS || VARIANT == BACK || VARIANT == BACKBIAS
+				snzi.depart(w);
+			#elif VARIANT == SNZM
+				snzm.depart(w);
+			#elif VARIANT == BITMASK
+				numNonEmpty--;
+				size_t qword = w >> 6ull;
+				size_t bit   = w & 63ull;
+				assert((list_mask[qword] & (1ul << bit)) != 0);
+				__attribute__((unused)) bool ret = btr(list_mask[qword], bit);
+				assert(ret);
+				assert((list_mask[qword] & (1ul << bit)) == 0);
+			#else
+				numNonEmpty--;
+			#endif
+		}
+
+		// Unlock and return
+		list.lock.unlock();
+		#if VARIANT == VANILLA || VARIANT == BITMASK
+			assert(numNonEmpty >= 0);
+		#endif
+		#ifndef NO_STATS
+			tls.pick.pop.success++;
+			#if VARIANT == VANILLA || VARIANT == BITMASK
+				tls.empty.pop.value += num;
+				tls.empty.pop.count += 1;
+			#endif
+		#endif
+		return node;
+	}
+
+	inline unsigned idx_from_r(unsigned r, bool bias) {
+		unsigned i;
+		if(bias) {
+			if(0 == (r & 0x3F)) {
+				i = r >> 6;
+			} else {
+				i = tls.my_queue + ((r >> 6) % 4);
+				tls.pick.push.local++;
+			}
+		} else {
+			i = r;
+		}
+		return i % numLists;
+	}
+
+public:
+
+	static __attribute__((aligned(128))) thread_local struct TLS {
+		Random     rng1 = { unsigned(std::hash<std::thread::id>{}(std::this_thread::get_id()) ^ rdtscl()) };
+		Random     rng2 = { unsigned(std::hash<std::thread::id>{}(std::this_thread::get_id()) ^ rdtscl()) };
+		unsigned   my_queue = (ticket++) * 4;
+		pick_stat  pick;
+		empty_stat empty;
+		__attribute__((aligned(64))) std::atomic_size_t mask = { 0 };
+	} tls;
+
+private:
+	const unsigned numLists;
+    	__attribute__((aligned(64))) std::unique_ptr<intrusive_queue_t<node_t> []> lists;
+private:
+	#if VARIANT == SNZI || VARIANT == BACK
+		snzi_t snzi;
+	#elif VARIANT == BIAS || VARIANT == BACKBIAS
+		#ifdef SNZI_PACKED
+			snzip_t snzi;
+		#else
+			snzi_t snzi;
+		#endif
+	#elif VARIANT == SNZM || VARIANT == DISCOVER
+		snzm_t snzm;
+	#else
+		std::atomic_int numNonEmpty  = { 0 };  // number of non-empty lists
+	#endif
+	#if VARIANT == BITMASK
+		std::atomic_size_t list_mask[7] = { {0}, {0}, {0}, {0}, {0}, {0}, {0} }; // which queues are empty
+	#endif
+
+public:
+	static const constexpr size_t sizeof_queue = sizeof(intrusive_queue_t<node_t>);
+	static std::atomic_uint32_t ticket;
+
+#ifndef NO_STATS
+	static void stats_tls_tally() {
+		global_stats.pick.push.attempt += tls.pick.push.attempt;
+		global_stats.pick.push.success += tls.pick.push.success;
+		global_stats.pick.push.local += tls.pick.push.local;
+		global_stats.pick.pop .attempt += tls.pick.pop.attempt;
+		global_stats.pick.pop .success += tls.pick.pop.success;
+		global_stats.pick.pop .mask_attempt += tls.pick.pop.mask_attempt;
+		global_stats.pick.pop .mask_reset += tls.pick.pop.mask_reset;
+		global_stats.pick.pop .local += tls.pick.pop.local;
+
+		global_stats.qstat.push.value += tls.empty.push.value;
+		global_stats.qstat.push.count += tls.empty.push.count;
+		global_stats.qstat.pop .value += tls.empty.pop .value;
+		global_stats.qstat.pop .count += tls.empty.pop .count;
+	}
+
+private:
+	static struct GlobalStats {
+		struct {
+			struct {
+				std::atomic_size_t attempt = { 0 };
+				std::atomic_size_t success = { 0 };
+				std::atomic_size_t local = { 0 };
+			} push;
+			struct {
+				std::atomic_size_t attempt = { 0 };
+				std::atomic_size_t success = { 0 };
+				std::atomic_size_t mask_attempt = { 0 };
+				std::atomic_size_t mask_reset = { 0 };
+				std::atomic_size_t local = { 0 };
+			} pop;
+		} pick;
+		struct {
+			struct {
+				std::atomic_size_t value = { 0 };
+				std::atomic_size_t count = { 0 };
+			} push;
+			struct {
+				std::atomic_size_t value = { 0 };
+				std::atomic_size_t count = { 0 };
+			} pop;
+		} qstat;
+	} global_stats;
+
+public:
+	static void stats_print(std::ostream & os ) {
+		std::cout << "----- Relaxed List Stats -----" << std::endl;
+
+		const auto & global = global_stats;
+
+		double push_sur = (100.0 * double(global.pick.push.success) / global.pick.push.attempt);
+		double pop_sur  = (100.0 * double(global.pick.pop .success) / global.pick.pop .attempt);
+		double mpop_sur = (100.0 * double(global.pick.pop .success) / global.pick.pop .mask_attempt);
+		double rpop_sur = (100.0 * double(global.pick.pop .success) / global.pick.pop .mask_reset);
+
+		double push_len = double(global.pick.push.attempt     ) / global.pick.push.success;
+		double pop_len  = double(global.pick.pop .attempt     ) / global.pick.pop .success;
+		double mpop_len = double(global.pick.pop .mask_attempt) / global.pick.pop .success;
+		double rpop_len = double(global.pick.pop .mask_reset  ) / global.pick.pop .success;
+
+		os << "Push   Pick   : " << push_sur << " %, len " << push_len << " (" << global.pick.push.attempt      << " / " << global.pick.push.success << ")\n";
+		os << "Pop    Pick   : " << pop_sur  << " %, len " << pop_len  << " (" << global.pick.pop .attempt      << " / " << global.pick.pop .success << ")\n";
+		os << "TryPop Pick   : " << mpop_sur << " %, len " << mpop_len << " (" << global.pick.pop .mask_attempt << " / " << global.pick.pop .success << ")\n";
+		os << "Pop M Reset   : " << rpop_sur << " %, len " << rpop_len << " (" << global.pick.pop .mask_reset   << " / " << global.pick.pop .success << ")\n";
+
+		double avgQ_push = double(global.qstat.push.value) / global.qstat.push.count;
+		double avgQ_pop  = double(global.qstat.pop .value) / global.qstat.pop .count;
+		double avgQ      = double(global.qstat.push.value + global.qstat.pop .value) / (global.qstat.push.count + global.qstat.pop .count);
+		os << "Push   Avg Qs : " << avgQ_push << " (" << global.qstat.push.count << "ops)\n";
+		os << "Pop    Avg Qs : " << avgQ_pop  << " (" << global.qstat.pop .count << "ops)\n";
+		os << "Global Avg Qs : " << avgQ      << " (" << (global.qstat.push.count + global.qstat.pop .count) << "ops)\n";
+
+		os << "Local Push    : " << global.pick.push.local << "\n";
+		os << "Local Pop     : " << global.pick.pop .local << "\n";
+	}
+#endif
+};
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/relaxed_list_layout.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/relaxed_list_layout.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/relaxed_list_layout.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,23 @@
+#define NO_IO
+#define NDEBUG
+#include "relaxed_list.hpp"
+
+struct __attribute__((aligned(64))) Node {
+	static std::atomic_size_t creates;
+	static std::atomic_size_t destroys;
+
+	_LinksFields_t<Node> _links;
+
+	int value;
+	Node(int value): value(value) {
+		creates++;
+	}
+
+	~Node() {
+		destroys++;
+	}
+};
+
+int main() {
+	return sizeof(relaxed_list<Node>) + relaxed_list<Node>::sizeof_queue;
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/runperf.sh
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/runperf.sh	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/runperf.sh	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -e
+
+name=$1
+event=$2
+
+shift 2
+
+echo "perf record -F 99 -a -g -o raw/$name.data -e $event -- $@ > raw/$name.out"
+perf record -F 99 -a -g -o raw/$name.data -e $event -- "$@" > raw/$name.out
+echo "=============================="
+cat raw/$name.out
+echo "=============================="
+./process.sh $name
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/scale.sh
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/scale.sh	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/scale.sh	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,7 @@
+#!/bin/bash
+taskset -c 24-31 ./a.out -t  1 -b churn | grep --color -E "(ns|Ops|Running)"
+taskset -c 24-31 ./a.out -t  2 -b churn | grep --color -E "(ns|Ops|Running)"
+taskset -c 24-31 ./a.out -t  4 -b churn | grep --color -E "(ns|Ops|Running)"
+taskset -c 24-31 ./a.out -t  8 -b churn | grep --color -E "(ns|Ops|Running)"
+taskset -c 16-31 ./a.out -t 16 -b churn | grep --color -E "(ns|Ops|Running)"
+taskset -c  0-31 ./a.out -t 32 -b churn | grep --color -E "(ns|Ops|Running)"
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/snzi-packed.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/snzi-packed.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/snzi-packed.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,179 @@
+#pragma once
+
+#define SNZI_PACKED
+
+#include "utils.hpp"
+
+
+class snzip_t {
+	class node;
+	class node_aligned;
+public:
+	const unsigned mask;
+	const int root;
+	std::unique_ptr<snzip_t::node[]> leafs;
+	std::unique_ptr<snzip_t::node_aligned[]> nodes;
+
+	snzip_t(unsigned depth);
+
+	void arrive(int idx) {
+		// idx >>= 1;
+		idx %= mask;
+		leafs[idx].arrive();
+	}
+
+	void depart(int idx) {
+		// idx >>= 1;
+		idx %= mask;
+		leafs[idx].depart();
+	}
+
+	bool query() const {
+		return nodes[root].query();
+	}
+
+
+private:
+	class __attribute__((aligned(32))) node {
+		friend class snzip_t;
+	private:
+
+		union val_t {
+			static constexpr char Half = -1;
+
+			uint64_t _all;
+			struct __attribute__((packed)) {
+				char cnt;
+				uint64_t ver:56;
+			};
+
+			bool cas(val_t & exp, char _cnt, uint64_t _ver) volatile {
+				val_t t;
+				t.ver = _ver;
+				t.cnt = _cnt;
+				/* paranoid */ assert(t._all == ((_ver << 8) | ((unsigned char)_cnt)));
+				return __atomic_compare_exchange_n(&this->_all, &exp._all, t._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+			}
+
+			bool cas(val_t & exp, const val_t & tar) volatile {
+				return __atomic_compare_exchange_n(&this->_all, &exp._all, tar._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+			}
+
+			val_t() : _all(0) {}
+			val_t(const volatile val_t & o) : _all(o._all) {}
+		};
+
+		//--------------------------------------------------
+		// Hierarchical node
+		void arrive_h() {
+			int undoArr = 0;
+			bool success = false;
+			while(!success) {
+				auto x{ value };
+				/* paranoid */ assert(x.cnt <= 120);
+				if( x.cnt >= 1 ) {
+					if( value.cas(x, x.cnt + 1, x.ver ) ) {
+						success = true;
+					}
+				}
+				/* paranoid */ assert(x.cnt <= 120);
+				if( x.cnt == 0 ) {
+					if( value.cas(x, val_t::Half, x.ver + 1) ) {
+						success = true;
+						x.cnt = val_t::Half;
+						x.ver = x.ver + 1;
+					}
+				}
+				/* paranoid */ assert(x.cnt <= 120);
+				if( x.cnt == val_t::Half ) {
+					/* paranoid */ assert(parent);
+					if(undoArr == 2) {
+						undoArr--;
+					} else {
+						parent->arrive();
+					}
+					if( !value.cas(x, 1, x.ver) ) {
+						undoArr = undoArr + 1;
+					}
+				}
+			}
+
+			for(int i = 0; i < undoArr; i++) {
+				/* paranoid */ assert(parent);
+				parent->depart();
+			}
+		}
+
+		void depart_h() {
+			while(true) {
+				auto x = (const val_t)value;
+				/* paranoid */ assertf(x.cnt >= 1, "%d", x.cnt);
+				if( value.cas( x, x.cnt - 1, x.ver ) ) {
+					if( x.cnt == 1 ) {
+						/* paranoid */ assert(parent);
+						parent->depart();
+					}
+					return;
+				}
+			}
+		}
+
+		//--------------------------------------------------
+		// Root node
+		void arrive_r() {
+			__atomic_fetch_add(&value._all, 1, __ATOMIC_SEQ_CST);
+		}
+
+		void depart_r() {
+			__atomic_fetch_sub(&value._all, 1, __ATOMIC_SEQ_CST);
+		}
+
+	private:
+		volatile val_t value;
+		class node * parent = nullptr;
+
+		bool is_root() {
+			return parent == nullptr;
+		}
+
+	public:
+		void arrive() {
+			if(is_root()) arrive_r();
+			else arrive_h();
+		}
+
+		void depart() {
+			if(is_root()) depart_r();
+			else depart_h();
+		}
+
+		bool query() {
+			/* paranoid */ assert(is_root());
+			return value._all > 0;
+		}
+	};
+
+	class __attribute__((aligned(128))) node_aligned : public node {};
+};
+
+snzip_t::snzip_t(unsigned depth)
+	: mask( std::pow(2, depth) )
+	, root( ((std::pow(2, depth + 1) - 1) / (2 -1)) - 1 - mask )
+	, leafs(new node[ mask ]())
+	, nodes(new node_aligned[ root + 1 ]())
+{
+	int width = std::pow(2, depth);
+	int hwidth = width / 2;
+	std::cout << "SNZI: " << depth << "x" << width << "(" << mask - 1 << ") " << (sizeof(snzip_t::node) * (root + 1)) << " bytes" << std::endl;
+	for(int i = 0; i < width; i++) {
+		int idx = i % hwidth;
+		std::cout << i << " -> " << idx + width << std::endl;
+		leafs[i].parent = &nodes[ idx ];
+	}
+
+	for(int i = 0; i < root; i++) {
+		int idx = (i / 2) + hwidth;
+		std::cout << i + width << " -> " << idx + width << std::endl;
+		nodes[i].parent = &nodes[ idx ];
+	}
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/snzi.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/snzi.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/snzi.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,164 @@
+#pragma once
+
+#include "utils.hpp"
+
+
+class snzi_t {
+	class node;
+public:
+	const unsigned mask;
+	const int root;
+	std::unique_ptr<snzi_t::node[]> nodes;
+
+	snzi_t(unsigned depth, unsigned base = 2);
+
+	void arrive(int idx) {
+		idx >>= 2;
+		idx %= mask;
+		nodes[idx].arrive();
+	}
+
+	void depart(int idx) {
+		idx >>= 2;
+		idx %= mask;
+		nodes[idx].depart();
+	}
+
+	bool query() const {
+		return nodes[root].query();
+	}
+
+
+private:
+	class __attribute__((aligned(128))) node {
+		friend class snzi_t;
+	private:
+
+		union val_t {
+			static constexpr char Half = -1;
+
+			uint64_t _all;
+			struct __attribute__((packed)) {
+				char cnt;
+				uint64_t ver:56;
+			};
+
+			bool cas(val_t & exp, char _cnt, uint64_t _ver) volatile {
+				val_t t;
+				t.ver = _ver;
+				t.cnt = _cnt;
+				/* paranoid */ assert(t._all == ((_ver << 8) | ((unsigned char)_cnt)));
+				return __atomic_compare_exchange_n(&this->_all, &exp._all, t._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+			}
+
+			bool cas(val_t & exp, const val_t & tar) volatile {
+				return __atomic_compare_exchange_n(&this->_all, &exp._all, tar._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+			}
+
+			val_t() : _all(0) {}
+			val_t(const volatile val_t & o) : _all(o._all) {}
+		};
+
+		//--------------------------------------------------
+		// Hierarchical node
+		void arrive_h() {
+			int undoArr = 0;
+			bool success = false;
+			while(!success) {
+				auto x{ value };
+				/* paranoid */ assert(x.cnt <= 120);
+				if( x.cnt >= 1 ) {
+					if( value.cas(x, x.cnt + 1, x.ver ) ) {
+						success = true;
+					}
+				}
+				/* paranoid */ assert(x.cnt <= 120);
+				if( x.cnt == 0 ) {
+					if( value.cas(x, val_t::Half, x.ver + 1) ) {
+						success = true;
+						x.cnt = val_t::Half;
+						x.ver = x.ver + 1;
+					}
+				}
+				/* paranoid */ assert(x.cnt <= 120);
+				if( x.cnt == val_t::Half ) {
+					/* paranoid */ assert(parent);
+					if(undoArr == 2) {
+						undoArr--;
+					} else {
+						parent->arrive();
+					}
+					if( !value.cas(x, 1, x.ver) ) {
+						undoArr = undoArr + 1;
+					}
+				}
+			}
+
+			for(int i = 0; i < undoArr; i++) {
+				/* paranoid */ assert(parent);
+				parent->depart();
+			}
+		}
+
+		void depart_h() {
+			while(true) {
+				auto x = (const val_t)value;
+				/* paranoid */ assertf(x.cnt >= 1, "%d", x.cnt);
+				if( value.cas( x, x.cnt - 1, x.ver ) ) {
+					if( x.cnt == 1 ) {
+						/* paranoid */ assert(parent);
+						parent->depart();
+					}
+					return;
+				}
+			}
+		}
+
+		//--------------------------------------------------
+		// Root node
+		void arrive_r() {
+			__atomic_fetch_add(&value._all, 1, __ATOMIC_SEQ_CST);
+		}
+
+		void depart_r() {
+			__atomic_fetch_sub(&value._all, 1, __ATOMIC_SEQ_CST);
+		}
+
+	private:
+		volatile val_t value;
+		class node * parent = nullptr;
+
+		bool is_root() {
+			return parent == nullptr;
+		}
+
+	public:
+		void arrive() {
+			if(is_root()) arrive_r();
+			else arrive_h();
+		}
+
+		void depart() {
+			if(is_root()) depart_r();
+			else depart_h();
+		}
+
+		bool query() {
+			/* paranoid */ assert(is_root());
+			return value._all > 0;
+		}
+	};
+};
+
+snzi_t::snzi_t(unsigned depth, unsigned base)
+	: mask( std::pow(base, depth) )
+	, root( ((std::pow(base, depth + 1) - 1) / (base -1)) - 1 )
+	, nodes(new node[ root + 1 ]())
+{
+	int width = std::pow(base, depth);
+	std::cout << "SNZI: " << depth << "x" << width << "(" << mask - 1 << ") " << (sizeof(snzi_t::node) * (root + 1)) << " bytes" << std::endl;
+	for(int i = 0; i < root; i++) {
+		std::cout << i << " -> " << (i / base) + width << std::endl;
+		nodes[i].parent = &nodes[(i / base) + width];
+	}
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/snzm.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/snzm.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/snzm.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,213 @@
+#pragma once
+
+#include <cmath>
+#include <cstdint>
+#include <iostream>
+#include <memory>
+
+#include "utils.hpp"
+
+
+// Scalable Non-Zero Indicator (SNZI, Ellen et al.) augmented with a per-leaf
+// bitmask: query() cheaply answers "is any list non-empty?" while masks()
+// exposes which of a leaf's individual lists are non-empty.
+class snzm_t {
+	class node;
+public:
+	const unsigned depth;  // number of binary levels below the leaves
+	const unsigned mask;   // low-bit mask selecting the leaf for an index
+	const int root;        // index of the root node within `nodes`
+	std::unique_ptr<snzm_t::node[]> nodes;  // flattened tree, leaves first
+
+	#if defined(__BMI2__)
+		// Byte-lane identity pattern. NOTE(review): appears unused in this
+		// header — presumably consumed by BMI2 users of masks(); confirm.
+		const uint64_t indexes = 0x0706050403020100;
+	#endif
+
+	snzm_t(unsigned numLists);
+
+	// List `idx` became non-empty: the low `depth` bits select the leaf,
+	// the remaining bits select the bit within that leaf's mask.
+	void arrive(int idx) {
+		int i = idx & mask;
+		nodes[i].arrive( idx >> depth);
+	}
+
+	// List `idx` became empty: mirror of arrive().
+	void depart(int idx) {
+		int i = idx & mask;
+		nodes[i].depart( idx >> depth );
+	}
+
+	// True if any list is non-empty; reads only the root's counter.
+	bool query() const {
+		return nodes[root].query();
+	}
+
+	// Snapshot of the non-empty bits tracked by leaf `node`.
+	uint64_t masks( unsigned node ) {
+		/* paranoid */ assert( (node & mask) == node );
+		#if defined(__BMI2__)
+			return nodes[node].mask_all;
+		#else
+			return nodes[node].mask;
+		#endif
+	}
+
+private:
+	class __attribute__((aligned(128))) node {
+		friend class snzm_t;
+	private:
+
+		// 64-bit word CASed as a unit: an 8-bit surplus counter plus a
+		// 56-bit version that makes counter transitions ABA-safe.
+		union val_t {
+			// Sentinel for the intermediate "arrival in progress" state of
+			// the SNZI protocol, stored in `cnt`.
+			static constexpr char Half = -1;
+
+			uint64_t _all;
+			struct __attribute__((packed)) {
+				char cnt;
+				uint64_t ver:56;
+			};
+
+			// CAS from `exp` to the (cnt, ver) pair, built to match the
+			// packed layout above (cnt in the low byte).
+			bool cas(val_t & exp, char _cnt, uint64_t _ver) volatile {
+				val_t t;
+				t.ver = _ver;
+				t.cnt = _cnt;
+				/* paranoid */ assert(t._all == ((_ver << 8) | ((unsigned char)_cnt)));
+				return __atomic_compare_exchange_n(&this->_all, &exp._all, t._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+			}
+
+			bool cas(val_t & exp, const val_t & tar) volatile {
+				return __atomic_compare_exchange_n(&this->_all, &exp._all, tar._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+			}
+
+			val_t() : _all(0) {}
+			val_t(const volatile val_t & o) : _all(o._all) {}
+		};
+
+		//--------------------------------------------------
+		// Hierarchical node
+		// SNZI Arrive: increment the surplus if it is already positive; if
+		// zero, pass through the Half state, propagate one arrival to the
+		// parent, then settle the counter at 1. If another thread completes
+		// the Half -> 1 transition first, our extra parent arrival is
+		// undone after the loop.
+		void arrive_h() {
+			int undoArr = 0;
+			bool success = false;
+			while(!success) {
+				auto x{ value };
+				/* paranoid */ assert(x.cnt <= 120);
+				if( x.cnt >= 1 ) {
+					if( value.cas(x, x.cnt + 1, x.ver ) ) {
+						success = true;
+					}
+				}
+				/* paranoid */ assert(x.cnt <= 120);
+				if( x.cnt == 0 ) {
+					// Bump the version so a concurrent 1 -> 0 -> Half cycle
+					// cannot be mistaken for the same state (ABA).
+					if( value.cas(x, val_t::Half, x.ver + 1) ) {
+						success = true;
+						x.cnt = val_t::Half;
+						x.ver = x.ver + 1;
+					}
+				}
+				/* paranoid */ assert(x.cnt <= 120);
+				if( x.cnt == val_t::Half ) {
+					/* paranoid */ assert(parent);
+					parent->arrive();
+					if( !value.cas(x, 1, x.ver) ) {
+						undoArr = undoArr + 1;
+					}
+				}
+			}
+
+			// Compensate for parent arrivals whose Half -> 1 CAS lost.
+			for(int i = 0; i < undoArr; i++) {
+				/* paranoid */ assert(parent);
+				parent->depart();
+			}
+		}
+
+		// SNZI Depart: decrement the surplus; the thread that takes it from
+		// 1 to 0 propagates one departure to the parent.
+		void depart_h() {
+			while(true) {
+				auto x = (const val_t)value;
+				/* paranoid */ assertf(x.cnt >= 1, "%d", x.cnt);
+				if( value.cas( x, x.cnt - 1, x.ver ) ) {
+					if( x.cnt == 1 ) {
+						/* paranoid */ assert(parent);
+						parent->depart();
+					}
+					return;
+				}
+			}
+		}
+
+		//--------------------------------------------------
+		// Root node
+		// The root needs no Half protocol: a plain fetch-add counter whose
+		// non-zero-ness query() reads is sufficient.
+		void arrive_r() {
+			__atomic_fetch_add(&value._all, 1, __ATOMIC_SEQ_CST);
+		}
+
+		void depart_r() {
+			__atomic_fetch_sub(&value._all, 1, __ATOMIC_SEQ_CST);
+		}
+
+		//--------------------------------------------------
+		// Interface node
+		// Entry points used by children: dispatch on root vs internal node.
+		void arrive() {
+			/* paranoid */ assert(!is_leaf);
+			if(is_root()) arrive_r();
+			else arrive_h();
+		}
+
+		void depart() {
+			/* paranoid */ assert(!is_leaf);
+			if(is_root()) depart_r();
+			else depart_h();
+		}
+
+	private:
+		volatile val_t value;
+		// Per-leaf bitmask of which of this leaf's lists are non-empty.
+		#if defined(__BMI2__)
+			union __attribute__((packed)) {
+				volatile uint8_t mask[8];
+				volatile uint64_t mask_all;
+			};
+		#else
+			volatile size_t mask = 0;
+		#endif
+
+		class node * parent = nullptr;
+		bool is_leaf = false;
+
+		bool is_root() {
+			return parent == nullptr;
+		}
+
+	public:
+		// Leaf entry points: maintain both the SNZI surplus and the bit for
+		// the specific list. arrive raises the counter before publishing
+		// the bit; depart clears the bit before lowering the counter.
+		void arrive( int bit ) {
+			/* paranoid */ assert( is_leaf );
+
+			arrive_h();
+			#if defined(__BMI2__)
+				/* paranoid */ assert( bit < 8 );
+				mask[bit] = 0xff;
+			#else
+				// NOTE(review): assumes each bit is owned by a single list
+				// and that bit < 8, so the int-width `1 << bit` cannot
+				// overflow — confirm against the layout in snzm_t::arrive.
+				/* paranoid */ assert( (mask & ( 1 << bit )) == 0 );
+				__atomic_fetch_add( &mask, 1 << bit, __ATOMIC_RELAXED );
+			#endif
+
+		}
+
+		void depart( int bit ) {
+			/* paranoid */ assert( is_leaf );
+
+			#if defined(__BMI2__)
+				/* paranoid */ assert( bit < 8 );
+				mask[bit] = 0x00;
+			#else
+				/* paranoid */ assert( (mask & ( 1 << bit )) != 0 );
+				__atomic_fetch_sub( &mask, 1 << bit, __ATOMIC_RELAXED );
+			#endif
+			depart_h();
+		}
+
+		// Root-only: non-zero surplus means some list is non-empty.
+		bool query() {
+			/* paranoid */ assert(is_root());
+			return value._all > 0;
+		}
+	};
+};
+
+// Sizes the tree so that each leaf covers 8 lists: `depth` binary levels,
+// `mask` selects the leaf for an index, `root` is the index of the last
+// node of a full binary tree with 2^depth leaves.
+// NOTE(review): numLists < 8 makes log2(numLists / 8) ill-defined, and the
+// layout presumes numLists is a multiple of 8 — confirm callers guarantee
+// numLists >= 8 (power of two).
+snzm_t::snzm_t(unsigned numLists)
+	: depth( std::log2( numLists / 8 ) )
+	, mask( (1 << depth) - 1 )
+	, root( (1 << (depth + 1)) - 2 )
+	, nodes(new node[ root + 1 ]())
+{
+	int width = 1 << depth;
+	std::cout << "SNZI with Mask: " << depth << "x" << width << "(" << mask << ")" << std::endl;
+	// Nodes [0, width) are leaves; `i / 2 + width` maps each level onto the
+	// next one up, ending at `root`, which keeps parent == nullptr.
+	for(int i = 0; i < root; i++) {
+		nodes[i].is_leaf = i < width;
+		nodes[i].parent = &nodes[(i / 2) + width ];
+	}
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/utils.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/utils.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/utils.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,250 @@
+#pragma once
+
+#include <cassert>
+#include <cstddef>
+#include <atomic>
+#include <chrono>
+#include <fstream>
+#include <iostream>
+
+#include <unistd.h>
+#include <sys/sysinfo.h>
+
+#include <x86intrin.h>
+
+// Barrier: reusable counting spin barrier (the original attribution comment
+// "Barrier from" was left unfinished — source unknown).
+class barrier_t {
+public:
+	// `total` participants must reach wait() before any of them is released.
+	barrier_t(size_t total)
+		: waiting(0)
+		, total(total)
+	{}
+
+	// Reusable spin barrier: `waiting` only ever increases. Each arrival
+	// takes a ticket (the pre-increment value) and spins until the count
+	// reaches the next multiple of `total` above its ticket, so the same
+	// object works for successive rounds. The unsigned parameter (a thread
+	// id in callers) is unused.
+	void wait(unsigned) {
+		size_t target = waiting++;
+		target = (target - (target % total)) + total;
+		while(waiting < target)
+			asm volatile("pause");
+
+		// `waiting` is never reset; guard against (unrealistic) wrap-around.
+		assert(waiting < (1ul << 60));
+    	}
+
+private:
+	std::atomic<size_t> waiting;  // total number of wait() calls so far
+	size_t total;                 // participants per round
+};
+
+// class Random {
+// private:
+// 	unsigned int seed;
+// public:
+// 	Random(int seed) {
+// 		this->seed = seed;
+// 	}
+
+// 	/** returns pseudorandom x satisfying 0 <= x < n. **/
+// 	unsigned int next() {
+// 		seed ^= seed << 6;
+// 		seed ^= seed >> 21;
+// 		seed ^= seed << 7;
+// 		return seed;
+//     	}
+// };
+
+// Bezout coefficients via the recursive extended Euclidean algorithm: for
+// g = gcd(a, b), extendedEuclidX yields x and extendedEuclidY yields y with
+// a*x + b*y == g (arithmetic modulo 2^64). Used below to derive the modular
+// inverse of the LCG multiplier.
+constexpr uint64_t extendedEuclidY(uint64_t a, uint64_t b);
+constexpr uint64_t extendedEuclidX(uint64_t a, uint64_t b){
+    if(b == 0) return 1;
+    return extendedEuclidY(b, a % b);
+}
+constexpr uint64_t extendedEuclidY(uint64_t a, uint64_t b){
+    if(b == 0) return 0;
+    return extendedEuclidX(b, a % b) - (a / b) * extendedEuclidY(b, a % b);
+}
+
+// 48-bit linear congruential PRNG using the drand48/java.util.Random
+// constants. Because the multiplier A is odd, it is invertible modulo
+// M = 2^48, so the sequence can also be stepped backwards via prev().
+class Random {
+private:
+	uint64_t x;  // current state; only the low 48 bits are meaningful
+
+	static constexpr const uint64_t M  = 1ul << 48ul;  // modulus
+	static constexpr const uint64_t A  = 25214903917;  // multiplier
+	static constexpr const uint64_t C  = 11;           // increment
+	static constexpr const uint64_t D  = 16;           // low bits discarded on output
+
+public:
+	static constexpr const uint64_t m  = M;
+	static constexpr const uint64_t a  = A;
+	static constexpr const uint64_t c  = C;
+	static constexpr const uint64_t d  = D;
+	// Modular inverse of A mod M, used by prev() to invert the LCG step.
+	static constexpr const uint64_t ai = extendedEuclidX(A, M);
+public:
+	Random(unsigned int seed) {
+		this->x = seed * a;
+	}
+
+	/** advances the state one LCG step and returns its top 32 bits. **/
+	unsigned int next() {
+		//nextx = (a * x + c) % m;
+		x = (A * x + C) & (M - 1);
+		return x >> D;
+	}
+	// Returns the value derived from the current state, then rewinds the
+	// state by one step (exact inverse of next()).
+	unsigned int prev() {
+		//prevx = (ainverse * (x - c)) mod m
+		unsigned int r = x >> D;
+		x = ai * (x - C) & (M - 1);
+		return r;
+	}
+
+	// Direct state access, used to checkpoint/restore a generator.
+	void set_raw_state(uint64_t _x) {
+		this->x = _x;
+	}
+
+	uint64_t get_raw_state() {
+		return this->x;
+	}
+};
+
+// Reads the x86 time-stamp counter (rdtsc is non-serializing: fine for
+// coarse timestamps and PRNG seeding, not for precise measurement).
+static inline long long rdtscl(void) {
+    unsigned int lo, hi;
+    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
+    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
+}
+
+// Pins the calling thread to one cpu, counting down from the highest cpu.
+// NOTE(review): callers appear to pass 1-based tids; tid == 0 would request
+// cpu == get_nprocs(), which is out of range — confirm.
+static inline void affinity(int tid) {
+	static int cpus = get_nprocs();
+
+	cpu_set_t  mask;
+	CPU_ZERO(&mask);
+	int cpu = cpus - tid;  // Set CPU affinity to tid, starting from the end
+	CPU_SET(cpu, &mask);
+	// Failure is reported but not fatal: the benchmark still runs unpinned.
+	auto result = sched_setaffinity(0, sizeof(mask), &mask);
+	if(result != 0) {
+		std::cerr << "Affinity set failed with " << result<< ", wanted " << cpu << std::endl;
+	}
+}
+
+static const constexpr std::size_t cache_line_size = 64;
+// Sanity check at startup: read the kernel-reported coherency line size of
+// cpu0's first cache and abort if it disagrees with the compile-time
+// cache_line_size constant (exit 2 if the sysfs file is unreadable, exit 1
+// on a mismatch).
+static inline void check_cache_line_size() {
+	std::cout << "Checking cache line size" << std::endl;
+	const std::string cache_file = "/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size";
+
+	std::ifstream in(cache_file, std::ifstream::in);
+	if(!in.good()) {
+		std::cerr << "Could not open file to check cache line size" << std::endl;
+		std::cerr << "Looking for: " << cache_file << std::endl;
+		std::exit(2);
+	}
+
+	size_t reported;
+	in >> reported;
+	in.close();
+
+	if(reported != cache_line_size) {
+		std::cerr << "Cache line has incorrect size : " << reported << std::endl;
+		std::exit(1);
+	}
+
+	std::cout << "Done" << std::endl;
+}
+
+using Clock = std::chrono::high_resolution_clock;
+using duration_t = std::chrono::duration<double>;
+using std::chrono::nanoseconds;
+
+// Converts a scalar count of seconds into `Ratio` units and back to a bare
+// scalar, e.g. duration_cast<std::nano>(1.0) == 1e9. Overloads the name
+// duration_cast for arithmetic (non-duration) inputs.
+template<typename Ratio, typename T>
+T duration_cast(T seconds) {
+	return std::chrono::duration_cast<std::chrono::duration<T, Ratio>>(std::chrono::duration<T>(seconds)).count();
+}
+
+// Picks a pseudo-randomly chosen *set* bit of `mask`, using `rnum` as the
+// entropy source, and returns its position (0 when mask is empty). The
+// fallback path is the branchless "select the bit position with the given
+// rank" routine from Sean Anderson's Bit Twiddling Hacks: it stores each
+// level of a parallel popcount (a, b, c, d) and binary-searches those
+// partial sums for the bit of rank r. The BMI2 path instead deposits a
+// single bit at the chosen rank with PDEP and reads its position back.
+static inline unsigned rand_bit(unsigned rnum, size_t mask) __attribute__((artificial));
+static inline unsigned rand_bit(unsigned rnum, size_t mask) {
+	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
+#if !defined(__BMI2__)
+	uint64_t v = mask;   // Input value to find position with rank r.
+	unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
+	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
+	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
+	unsigned int t;      // Bit count temporary.
+
+	// Do a normal parallel bit count for a 64-bit integer,
+	// but store all intermediate steps.
+	a =  v - ((v >> 1) & ~0UL/3);
+	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
+	c = (b + (b >> 4)) & ~0UL/0x11;
+	d = (c + (c >> 8)) & ~0UL/0x101;
+
+
+	t = (d >> 32) + (d >> 48);
+	// Now do branchless select!
+	s  = 64;
+	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
+	t  = (d >> (s - 16)) & 0xff;
+	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
+	t  = (c >> (s - 8)) & 0xf;
+	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
+	t  = (b >> (s - 4)) & 0x7;
+	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
+	t  = (a >> (s - 2)) & 0x3;
+	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
+	t  = (v >> (s - 1)) & 0x1;
+	s -= ((t - r) & 256) >> 8;
+	return s - 1;
+#else
+	// Deposit a 1 at rank `bit` within mask, then locate it.
+	uint64_t picked = _pdep_u64(1ul << bit, mask);
+	return picked ? __builtin_ctzl(picked) : 0;
+#endif
+}
+
+// Minimal test-and-test-and-set spinlock.
+struct spinlock_t {
+	std::atomic_bool ll = { false };  // true while held
+
+	// Spin until acquired: after a failed exchange, read the flag cheaply
+	// until it looks free before retrying, limiting cache-line traffic.
+	inline void lock() {
+		while( __builtin_expect(ll.exchange(true),false) ) {
+			while(ll.load(std::memory_order_relaxed))
+				asm volatile("pause");
+		}
+	}
+
+	// Single attempt; true if the lock was acquired.
+	inline bool try_lock() {
+		return false == ll.exchange(true);
+	}
+
+	inline void unlock() {
+		ll.store(false, std::memory_order_release);
+	}
+
+	// Racy snapshot of the held flag — debugging/statistics only.
+	inline explicit operator bool() {
+		return ll.load(std::memory_order_relaxed);
+	}
+};
+
+// Atomic bit-test-and-set: sets bit `bit` in `target` and returns the bit's
+// previous value (true if it was already set).
+static inline bool bts(std::atomic_size_t & target, size_t bit ) {
+	//*
+	int result = 0;
+	asm volatile(
+		"LOCK btsq %[bit], %[target]\n\t"
+		// BUG FIX: `target` was declared as an input-only "m" operand even
+		// though btsq writes the memory word; it must be a read-write
+		// ("+m") output or the compiler may assume the value is unchanged
+		// and reorder/cache accesses around it.
+		: "=@ccc" (result), [target] "+m" (target)
+		: [bit] "r" (bit)
+	);
+	return result != 0;
+	/*/
+	size_t mask = 1ul << bit;
+	size_t ret = target.fetch_or(mask, std::memory_order_relaxed);
+	return (ret & mask) != 0;
+	//*/
+}
+
+// Atomic bit-test-and-reset: clears bit `bit` in `target` and returns the
+// bit's previous value (true if it was set).
+static inline bool btr(std::atomic_size_t & target, size_t bit ) {
+	//*
+	int result = 0;
+	asm volatile(
+		"LOCK btrq %[bit], %[target]\n\t"
+		// BUG FIX: `target` must be a read-write ("+m") operand — btrq
+		// modifies the memory word, so declaring it input-only lets the
+		// compiler assume it is unchanged.
+		: "=@ccc" (result), [target] "+m" (target)
+		: [bit] "r" (bit)
+	);
+	return result != 0;
+	/*/
+	size_t mask = 1ul << bit;
+	size_t ret = target.fetch_and(~mask, std::memory_order_relaxed);
+	return (ret & mask) != 0;
+	//*/
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/work_stealing.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/work_stealing.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/work_stealing.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,222 @@
+#pragma once
+#define LIST_VARIANT work_stealing
+
+#include <cmath>
+#include <iomanip>
+#include <memory>
+#include <mutex>
+#include <type_traits>
+
+#include "assert.hpp"
+#include "utils.hpp"
+#include "links.hpp"
+#include "snzi.hpp"
+
+using namespace std;
+
+template<typename node_t>
+class __attribute__((aligned(128))) work_stealing {
+	static_assert(std::is_same<decltype(node_t::_links), _LinksFields_t<node_t>>::value, "Node must have a links field");
+
+public:
+	static const char * name() {
+		return "Work Stealing";
+	}
+
+	// One intrusive queue per thread plus a SNZI summarizing which queues
+	// are non-empty, so pop() can return quickly when everything is empty.
+	// NOTE(review): snzi depth is log2(numThreads / 2); numThreads < 2
+	// makes this ill-defined — confirm callers guarantee numThreads >= 2.
+	work_stealing(unsigned _numThreads, unsigned)
+		: numThreads(_numThreads)
+		, lists(new intrusive_queue_t<node_t>[numThreads])
+		, snzi( std::log2( numThreads / 2 ), 2 )
+
+	{
+		std::cout << "Constructing Work Stealer with " << numThreads << std::endl;
+	}
+
+	~work_stealing() {
+		std::cout << "Destroying Work Stealer" << std::endl;
+		lists.reset();
+	}
+
+	// Timestamps the node and pushes it onto the queue named by its hint,
+	// re-hinting to a random queue when the hint is not a valid index.
+	__attribute__((noinline, hot)) void push(node_t * node) {
+		node->_links.ts = rdtscl();
+		// BUG FIX: was `hint > numThreads`, which accepted
+		// hint == numThreads and indexed one past the end of `lists` below
+		// (lists has exactly numThreads entries).
+		if( node->_links.hint >= numThreads ) {
+			node->_links.hint = tls.rng.next() % numThreads;
+			tls.stat.push.nhint++;
+		}
+
+		unsigned i = node->_links.hint;
+		auto & list = lists[i];
+		list.lock.lock();
+
+		// push() reports whether the queue was empty: the first element
+		// must raise this queue's bit in the SNZI.
+		if(list.push( node )) {
+			snzi.arrive(i);
+		}
+
+		list.lock.unlock();
+	}
+
+	// Pops from the local queue when it looks non-empty, otherwise tries to
+	// steal from a random victim; returns nullptr only when the SNZI says
+	// the whole structure is empty.
+	__attribute__((noinline, hot)) node_t * pop() {
+		node_t * node;
+		while(true) {
+			if(!snzi.query()) {
+				return nullptr;
+			}
+
+			{
+				// Fast path: our own queue, peeked without the lock first.
+				unsigned i = tls.my_queue;
+				auto & list = lists[i];
+				if( list.ts() != 0 ) {
+					list.lock.lock();
+					if((node = try_pop(i))) {
+						tls.stat.pop.local.success++;
+						break;
+					}
+					else {
+						tls.stat.pop.local.elock++;
+					}
+				}
+				else {
+					tls.stat.pop.local.espec++;
+				}
+			}
+
+			tls.stat.pop.steal.tried++;
+
+			// Steal path: random victim, give up on any sign of contention.
+			int i = tls.rng.next() % numThreads;
+			auto & list = lists[i];
+			if( list.ts() == 0 ) {
+				tls.stat.pop.steal.empty++;
+				continue;
+			}
+
+			if( !list.lock.try_lock() ) {
+				tls.stat.pop.steal.locked++;
+				continue;
+			}
+
+			if((node = try_pop(i))) {
+				tls.stat.pop.steal.success++;
+				break;
+			}
+		}
+
+		#if defined(READ)
+			// Optional experiment: touch one queue timestamp every READ
+			// pops to measure the cost of the extra cache traffic.
+			const unsigned f = READ;
+			if(0 == (tls.it % f)) {
+				unsigned i = tls.it / f;
+				lists[i % numThreads].ts();
+			}
+			// lists[tls.it].ts();
+			tls.it++;
+		#endif
+
+
+		return node;
+	}
+
+private:
+	// Pops one node from queue `i`; the caller holds the queue's lock and
+	// this function always releases it before returning. Departs the SNZI
+	// when the pop empties the queue.
+	node_t * try_pop(unsigned i) {
+		auto & list = lists[i];
+
+		// If list is empty, unlock and retry
+		if( list.ts() == 0 ) {
+			list.lock.unlock();
+			return nullptr;
+		}
+
+		// Actually pop the list
+		// NOTE(review): std::tie needs <tuple>, which this header only gets
+		// transitively — consider including it directly.
+		node_t * node;
+		bool emptied;
+		std::tie(node, emptied) = list.pop();
+		assert(node);
+
+		if(emptied) {
+			snzi.depart(i);
+		}
+
+		// Unlock and return
+		list.lock.unlock();
+		return node;
+	}
+
+
+public:
+
+	static std::atomic_uint32_t ticket;
+	// Per-thread state: PRNG for victim selection, this thread's own queue
+	// id (handed out by `ticket`), and locally accumulated statistics that
+	// stats_tls_tally() folds into the global counters.
+	static __attribute__((aligned(128))) thread_local struct TLS {
+		Random     rng = { int(rdtscl()) };
+		unsigned   my_queue = ticket++;
+		#if defined(READ)
+			unsigned it = 0;
+		#endif
+		struct {
+			struct {
+				std::size_t nhint = { 0 };
+			} push;
+			struct {
+				struct {
+					std::size_t success = { 0 };
+					std::size_t espec = { 0 };
+					std::size_t elock = { 0 };
+				} local;
+				struct {
+					std::size_t tried   = { 0 };
+					std::size_t locked  = { 0 };
+					std::size_t empty   = { 0 };
+					std::size_t success = { 0 };
+				} steal;
+			} pop;
+		} stat;
+	} tls;
+
+private:
+	const unsigned numThreads;
+	std::unique_ptr<intrusive_queue_t<node_t> []> lists;
+	__attribute__((aligned(64))) snzi_t snzi;
+
+#ifndef NO_STATS
+private:
+	// Process-wide tallies, filled from each thread's TLS at shutdown.
+	static struct GlobalStats {
+		struct {
+			std::atomic_size_t nhint = { 0 };
+		} push;
+		struct {
+			struct {
+				std::atomic_size_t success = { 0 };
+				std::atomic_size_t espec = { 0 };
+				std::atomic_size_t elock = { 0 };
+			} local;
+			struct {
+				std::atomic_size_t tried   = { 0 };
+				std::atomic_size_t locked  = { 0 };
+				std::atomic_size_t empty   = { 0 };
+				std::atomic_size_t success = { 0 };
+			} steal;
+		} pop;
+	} global_stats;
+
+public:
+	// Folds the calling thread's local counters into the global tallies.
+	static void stats_tls_tally() {
+		global_stats.push.nhint += tls.stat.push.nhint;
+		global_stats.pop.local.success += tls.stat.pop.local.success;
+		global_stats.pop.local.espec   += tls.stat.pop.local.espec  ;
+		global_stats.pop.local.elock   += tls.stat.pop.local.elock  ;
+		global_stats.pop.steal.tried   += tls.stat.pop.steal.tried  ;
+		global_stats.pop.steal.locked  += tls.stat.pop.steal.locked ;
+		global_stats.pop.steal.empty   += tls.stat.pop.steal.empty  ;
+		global_stats.pop.steal.success += tls.stat.pop.steal.success;
+	}
+
+	static void stats_print(std::ostream & os ) {
+		std::cout << "----- Work Stealing Stats -----" << std::endl;
+
+		double stealSucc = double(global_stats.pop.steal.success) / global_stats.pop.steal.tried;
+		os << "Push to new Q : " << std::setw(15) << global_stats.push.nhint << "\n";
+		os << "Local Pop     : " << std::setw(15) << global_stats.pop.local.success << "\n";
+		os << "Steal Pop     : " << std::setw(15) << global_stats.pop.steal.success << "(" << global_stats.pop.local.espec << "s, " << global_stats.pop.local.elock << "l)\n";
+		os << "Steal Success : " << std::setw(15) << stealSucc << "(" << global_stats.pop.steal.tried << " tries)\n";
+		os << "Steal Fails   : " << std::setw(15) << global_stats.pop.steal.empty << "e, " << global_stats.pop.steal.locked << "l\n";
+	}
+private:
+#endif
+};
Index: doc/theses/thierry_delisle_PhD/code/relaxed_list.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/relaxed_list.cpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,1141 +1,0 @@
-#if !defined(LIST_VARIANT_HPP)
-#define LIST_VARIANT_HPP "relaxed_list.hpp"
-#endif
-
-#include LIST_VARIANT_HPP
-#if !defined(LIST_VARIANT)
-#error not variant selected
-#endif
-
-#include <array>
-#include <iomanip>
-#include <iostream>
-#include <locale>
-#include <string>
-#include <thread>
-#include <vector>
-
-#include <getopt.h>
-#include <unistd.h>
-#include <sys/sysinfo.h>
-
-#include "utils.hpp"
-
-struct __attribute__((aligned(64))) Node {
-	static std::atomic_size_t creates;
-	static std::atomic_size_t destroys;
-
-	_LinksFields_t<Node> _links;
-
-	int value;
-	int id;
-
-	Node() { creates++; }
-	Node(int value): value(value) { creates++; }
-	~Node() { destroys++; }
-};
-
-std::atomic_size_t Node::creates  = { 0 };
-std::atomic_size_t Node::destroys = { 0 };
-
-bool enable_stats = false;
-
-template<>
-thread_local LIST_VARIANT<Node>::TLS LIST_VARIANT<Node>::tls = {};
-
-template<>
-std::atomic_uint32_t LIST_VARIANT<Node>::ticket = { 0 };
-
-#ifndef NO_STATS
-template<>
-LIST_VARIANT<Node>::GlobalStats LIST_VARIANT<Node>::global_stats = {};
-#endif
-
-// ================================================================================================
-//                        UTILS
-// ================================================================================================
-
-struct local_stat_t {
-	size_t in  = 0;
-	size_t out = 0;
-	size_t empty = 0;
-	size_t crc_in  = 0;
-	size_t crc_out = 0;
-	size_t valmax = 0;
-	size_t valmin = 100000000ul;
-	struct {
-		size_t val = 0;
-		size_t cnt = 0;
-	} comp;
-	struct {
-		size_t val = 0;
-		size_t cnt = 0;
-	} subm;
-};
-
-struct global_stat_t {
-	std::atomic_size_t in  = { 0 };
-	std::atomic_size_t out = { 0 };
-	std::atomic_size_t empty = { 0 };
-	std::atomic_size_t crc_in  = { 0 };
-	std::atomic_size_t crc_out = { 0 };
-	std::atomic_size_t valmax = { 0 };
-	std::atomic_size_t valmin = { 100000000ul };
-	struct {
-		std::atomic_size_t val = { 0 };
-		std::atomic_size_t cnt = { 0 };
-	} comp;
-	struct {
-		std::atomic_size_t val = { 0 };
-		std::atomic_size_t cnt = { 0 };
-	} subm;
-};
-
-void atomic_max(std::atomic_size_t & target, size_t value) {
-	for(;;) {
-		size_t expect = target.load(std::memory_order_relaxed);
-		if(value <= expect) return;
-		bool success = target.compare_exchange_strong(expect, value);
-		if(success) return;
-	}
-}
-
-void atomic_min(std::atomic_size_t & target, size_t value) {
-	for(;;) {
-		size_t expect = target.load(std::memory_order_relaxed);
-		if(value >= expect) return;
-		bool success = target.compare_exchange_strong(expect, value);
-		if(success) return;
-	}
-}
-
-void tally_stats(global_stat_t & global, local_stat_t & local) {
-
-	global.in    += local.in;
-	global.out   += local.out;
-	global.empty += local.empty;
-
-	global.crc_in  += local.crc_in;
-	global.crc_out += local.crc_out;
-
-	global.comp.val += local.comp.val;
-	global.comp.cnt += local.comp.cnt;
-	global.subm.val += local.subm.val;
-	global.subm.cnt += local.subm.cnt;
-
-	atomic_max(global.valmax, local.valmax);
-	atomic_min(global.valmin, local.valmin);
-
-	LIST_VARIANT<Node>::stats_tls_tally();
-}
-
-void waitfor(double & duration, barrier_t & barrier, std::atomic_bool & done) {
-	std::cout << "Starting" << std::endl;
-	auto before = Clock::now();
-	barrier.wait(0);
-	bool is_tty = isatty(STDOUT_FILENO);
-
-	while(true) {
-		usleep(100000);
-		auto now = Clock::now();
-		duration_t durr = now - before;
-		if( durr.count() > duration ) {
-			done = true;
-			break;
-		}
-		if(is_tty) {
-			std::cout << "\r" << std::setprecision(4) << durr.count();
-			std::cout.flush();
-		}
-	}
-
-	barrier.wait(0);
-	auto after = Clock::now();
-	duration_t durr = after - before;
-	duration = durr.count();
-	std::cout << "\rClosing down" << std::endl;
-}
-
-void waitfor(double & duration, barrier_t & barrier, const std::atomic_size_t & count) {
-	std::cout << "Starting" << std::endl;
-	auto before = Clock::now();
-	barrier.wait(0);
-
-	while(true) {
-		usleep(100000);
-		size_t c = count.load();
-		if( c == 0 ) {
-			break;
-		}
-		std::cout << "\r" << c;
-		std::cout.flush();
-	}
-
-	barrier.wait(0);
-	auto after = Clock::now();
-	duration_t durr = after - before;
-	duration = durr.count();
-	std::cout << "\rClosing down" << std::endl;
-}
-
-void print_stats(double duration, unsigned nthread, global_stat_t & global) {
-	assert(Node::creates == Node::destroys);
-	assert(global.crc_in == global.crc_out);
-
-	std::cout << "Done" << std::endl;
-
-	size_t ops = global.in + global.out;
-	size_t ops_sec = size_t(double(ops) / duration);
-	size_t ops_thread = ops_sec / nthread;
-	auto dur_nano = duration_cast<std::nano>(1.0);
-
-	if(global.valmax != 0) {
-		std::cout << "Max runs      : " << global.valmax << "\n";
-		std::cout << "Min runs      : " << global.valmin << "\n";
-	}
-	if(global.comp.cnt != 0) {
-		std::cout << "Submit count  : " << global.subm.cnt << "\n";
-		std::cout << "Submit average: " << ((double(global.subm.val)) / global.subm.cnt) << "\n";
-		std::cout << "Complete count: " << global.comp.cnt << "\n";
-		std::cout << "Complete avg  : " << ((double(global.comp.val)) / global.comp.cnt) << "\n";
-	}
-	std::cout << "Duration      : " << duration << "s\n";
-	std::cout << "ns/Op         : " << ( dur_nano / ops_thread )<< "\n";
-	std::cout << "Ops/sec/thread: " << ops_thread << "\n";
-	std::cout << "Ops/sec       : " << ops_sec << "\n";
-	std::cout << "Total ops     : " << ops << "(" << global.in << "i, " << global.out << "o, " << global.empty << "e)\n";
-	#ifndef NO_STATS
-		LIST_VARIANT<Node>::stats_print(std::cout);
-	#endif
-}
-
-void save_fairness(const int data[], int factor, unsigned nthreads, size_t columns, size_t rows, const std::string & output);
-
-// ================================================================================================
-//                        EXPERIMENTS
-// ================================================================================================
-
-// ================================================================================================
-__attribute__((noinline)) void runChurn_body(
-	std::atomic<bool>& done,
-	Random & rand,
-	Node * my_nodes[],
-	unsigned nslots,
-	local_stat_t & local,
-	LIST_VARIANT<Node> & list
-) {
-	while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) {
-		int idx = rand.next() % nslots;
-		if (auto node = my_nodes[idx]) {
-			local.crc_in += node->value;
-			list.push(node);
-			my_nodes[idx] = nullptr;
-			local.in++;
-		}
-		else if(auto node = list.pop()) {
-			local.crc_out += node->value;
-			my_nodes[idx] = node;
-			local.out++;
-		}
-		else {
-			local.empty++;
-		}
-	}
-}
-
-void runChurn(unsigned nthread, unsigned nqueues, double duration, unsigned nnodes, const unsigned nslots) {
-	std::cout << "Churn Benchmark" << std::endl;
-	assert(nnodes <= nslots);
-	// List being tested
-
-	// Barrier for synchronization
-	barrier_t barrier(nthread + 1);
-
-	// Data to check everything is OK
-	global_stat_t global;
-
-	// Flag to signal termination
-	std::atomic_bool done  = { false };
-
-	// Prep nodes
-	std::cout << "Initializing ";
-	size_t npushed = 0;
-	LIST_VARIANT<Node> list = { nthread, nqueues };
-	{
-		Node** all_nodes[nthread];
-		for(auto & nodes : all_nodes) {
-			nodes = new __attribute__((aligned(64))) Node*[nslots + 8];
-			Random rand(rdtscl());
-			for(unsigned i = 0; i < nnodes; i++) {
-				nodes[i] = new Node(rand.next() % 100);
-			}
-
-			for(unsigned i = nnodes; i < nslots; i++) {
-				nodes[i] = nullptr;
-			}
-
-			for(int i = 0; i < 10 && i < (int)nslots; i++) {
-				int idx = rand.next() % nslots;
-				if (auto node = nodes[idx]) {
-					global.crc_in += node->value;
-					list.push(node);
-					npushed++;
-					nodes[idx] = nullptr;
-				}
-			}
-		}
-
-		std::cout << nnodes << " nodes (" << nslots << " slots)" << std::endl;
-
-		enable_stats = true;
-
-		std::thread * threads[nthread];
-		unsigned i = 1;
-		for(auto & t : threads) {
-			auto & my_nodes = all_nodes[i - 1];
-			t = new std::thread([&done, &list, &barrier, &global, &my_nodes, nslots](unsigned tid) {
-				Random rand(tid + rdtscl());
-
-				local_stat_t local;
-
-				// affinity(tid);
-
-				barrier.wait(tid);
-
-				// EXPERIMENT START
-
-				runChurn_body(done, rand, my_nodes, nslots, local, list);
-
-				// EXPERIMENT END
-
-				barrier.wait(tid);
-
-				tally_stats(global, local);
-
-				for(unsigned i = 0; i < nslots; i++) {
-					delete my_nodes[i];
-				}
-			}, i++);
-		}
-
-		waitfor(duration, barrier, done);
-
-		for(auto t : threads) {
-			t->join();
-			delete t;
-		}
-
-		enable_stats = false;
-
-		while(auto node = list.pop()) {
-			global.crc_out += node->value;
-			delete node;
-		}
-
-		for(auto nodes : all_nodes) {
-			delete[] nodes;
-		}
-	}
-
-	print_stats(duration, nthread, global);
-}
-
-// ================================================================================================
-__attribute__((noinline)) void runPingPong_body(
-	std::atomic<bool>& done,
-	Node initial_nodes[],
-	unsigned nnodes,
-	local_stat_t & local,
-	LIST_VARIANT<Node> & list
-) {
-	Node * nodes[nnodes];
-	{
-		unsigned i = 0;
-		for(auto & n : nodes) {
-			n = &initial_nodes[i++];
-		}
-	}
-
-	while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) {
-
-		for(Node * & node : nodes) {
-			local.crc_in += node->value;
-			list.push(node);
-			local.in++;
-		}
-
-		// -----
-
-		for(Node * & node : nodes) {
-			node = list.pop();
-			assert(node);
-			local.crc_out += node->value;
-			local.out++;
-		}
-	}
-}
-
-void runPingPong(unsigned nthread, unsigned nqueues, double duration, unsigned nnodes) {
-	std::cout << "PingPong Benchmark" << std::endl;
-
-
-	// Barrier for synchronization
-	barrier_t barrier(nthread + 1);
-
-	// Data to check everything is OK
-	global_stat_t global;
-
-	// Flag to signal termination
-	std::atomic_bool done  = { false };
-
-	std::cout << "Initializing ";
-	// List being tested
-	LIST_VARIANT<Node> list = { nthread, nqueues };
-	{
-		enable_stats = true;
-
-		std::thread * threads[nthread];
-		unsigned i = 1;
-		for(auto & t : threads) {
-			t = new std::thread([&done, &list, &barrier, &global, nnodes](unsigned tid) {
-				Random rand(tid + rdtscl());
-
-				Node nodes[nnodes];
-				for(auto & n : nodes) {
-					n.value = (int)rand.next() % 100;
-				}
-
-				local_stat_t local;
-
-				// affinity(tid);
-
-				barrier.wait(tid);
-
-				// EXPERIMENT START
-
-				runPingPong_body(done, nodes, nnodes, local, list);
-
-				// EXPERIMENT END
-
-				barrier.wait(tid);
-
-				tally_stats(global, local);
-			}, i++);
-		}
-
-		waitfor(duration, barrier, done);
-
-		for(auto t : threads) {
-			t->join();
-			delete t;
-		}
-
-		enable_stats = false;
-	}
-
-	print_stats(duration, nthread, global);
-}
-
-// ================================================================================================
-struct __attribute__((aligned(64))) Slot {
-	Node * volatile node;
-};
-
-__attribute__((noinline)) void runProducer_body(
-	std::atomic<bool>& done,
-	Random & rand,
-	Slot * slots,
-	int nslots,
-	local_stat_t & local,
-	LIST_VARIANT<Node> & list
-) {
-	while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) {
-
-		Node * node = list.pop();
-		if(!node) {
-			local.empty ++;
-			continue;
-		}
-
-		local.crc_out += node->value;
-		local.out++;
-
-		if(node->id == 0) {
-			unsigned cnt = 0;
-			for(int i = 0; i < nslots; i++) {
-				Node * found = __atomic_exchange_n( &slots[i].node, nullptr, __ATOMIC_SEQ_CST );
-				if( found ) {
-					local.crc_in += found->value;
-					local.in++;
-					cnt++;
-					list.push( found );
-				}
-			}
-
-			local.crc_in += node->value;
-			local.in++;
-			list.push( node );
-
-			local.comp.cnt++;
-			local.comp.val += cnt;
-		}
-		else {
-			unsigned len = 0;
-			while(true) {
-				auto off = rand.next();
-				for(int i = 0; i < nslots; i++) {
-					Node * expected = nullptr;
-					int idx = (i + off) % nslots;
-					Slot & slot = slots[ idx ];
-					if(
-						slot.node == nullptr &&
-						__atomic_compare_exchange_n( &slot.node, &expected, node, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST )
-					) {
-						local.subm.cnt++;
-						local.subm.val += len;
-						goto LOOP;
-					}
-					assert( expected != node );
-					len++;
-				}
-			}
-		}
-
-		LOOP:;
-	}
-}
-
-void runProducer(unsigned nthread, unsigned nqueues, double duration, unsigned nnodes) {
-	std::cout << "Producer Benchmark" << std::endl;
-
-	// Barrier for synchronization
-	barrier_t barrier(nthread + 1);
-
-	// Data to check everything is OK
-	global_stat_t global;
-
-	// Flag to signal termination
-	std::atomic_bool done  = { false };
-
-	std::cout << "Initializing ";
-
-	int nslots = nnodes * 4;
-	Slot * slots = new Slot[nslots];
-	std::cout << nnodes << " nodes (" << nslots << " slots)" << std::endl;
-
-	// List being tested
-	LIST_VARIANT<Node> list = { nthread, nqueues };
-	{
-		Random rand(rdtscl());
-		for(unsigned i = 0; i < nnodes; i++) {
-			Node * node = new Node(rand.next() % 100);
-			node->id = i;
-			global.crc_in += node->value;
-			list.push(node);
-		}
-
-		for(int i = 0; i < nslots; i++) {
-			slots[i].node = nullptr;
-		}
-	}
-
-	{
-		enable_stats = true;
-
-		std::thread * threads[nthread];
-		unsigned i = 1;
-		for(auto & t : threads) {
-			t = new std::thread([&done, &list, &barrier, &global, slots, nslots](unsigned tid) {
-				Random rand(tid + rdtscl());
-
-				local_stat_t local;
-				barrier.wait(tid);
-
-				// EXPERIMENT START
-
-				runProducer_body(done, rand, slots, nslots, local, list);
-
-				// EXPERIMENT END
-
-				barrier.wait(tid);
-
-				tally_stats(global, local);
-			}, i++);
-		}
-
-		waitfor(duration, barrier, done);
-
-		for(auto t : threads) {
-			t->join();
-			delete t;
-		}
-
-		enable_stats = false;
-	}
-
-	{
-		while(Node * node = list.pop()) {
-			global.crc_out += node->value;
-			delete node;
-		}
-
-		for(int i = 0; i < nslots; i++) {
-			delete slots[i].node;
-		}
-
-		delete [] slots;
-	}
-
-	print_stats(duration, nthread, global);
-}
-
-// ================================================================================================
-__attribute__((noinline)) void runFairness_body(
-	unsigned tid,
-	size_t width,
-	size_t length,
-	int output[],
-	std::atomic_size_t & count,
-	Node initial_nodes[],
-	unsigned nnodes,
-	local_stat_t & local,
-	LIST_VARIANT<Node> & list
-) {
-	Node * nodes[nnodes];
-	{
-		unsigned i = 0;
-		for(auto & n : nodes) {
-			n = &initial_nodes[i++];
-		}
-	}
-
-	while(__builtin_expect(0 != count.load(std::memory_order_relaxed), true)) {
-
-		for(Node * & node : nodes) {
-			local.crc_in += node->id;
-			list.push(node);
-			local.in++;
-		}
-
-		// -----
-
-		for(Node * & node : nodes) {
-			node = list.pop();
-			assert(node);
-
-			if (unsigned(node->value) < length) {
-				size_t idx = (node->value * width) + node->id;
-				assert(idx < (width * length));
-				output[idx] = tid;
-			}
-
-			node->value++;
-			if(unsigned(node->value) == length) count--;
-
-			local.crc_out += node->id;
-			local.out++;
-		}
-	}
-}
-
-void runFairness(unsigned nthread, unsigned nqueues, double duration, unsigned nnodes, const std::string & output) {
-	std::cout << "Fairness Benchmark, outputing to : " << output << std::endl;
-
-	// Barrier for synchronization
-	barrier_t barrier(nthread + 1);
-
-	// Data to check everything is OK
-	global_stat_t global;
-
-	std::cout << "Initializing ";
-
-	// Check fairness by creating a png of where the threads ran
-	size_t width = nthread * nnodes;
-	size_t length = 100000;
-
-	std::unique_ptr<int[]> data_out { new int[width * length] };
-
-	// Flag to signal termination
-	std::atomic_size_t count = width;
-
-	// List being tested
-	LIST_VARIANT<Node> list = { nthread, nqueues };
-	{
-		enable_stats = true;
-
-		std::thread * threads[nthread];
-		unsigned i = 1;
-		for(auto & t : threads) {
-			t = new std::thread([&count, &list, &barrier, &global, nnodes, width, length, data_out = data_out.get()](unsigned tid) {
-				unsigned int start = (tid - 1) * nnodes;
-				Node nodes[nnodes];
-				for(auto & n : nodes) {
-					n.id = start;
-					n.value = 0;
-					start++;
-				}
-
-				local_stat_t local;
-
-				// affinity(tid);
-
-				barrier.wait(tid);
-
-				// EXPERIMENT START
-
-				runFairness_body(tid, width, length, data_out, count, nodes, nnodes, local, list);
-
-				// EXPERIMENT END
-
-				barrier.wait(tid);
-
-				for(const auto & n : nodes) {
-					local.valmax = max(local.valmax, size_t(n.value));
-					local.valmin = min(local.valmin, size_t(n.value));
-				}
-
-				tally_stats(global, local);
-			}, i++);
-		}
-
-		waitfor(duration, barrier, count);
-
-		for(auto t : threads) {
-			t->join();
-			delete t;
-		}
-
-		enable_stats = false;
-	}
-
-	print_stats(duration, nthread, global);
-
-	// save_fairness(data_out.get(), 100, nthread, width, length, output);
-}
-
-// ================================================================================================
-
-bool iequals(const std::string& a, const std::string& b)
-{
-    return std::equal(a.begin(), a.end(),
-                      b.begin(), b.end(),
-                      [](char a, char b) {
-                          return std::tolower(a) == std::tolower(b);
-                      });
-}
-
-int main(int argc, char * argv[]) {
-
-	double duration   = 5.0;
-	unsigned nthreads = 2;
-	unsigned nqueues  = 4;
-	unsigned nnodes   = 100;
-	unsigned nslots   = 100;
-	std::string out   = "fairness.png";
-
-	enum {
-		Churn,
-		PingPong,
-		Producer,
-		Fairness,
-		NONE
-	} benchmark = NONE;
-
-	std::cout.imbue(std::locale(""));
-
-	for(;;) {
-		static struct option options[] = {
-			{"duration",  required_argument, 0, 'd'},
-			{"nthreads",  required_argument, 0, 't'},
-			{"nqueues",   required_argument, 0, 'q'},
-			{"benchmark", required_argument, 0, 'b'},
-			{0, 0, 0, 0}
-		};
-
-		int idx = 0;
-		int opt = getopt_long(argc, argv, "d:t:q:b:", options, &idx);
-
-		std::string arg = optarg ? optarg : "";
-		size_t len = 0;
-		switch(opt) {
-			// Exit Case
-			case -1:
-				/* paranoid */ assert(optind <= argc);
-				switch(benchmark) {
-				case NONE:
-					std::cerr << "Must specify a benchmark" << std::endl;
-					goto usage;
-				case PingPong:
-					nnodes = 1;
-					switch(argc - optind) {
-					case 0: break;
-					case 1:
-						try {
-							arg = optarg = argv[optind];
-							nnodes = stoul(optarg, &len);
-							if(len != arg.size()) { throw std::invalid_argument(""); }
-						} catch(std::invalid_argument &) {
-							std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl;
-							goto usage;
-						}
-						break;
-					default:
-						std::cerr << "'PingPong' benchmark doesn't accept more than 1 extra arguments" << std::endl;
-						goto usage;
-					}
-					break;
-				case Producer:
-					nnodes = 32;
-					switch(argc - optind) {
-					case 0: break;
-					case 1:
-						try {
-							arg = optarg = argv[optind];
-							nnodes = stoul(optarg, &len);
-							if(len != arg.size()) { throw std::invalid_argument(""); }
-						} catch(std::invalid_argument &) {
-							std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl;
-							goto usage;
-						}
-						break;
-					default:
-						std::cerr << "'Producer' benchmark doesn't accept more than 1 extra arguments" << std::endl;
-						goto usage;
-					}
-					break;
-				case Churn:
-					nnodes = 100;
-					nslots = 100;
-					switch(argc - optind) {
-					case 0: break;
-					case 1:
-						try {
-							arg = optarg = argv[optind];
-							nnodes = stoul(optarg, &len);
-							if(len != arg.size()) { throw std::invalid_argument(""); }
-							nslots = nnodes;
-						} catch(std::invalid_argument &) {
-							std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl;
-							goto usage;
-						}
-						break;
-					case 2:
-						try {
-							arg = optarg = argv[optind];
-							nnodes = stoul(optarg, &len);
-							if(len != arg.size()) { throw std::invalid_argument(""); }
-						} catch(std::invalid_argument &) {
-							std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl;
-							goto usage;
-						}
-						try {
-							arg = optarg = argv[optind + 1];
-							nslots = stoul(optarg, &len);
-							if(len != arg.size()) { throw std::invalid_argument(""); }
-						} catch(std::invalid_argument &) {
-							std::cerr << "Number of slots must be a positive integer, was " << arg << std::endl;
-							goto usage;
-						}
-						break;
-					default:
-						std::cerr << "'Churn' benchmark doesn't accept more than 2 extra arguments" << std::endl;
-						goto usage;
-					}
-					break;
-				case Fairness:
-					nnodes = 1;
-					switch(argc - optind) {
-					case 0: break;
-					case 1:
-						arg = optarg = argv[optind];
-						out = arg;
-						break;
-					default:
-						std::cerr << "'Churn' benchmark doesn't accept more than 2 extra arguments" << std::endl;
-						goto usage;
-					}
-				}
-				goto run;
-			// Benchmarks
-			case 'b':
-				if(benchmark != NONE) {
-					std::cerr << "Only when benchmark can be run" << std::endl;
-					goto usage;
-				}
-				if(iequals(arg, "churn")) {
-					benchmark = Churn;
-					break;
-				}
-				if(iequals(arg, "pingpong")) {
-					benchmark = PingPong;
-					break;
-				}
-				if(iequals(arg, "producer")) {
-					benchmark = Producer;
-					break;
-				}
-				if(iequals(arg, "fairness")) {
-					benchmark = Fairness;
-					break;
-				}
-				std::cerr << "Unkown benchmark " << arg << std::endl;
-				goto usage;
-			// Numeric Arguments
-			case 'd':
-				try {
-					duration = stod(optarg, &len);
-					if(len != arg.size()) { throw std::invalid_argument(""); }
-				} catch(std::invalid_argument &) {
-					std::cerr << "Duration must be a valid double, was " << arg << std::endl;
-					goto usage;
-				}
-				break;
-			case 't':
-				try {
-					nthreads = stoul(optarg, &len);
-					if(len != arg.size()) { throw std::invalid_argument(""); }
-				} catch(std::invalid_argument &) {
-					std::cerr << "Number of threads must be a positive integer, was " << arg << std::endl;
-					goto usage;
-				}
-				break;
-			case 'q':
-				try {
-					nqueues = stoul(optarg, &len);
-					if(len != arg.size()) { throw std::invalid_argument(""); }
-				} catch(std::invalid_argument &) {
-					std::cerr << "Number of queues must be a positive integer, was " << arg << std::endl;
-					goto usage;
-				}
-				break;
-			// Other cases
-			default: /* ? */
-				std::cerr << opt << std::endl;
-			usage:
-				std::cerr << "Usage: " << argv[0] << ": [options] -b churn [NNODES] [NSLOTS = NNODES]" << std::endl;
-				std::cerr << "  or:  " << argv[0] << ": [options] -b pingpong [NNODES]" << std::endl;
-				std::cerr << "  or:  " << argv[0] << ": [options] -b producer [NNODES]" << std::endl;
-				std::cerr << std::endl;
-				std::cerr << "  -d, --duration=DURATION  Duration of the experiment, in seconds" << std::endl;
-				std::cerr << "  -t, --nthreads=NTHREADS  Number of kernel threads" << std::endl;
-				std::cerr << "  -q, --nqueues=NQUEUES    Number of queues per threads" << std::endl;
-				std::exit(1);
-		}
-	}
-	run:
-
-	check_cache_line_size();
-
-	std::cout << "Running " << nthreads << " threads (" << (nthreads * nqueues) << " queues) for " << duration << " seconds" << std::endl;
-	std::cout << "Relaxed list variant: " << LIST_VARIANT<Node>::name() << std::endl;
-	switch(benchmark) {
-		case Churn:
-			runChurn(nthreads, nqueues, duration, nnodes, nslots);
-			break;
-		case PingPong:
-			runPingPong(nthreads, nqueues, duration, nnodes);
-			break;
-		case Producer:
-			runProducer(nthreads, nqueues, duration, nnodes);
-			break;
-		case Fairness:
-			runFairness(nthreads, nqueues, duration, nnodes, out);
-			break;
-		default:
-			abort();
-	}
-	return 0;
-}
-
-const char * __my_progname = "Relaxed List";
-
-struct rgb_t {
-    double r;       // a fraction between 0 and 1
-    double g;       // a fraction between 0 and 1
-    double b;       // a fraction between 0 and 1
-};
-
-struct hsv_t {
-    double h;       // angle in degrees
-    double s;       // a fraction between 0 and 1
-    double v;       // a fraction between 0 and 1
-};
-
-rgb_t hsv2rgb(hsv_t in) {
-	double hh, p, q, t, ff;
-	long   i;
-	rgb_t  out;
-
-	if(in.s <= 0.0) {       // < is bogus, just shuts up warnings
-		out.r = in.v;
-		out.g = in.v;
-		out.b = in.v;
-		return out;
-	}
-	hh = in.h;
-	if(hh >= 360.0) hh = 0.0;
-	hh /= 60.0;
-	i = (long)hh;
-	ff = hh - i;
-	p = in.v * (1.0 - in.s);
-	q = in.v * (1.0 - (in.s * ff));
-	t = in.v * (1.0 - (in.s * (1.0 - ff)));
-
-	switch(i) {
-	case 0:
-		out.r = in.v;
-		out.g = t;
-		out.b = p;
-		break;
-	case 1:
-		out.r = q;
-		out.g = in.v;
-		out.b = p;
-		break;
-	case 2:
-		out.r = p;
-		out.g = in.v;
-		out.b = t;
-		break;
-
-	case 3:
-		out.r = p;
-		out.g = q;
-		out.b = in.v;
-		break;
-	case 4:
-		out.r = t;
-		out.g = p;
-		out.b = in.v;
-		break;
-	case 5:
-	default:
-		out.r = in.v;
-		out.g = p;
-		out.b = q;
-		break;
-	}
-	return out;
-}
-
-// void save_fairness(const int data[], int factor, unsigned nthreads, size_t columns, size_t rows, const std::string & output) {
-// 	std::ofstream os(output);
-// 	os << "<html>\n";
-// 	os << "<head>\n";
-// 	os << "<style>\n";
-// 	os << "</style>\n";
-// 	os << "</head>\n";
-// 	os << "<body>\n";
-// 	os << "<table style=\"width=100%\">\n";
-
-// 	size_t idx = 0;
-// 	for(size_t r = 0ul; r < rows; r++) {
-// 		os << "<tr>\n";
-// 		for(size_t c = 0ul; c < columns; c++) {
-// 			os << "<td class=\"custom custom" << data[idx] << "\"></td>\n";
-// 			idx++;
-// 		}
-// 		os << "</tr>\n";
-// 	}
-
-// 	os << "</table>\n";
-// 	os << "</body>\n";
-// 	os << "</html>\n";
-// 	os << std::endl;
-// }
-
-// #include <png.h>
-// #include <setjmp.h>
-
-/*
-void save_fairness(const int data[], int factor, unsigned nthreads, size_t columns, size_t rows, const std::string & output) {
-	int width  = columns * factor;
-	int height = rows / factor;
-
-	int code = 0;
-	int idx = 0;
-	FILE *fp = NULL;
-	png_structp png_ptr = NULL;
-	png_infop info_ptr = NULL;
-	png_bytep row = NULL;
-
-	// Open file for writing (binary mode)
-	fp = fopen(output.c_str(), "wb");
-	if (fp == NULL) {
-		fprintf(stderr, "Could not open file %s for writing\n", output.c_str());
-		code = 1;
-		goto finalise;
-	}
-
-	   // Initialize write structure
-	png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
-	if (png_ptr == NULL) {
-		fprintf(stderr, "Could not allocate write struct\n");
-		code = 1;
-		goto finalise;
-	}
-
-	// Initialize info structure
-	info_ptr = png_create_info_struct(png_ptr);
-	if (info_ptr == NULL) {
-		fprintf(stderr, "Could not allocate info struct\n");
-		code = 1;
-		goto finalise;
-	}
-
-	// Setup Exception handling
-	if (setjmp(png_jmpbuf(png_ptr))) {
-		fprintf(stderr, "Error during png creation\n");
-		code = 1;
-		goto finalise;
-	}
-
-	png_init_io(png_ptr, fp);
-
-	// Write header (8 bit colour depth)
-	png_set_IHDR(png_ptr, info_ptr, width, height,
-		8, PNG_COLOR_TYPE_RGB, PNG_INTERLACE_NONE,
-		PNG_COMPRESSION_TYPE_BASE, PNG_FILTER_TYPE_BASE);
-
-	png_write_info(png_ptr, info_ptr);
-
-	// Allocate memory for one row (3 bytes per pixel - RGB)
-	row = (png_bytep) malloc(3 * width * sizeof(png_byte));
-
-	// Write image data
-	int x, y;
-	for (y=0 ; y<height ; y++) {
-		for (x=0 ; x<width ; x++) {
-			auto & r = row[(x * 3) + 0];
-			auto & g = row[(x * 3) + 1];
-			auto & b = row[(x * 3) + 2];
-			assert(idx < (rows * columns));
-			int color = data[idx] - 1;
-			assert(color < nthreads);
-			assert(color >= 0);
-			idx++;
-
-			double angle = double(color) / double(nthreads);
-
-			auto c = hsv2rgb({ 360.0 * angle, 0.8, 0.8 });
-
-			r = char(c.r * 255.0);
-			g = char(c.g * 255.0);
-			b = char(c.b * 255.0);
-
-		}
-		png_write_row(png_ptr, row);
-	}
-
-	assert(idx == (rows * columns));
-
-	// End write
-	png_write_end(png_ptr, NULL);
-
-	finalise:
-	if (fp != NULL) fclose(fp);
-	if (info_ptr != NULL) png_free_data(png_ptr, info_ptr, PNG_FREE_ALL, -1);
-	if (png_ptr != NULL) png_destroy_write_struct(&png_ptr, (png_infopp)NULL);
-	if (row != NULL) free(row);
-}
-*/
Index: doc/theses/thierry_delisle_PhD/code/relaxed_list.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/relaxed_list.hpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,555 +1,0 @@
-#pragma once
-#define LIST_VARIANT relaxed_list
-
-#define VANILLA 0
-#define SNZI 1
-#define BITMASK 2
-#define DISCOVER 3
-#define SNZM 4
-#define BIAS 5
-#define BACK 6
-#define BACKBIAS 7
-
-#ifndef VARIANT
-#define VARIANT VANILLA
-#endif
-
-#ifndef NO_STATS
-#include <iostream>
-#endif
-
-#include <cmath>
-#include <functional>
-#include <memory>
-#include <mutex>
-#include <thread>
-#include <type_traits>
-
-#include "assert.hpp"
-#include "utils.hpp"
-#include "links.hpp"
-#include "snzi.hpp"
-#include "snzi-packed.hpp"
-#include "snzm.hpp"
-
-using namespace std;
-
-struct pick_stat {
-	struct {
-		size_t attempt = 0;
-		size_t success = 0;
-		size_t local = 0;
-	} push;
-	struct {
-		size_t attempt = 0;
-		size_t success = 0;
-		size_t mask_attempt = 0;
-		size_t mask_reset = 0;
-		size_t local = 0;
-	} pop;
-};
-
-struct empty_stat {
-	struct {
-		size_t value = 0;
-		size_t count = 0;
-	} push;
-	struct {
-		size_t value = 0;
-		size_t count = 0;
-	} pop;
-};
-
-template<typename node_t>
-class __attribute__((aligned(128))) relaxed_list {
-	static_assert(std::is_same<decltype(node_t::_links), _LinksFields_t<node_t>>::value, "Node must have a links field");
-
-public:
-	static const char * name() {
-		const char * names[] = {
-			"RELAXED: VANILLA",
-			"RELAXED: SNZI",
-			"RELAXED: BITMASK",
-			"RELAXED: SNZI + DISCOVERED MASK",
-			"RELAXED: SNZI + MASK",
-			"RELAXED: SNZI + LOCAL BIAS",
-			"RELAXED: SNZI + REVERSE RNG",
-			"RELAXED: SNZI + LOCAL BIAS + REVERSE RNG"
-		};
-		return names[VARIANT];
-	}
-
-	relaxed_list(unsigned numThreads, unsigned numQueues)
-		: numLists(numThreads * numQueues)
-	  	, lists(new intrusive_queue_t<node_t>[numLists])
-		#if VARIANT == SNZI || VARIANT == BACK
-			, snzi( std::log2( numLists / (2 * numQueues) ), 2 )
-		#elif VARIANT == BIAS || VARIANT == BACKBIAS
-			#ifdef SNZI_PACKED
-				, snzi( std::ceil( std::log2(numLists) ) )
-			#else
-				, snzi( std::log2( numLists / (2 * numQueues) ), 2 )
-			#endif
-		#elif VARIANT == SNZM || VARIANT == DISCOVER
-			, snzm( numLists )
-		#endif
-	{
-		assertf(7 * 8 * 8 >= numLists, "List currently only supports 448 sublists");
-		std::cout << "Constructing Relaxed List with " << numLists << std::endl;
-	}
-
-	~relaxed_list() {
-		std::cout << "Destroying Relaxed List" << std::endl;
-		lists.reset();
-	}
-
-    	__attribute__((noinline, hot)) void push(node_t * node) {
-		node->_links.ts = rdtscl();
-
-		while(true) {
-			// Pick a random list
-			unsigned i = idx_from_r(tls.rng1.next(), VARIANT == BIAS || VARIANT == BACKBIAS);
-
-			#ifndef NO_STATS
-				tls.pick.push.attempt++;
-			#endif
-
-			// If we can't lock it retry
-			if( !lists[i].lock.try_lock() ) continue;
-
-			#if VARIANT == VANILLA || VARIANT == BITMASK
-				__attribute__((unused)) int num = numNonEmpty;
-			#endif
-
-			// Actually push it
-			if(lists[i].push(node)) {
-				#if VARIANT == DISCOVER
-					size_t qword = i >> 6ull;
-					size_t bit   = i & 63ull;
-					assert(qword == 0);
-					bts(tls.mask, bit);
-					snzm.arrive(i);
-				#elif VARIANT == SNZI || VARIANT == BIAS
-					snzi.arrive(i);
-				#elif VARIANT == BACK || VARIANT == BACKBIAS
-					snzi.arrive(i);
-					tls.rng2.set_raw_state( tls.rng1.get_raw_state());
-				#elif VARIANT == SNZM
-					snzm.arrive(i);
-				#elif VARIANT == BITMASK
-					numNonEmpty++;
-					size_t qword = i >> 6ull;
-					size_t bit   = i & 63ull;
-					assertf((list_mask[qword] & (1ul << bit)) == 0, "Before set %zu:%zu (%u), %zx & %zx", qword, bit, i, list_mask[qword].load(), (1ul << bit));
-					__attribute__((unused)) bool ret = bts(list_mask[qword], bit);
-					assert(!ret);
-					assertf((list_mask[qword] & (1ul << bit)) != 0, "After set %zu:%zu (%u), %zx & %zx", qword, bit, i, list_mask[qword].load(), (1ul << bit));
-				#else
-					numNonEmpty++;
-				#endif
-			}
-			#if VARIANT == VANILLA || VARIANT == BITMASK
-				assert(numNonEmpty <= (int)numLists);
-			#endif
-
-			// Unlock and return
-			lists[i].lock.unlock();
-
-			#ifndef NO_STATS
-				tls.pick.push.success++;
-				#if VARIANT == VANILLA || VARIANT == BITMASK
-					tls.empty.push.value += num;
-					tls.empty.push.count += 1;
-				#endif
-			#endif
-			return;
-		}
-    	}
-
-	__attribute__((noinline, hot)) node_t * pop() {
-		#if VARIANT == DISCOVER
-			assert(numLists <= 64);
-			while(snzm.query()) {
-				tls.pick.pop.mask_attempt++;
-				unsigned i, j;
-				{
-					// Pick first list totally randomly
-					i = tls.rng1.next() % numLists;
-
-					// Pick the other according to the bitmask
-					unsigned r = tls.rng1.next();
-
-					size_t mask = tls.mask.load(std::memory_order_relaxed);
-					if(mask == 0) {
-						tls.pick.pop.mask_reset++;
-						mask = (1U << numLists) - 1;
-						tls.mask.store(mask, std::memory_order_relaxed);
-					}
-
-					unsigned b = rand_bit(r, mask);
-
-					assertf(b < 64, "%zu %u", mask, b);
-
-					j = b;
-
-					assert(j < numLists);
-				}
-
-				if(auto node = try_pop(i, j)) return node;
-			}
-		#elif VARIANT == SNZI
-			while(snzi.query()) {
-				// Pick two lists at random
-				int i = tls.rng1.next() % numLists;
-				int j = tls.rng1.next() % numLists;
-
-				if(auto node = try_pop(i, j)) return node;
-			}
-
-		#elif VARIANT == BACK
-			while(snzi.query()) {
-				// Pick two lists at random
-				int i = tls.rng2.prev() % numLists;
-				int j = tls.rng2.prev() % numLists;
-
-				if(auto node = try_pop(i, j)) return node;
-			}
-
-		#elif VARIANT == BACKBIAS
-			while(snzi.query()) {
-				// Pick two lists at random
-				int i = idx_from_r(tls.rng2.prev(), true);
-				int j = idx_from_r(tls.rng2.prev(), true);
-
-				if(auto node = try_pop(i, j)) return node;
-			}
-
-		#elif VARIANT == BIAS
-			while(snzi.query()) {
-				// Pick two lists at random
-				unsigned ri = tls.rng1.next();
-				unsigned i;
-				unsigned j = tls.rng1.next();
-				if(0 == (ri & 0xF)) {
-					i = (ri >> 4) % numLists;
-				} else {
-					i = tls.my_queue + ((ri >> 4) % 4);
-					j = tls.my_queue + ((j >> 4) % 4);
-					tls.pick.pop.local++;
-				}
-				i %= numLists;
-				j %= numLists;
-
-				if(auto node = try_pop(i, j)) return node;
-			}
-		#elif VARIANT == SNZM
-			//*
-			while(snzm.query()) {
-				tls.pick.pop.mask_attempt++;
-				unsigned i, j;
-				{
-					// Pick two random number
-					unsigned ri = tls.rng1.next();
-					unsigned rj = tls.rng1.next();
-
-					// Pick two nodes from it
-					unsigned wdxi = ri & snzm.mask;
-					// unsigned wdxj = rj & snzm.mask;
-
-					// Get the masks from the nodes
-					// size_t maski = snzm.masks(wdxi);
-					size_t maskj = snzm.masks(wdxj);
-
-					if(maski == 0 && maskj == 0) continue;
-
-					#if defined(__BMI2__)
-						uint64_t idxsi = _pext_u64(snzm.indexes, maski);
-						// uint64_t idxsj = _pext_u64(snzm.indexes, maskj);
-
-						auto pi = __builtin_popcountll(maski);
-						// auto pj = __builtin_popcountll(maskj);
-
-						ri = pi ? ri & ((pi >> 3) - 1) : 0;
-						rj = pj ? rj & ((pj >> 3) - 1) : 0;
-
-						unsigned bi = (idxsi >> (ri << 3)) & 0xff;
-						unsigned bj = (idxsj >> (rj << 3)) & 0xff;
-					#else
-						unsigned bi = rand_bit(ri >> snzm.depth, maski);
-						unsigned bj = rand_bit(rj >> snzm.depth, maskj);
-					#endif
-
-					i = (bi << snzm.depth) | wdxi;
-					j = (bj << snzm.depth) | wdxj;
-
-					/* paranoid */ assertf(i < numLists, "%u %u", bj, wdxi);
-					/* paranoid */ assertf(j < numLists, "%u %u", bj, wdxj);
-				}
-
-				if(auto node = try_pop(i, j)) return node;
-			}
-			/*/
-			while(snzm.query()) {
-				// Pick two lists at random
-				int i = tls.rng1.next() % numLists;
-				int j = tls.rng1.next() % numLists;
-
-				if(auto node = try_pop(i, j)) return node;
-			}
-			//*/
-		#elif VARIANT == BITMASK
-			int nnempty;
-			while(0 != (nnempty = numNonEmpty)) {
-				tls.pick.pop.mask_attempt++;
-				unsigned i, j;
-				{
-					// Pick two lists at random
-					unsigned num = ((numLists - 1) >> 6) + 1;
-
-					unsigned ri = tls.rng1.next();
-					unsigned rj = tls.rng1.next();
-
-					unsigned wdxi = (ri >> 6u) % num;
-					unsigned wdxj = (rj >> 6u) % num;
-
-					size_t maski = list_mask[wdxi].load(std::memory_order_relaxed);
-					size_t maskj = list_mask[wdxj].load(std::memory_order_relaxed);
-
-					if(maski == 0 && maskj == 0) continue;
-
-					unsigned bi = rand_bit(ri, maski);
-					unsigned bj = rand_bit(rj, maskj);
-
-					assertf(bi < 64, "%zu %u", maski, bi);
-					assertf(bj < 64, "%zu %u", maskj, bj);
-
-					i = bi | (wdxi << 6);
-					j = bj | (wdxj << 6);
-
-					assertf(i < numLists, "%u", wdxi << 6);
-					assertf(j < numLists, "%u", wdxj << 6);
-				}
-
-				if(auto node = try_pop(i, j)) return node;
-			}
-		#else
-			while(numNonEmpty != 0) {
-				// Pick two lists at random
-				int i = tls.rng1.next() % numLists;
-				int j = tls.rng1.next() % numLists;
-
-				if(auto node = try_pop(i, j)) return node;
-			}
-		#endif
-
-		return nullptr;
-    	}
-
-private:
-	node_t * try_pop(unsigned i, unsigned j) {
-		#ifndef NO_STATS
-			tls.pick.pop.attempt++;
-		#endif
-
-		#if VARIANT == DISCOVER
-			if(lists[i].ts() > 0) bts(tls.mask, i); else btr(tls.mask, i);
-			if(lists[j].ts() > 0) bts(tls.mask, j); else btr(tls.mask, j);
-		#endif
-
-		// Pick the bet list
-		int w = i;
-		if( __builtin_expect(lists[j].ts() != 0, true) ) {
-			w = (lists[i].ts() < lists[j].ts()) ? i : j;
-		}
-
-		auto & list = lists[w];
-		// If list looks empty retry
-		if( list.ts() == 0 ) return nullptr;
-
-		// If we can't get the lock retry
-		if( !list.lock.try_lock() ) return nullptr;
-
-		#if VARIANT == VANILLA || VARIANT == BITMASK
-			__attribute__((unused)) int num = numNonEmpty;
-		#endif
-
-		// If list is empty, unlock and retry
-		if( list.ts() == 0 ) {
-			list.lock.unlock();
-			return nullptr;
-		}
-
-		// Actually pop the list
-		node_t * node;
-		bool emptied;
-		std::tie(node, emptied) = list.pop();
-		assert(node);
-
-		if(emptied) {
-			#if VARIANT == DISCOVER
-				size_t qword = w >> 6ull;
-				size_t bit   = w & 63ull;
-				assert(qword == 0);
-				__attribute__((unused)) bool ret = btr(tls.mask, bit);
-				snzm.depart(w);
-			#elif VARIANT == SNZI || VARIANT == BIAS || VARIANT == BACK || VARIANT == BACKBIAS
-				snzi.depart(w);
-			#elif VARIANT == SNZM
-				snzm.depart(w);
-			#elif VARIANT == BITMASK
-				numNonEmpty--;
-				size_t qword = w >> 6ull;
-				size_t bit   = w & 63ull;
-				assert((list_mask[qword] & (1ul << bit)) != 0);
-				__attribute__((unused)) bool ret = btr(list_mask[qword], bit);
-				assert(ret);
-				assert((list_mask[qword] & (1ul << bit)) == 0);
-			#else
-				numNonEmpty--;
-			#endif
-		}
-
-		// Unlock and return
-		list.lock.unlock();
-		#if VARIANT == VANILLA || VARIANT == BITMASK
-			assert(numNonEmpty >= 0);
-		#endif
-		#ifndef NO_STATS
-			tls.pick.pop.success++;
-			#if VARIANT == VANILLA || VARIANT == BITMASK
-				tls.empty.pop.value += num;
-				tls.empty.pop.count += 1;
-			#endif
-		#endif
-		return node;
-	}
-
-	inline unsigned idx_from_r(unsigned r, bool bias) {
-		unsigned i;
-		if(bias) {
-			if(0 == (r & 0x3F)) {
-				i = r >> 6;
-			} else {
-				i = tls.my_queue + ((r >> 6) % 4);
-				tls.pick.push.local++;
-			}
-		} else {
-			i = r;
-		}
-		return i % numLists;
-	}
-
-public:
-
-	static __attribute__((aligned(128))) thread_local struct TLS {
-		Random     rng1 = { unsigned(std::hash<std::thread::id>{}(std::this_thread::get_id()) ^ rdtscl()) };
-		Random     rng2 = { unsigned(std::hash<std::thread::id>{}(std::this_thread::get_id()) ^ rdtscl()) };
-		unsigned   my_queue = (ticket++) * 4;
-		pick_stat  pick;
-		empty_stat empty;
-		__attribute__((aligned(64))) std::atomic_size_t mask = { 0 };
-	} tls;
-
-private:
-	const unsigned numLists;
-    	__attribute__((aligned(64))) std::unique_ptr<intrusive_queue_t<node_t> []> lists;
-private:
-	#if VARIANT == SNZI || VARIANT == BACK
-		snzi_t snzi;
-	#elif VARIANT == BIAS || VARIANT == BACKBIAS
-		#ifdef SNZI_PACKED
-			snzip_t snzi;
-		#else
-			snzi_t snzi;
-		#endif
-	#elif VARIANT == SNZM || VARIANT == DISCOVER
-		snzm_t snzm;
-	#else
-		std::atomic_int numNonEmpty  = { 0 };  // number of non-empty lists
-	#endif
-	#if VARIANT == BITMASK
-		std::atomic_size_t list_mask[7] = { {0}, {0}, {0}, {0}, {0}, {0}, {0} }; // which queues are empty
-	#endif
-
-public:
-	static const constexpr size_t sizeof_queue = sizeof(intrusive_queue_t<node_t>);
-	static std::atomic_uint32_t ticket;
-
-#ifndef NO_STATS
-	static void stats_tls_tally() {
-		global_stats.pick.push.attempt += tls.pick.push.attempt;
-		global_stats.pick.push.success += tls.pick.push.success;
-		global_stats.pick.push.local += tls.pick.push.local;
-		global_stats.pick.pop .attempt += tls.pick.pop.attempt;
-		global_stats.pick.pop .success += tls.pick.pop.success;
-		global_stats.pick.pop .mask_attempt += tls.pick.pop.mask_attempt;
-		global_stats.pick.pop .mask_reset += tls.pick.pop.mask_reset;
-		global_stats.pick.pop .local += tls.pick.pop.local;
-
-		global_stats.qstat.push.value += tls.empty.push.value;
-		global_stats.qstat.push.count += tls.empty.push.count;
-		global_stats.qstat.pop .value += tls.empty.pop .value;
-		global_stats.qstat.pop .count += tls.empty.pop .count;
-	}
-
-private:
-	static struct GlobalStats {
-		struct {
-			struct {
-				std::atomic_size_t attempt = { 0 };
-				std::atomic_size_t success = { 0 };
-				std::atomic_size_t local = { 0 };
-			} push;
-			struct {
-				std::atomic_size_t attempt = { 0 };
-				std::atomic_size_t success = { 0 };
-				std::atomic_size_t mask_attempt = { 0 };
-				std::atomic_size_t mask_reset = { 0 };
-				std::atomic_size_t local = { 0 };
-			} pop;
-		} pick;
-		struct {
-			struct {
-				std::atomic_size_t value = { 0 };
-				std::atomic_size_t count = { 0 };
-			} push;
-			struct {
-				std::atomic_size_t value = { 0 };
-				std::atomic_size_t count = { 0 };
-			} pop;
-		} qstat;
-	} global_stats;
-
-public:
-	static void stats_print(std::ostream & os ) {
-		std::cout << "----- Relaxed List Stats -----" << std::endl;
-
-		const auto & global = global_stats;
-
-		double push_sur = (100.0 * double(global.pick.push.success) / global.pick.push.attempt);
-		double pop_sur  = (100.0 * double(global.pick.pop .success) / global.pick.pop .attempt);
-		double mpop_sur = (100.0 * double(global.pick.pop .success) / global.pick.pop .mask_attempt);
-		double rpop_sur = (100.0 * double(global.pick.pop .success) / global.pick.pop .mask_reset);
-
-		double push_len = double(global.pick.push.attempt     ) / global.pick.push.success;
-		double pop_len  = double(global.pick.pop .attempt     ) / global.pick.pop .success;
-		double mpop_len = double(global.pick.pop .mask_attempt) / global.pick.pop .success;
-		double rpop_len = double(global.pick.pop .mask_reset  ) / global.pick.pop .success;
-
-		os << "Push   Pick   : " << push_sur << " %, len " << push_len << " (" << global.pick.push.attempt      << " / " << global.pick.push.success << ")\n";
-		os << "Pop    Pick   : " << pop_sur  << " %, len " << pop_len  << " (" << global.pick.pop .attempt      << " / " << global.pick.pop .success << ")\n";
-		os << "TryPop Pick   : " << mpop_sur << " %, len " << mpop_len << " (" << global.pick.pop .mask_attempt << " / " << global.pick.pop .success << ")\n";
-		os << "Pop M Reset   : " << rpop_sur << " %, len " << rpop_len << " (" << global.pick.pop .mask_reset   << " / " << global.pick.pop .success << ")\n";
-
-		double avgQ_push = double(global.qstat.push.value) / global.qstat.push.count;
-		double avgQ_pop  = double(global.qstat.pop .value) / global.qstat.pop .count;
-		double avgQ      = double(global.qstat.push.value + global.qstat.pop .value) / (global.qstat.push.count + global.qstat.pop .count);
-		os << "Push   Avg Qs : " << avgQ_push << " (" << global.qstat.push.count << "ops)\n";
-		os << "Pop    Avg Qs : " << avgQ_pop  << " (" << global.qstat.pop .count << "ops)\n";
-		os << "Global Avg Qs : " << avgQ      << " (" << (global.qstat.push.count + global.qstat.pop .count) << "ops)\n";
-
-		os << "Local Push    : " << global.pick.push.local << "\n";
-		os << "Local Pop     : " << global.pick.pop .local << "\n";
-	}
-#endif
-};
Index: doc/theses/thierry_delisle_PhD/code/relaxed_list_layout.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/relaxed_list_layout.cpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,23 +1,0 @@
-#define NO_IO
-#define NDEBUG
-#include "relaxed_list.hpp"
-
-struct __attribute__((aligned(64))) Node {
-	static std::atomic_size_t creates;
-	static std::atomic_size_t destroys;
-
-	_LinksFields_t<Node> _links;
-
-	int value;
-	Node(int value): value(value) {
-		creates++;
-	}
-
-	~Node() {
-		destroys++;
-	}
-};
-
-int main() {
-	return sizeof(relaxed_list<Node>) + relaxed_list<Node>::sizeof_queue;
-}
Index: doc/theses/thierry_delisle_PhD/code/runperf.sh
===================================================================
--- doc/theses/thierry_delisle_PhD/code/runperf.sh	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,14 +1,0 @@
-#!/bin/bash
-set -e
-
-name=$1
-event=$2
-
-shift 2
-
-echo "perf record -F 99 -a -g -o raw/$name.data -e $event -- $@ > raw/$name.out"
-perf record -F 99 -a -g -o raw/$name.data -e $event -- $@ > raw/$name.out
-echo "=============================="
-cat raw/$name.out
-echo "=============================="
-./process.sh $name
Index: doc/theses/thierry_delisle_PhD/code/scale.sh
===================================================================
--- doc/theses/thierry_delisle_PhD/code/scale.sh	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,7 +1,0 @@
-#!/bin/bash
-taskset -c 24-31 ./a.out -t  1 -b churn | grep --color -E "(ns|Ops|Running)"
-taskset -c 24-31 ./a.out -t  2 -b churn | grep --color -E "(ns|Ops|Running)"
-taskset -c 24-31 ./a.out -t  4 -b churn | grep --color -E "(ns|Ops|Running)"
-taskset -c 24-31 ./a.out -t  8 -b churn | grep --color -E "(ns|Ops|Running)"
-taskset -c 16-31 ./a.out -t 16 -b churn | grep --color -E "(ns|Ops|Running)"
-taskset -c  0-31 ./a.out -t 32 -b churn | grep --color -E "(ns|Ops|Running)"
Index: doc/theses/thierry_delisle_PhD/code/snzi-packed.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/snzi-packed.hpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,179 +1,0 @@
-#pragma once
-
-#define SNZI_PACKED
-
-#include "utils.hpp"
-
-
-class snzip_t {
-	class node;
-	class node_aligned;
-public:
-	const unsigned mask;
-	const int root;
-	std::unique_ptr<snzip_t::node[]> leafs;
-	std::unique_ptr<snzip_t::node_aligned[]> nodes;
-
-	snzip_t(unsigned depth);
-
-	void arrive(int idx) {
-		// idx >>= 1;
-		idx %= mask;
-		leafs[idx].arrive();
-	}
-
-	void depart(int idx) {
-		// idx >>= 1;
-		idx %= mask;
-		leafs[idx].depart();
-	}
-
-	bool query() const {
-		return nodes[root].query();
-	}
-
-
-private:
-	class __attribute__((aligned(32))) node {
-		friend class snzip_t;
-	private:
-
-		union val_t {
-			static constexpr char Half = -1;
-
-			uint64_t _all;
-			struct __attribute__((packed)) {
-				char cnt;
-				uint64_t ver:56;
-			};
-
-			bool cas(val_t & exp, char _cnt, uint64_t _ver) volatile {
-				val_t t;
-				t.ver = _ver;
-				t.cnt = _cnt;
-				/* paranoid */ assert(t._all == ((_ver << 8) | ((unsigned char)_cnt)));
-				return __atomic_compare_exchange_n(&this->_all, &exp._all, t._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
-			}
-
-			bool cas(val_t & exp, const val_t & tar) volatile {
-				return __atomic_compare_exchange_n(&this->_all, &exp._all, tar._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
-			}
-
-			val_t() : _all(0) {}
-			val_t(const volatile val_t & o) : _all(o._all) {}
-		};
-
-		//--------------------------------------------------
-		// Hierarchical node
-		void arrive_h() {
-			int undoArr = 0;
-			bool success = false;
-			while(!success) {
-				auto x{ value };
-				/* paranoid */ assert(x.cnt <= 120);
-				if( x.cnt >= 1 ) {
-					if( value.cas(x, x.cnt + 1, x.ver ) ) {
-						success = true;
-					}
-				}
-				/* paranoid */ assert(x.cnt <= 120);
-				if( x.cnt == 0 ) {
-					if( value.cas(x, val_t::Half, x.ver + 1) ) {
-						success = true;
-						x.cnt = val_t::Half;
-						x.ver = x.ver + 1;
-					}
-				}
-				/* paranoid */ assert(x.cnt <= 120);
-				if( x.cnt == val_t::Half ) {
-					/* paranoid */ assert(parent);
-					if(undoArr == 2) {
-						undoArr--;
-					} else {
-						parent->arrive();
-					}
-					if( !value.cas(x, 1, x.ver) ) {
-						undoArr = undoArr + 1;
-					}
-				}
-			}
-
-			for(int i = 0; i < undoArr; i++) {
-				/* paranoid */ assert(parent);
-				parent->depart();
-			}
-		}
-
-		void depart_h() {
-			while(true) {
-				auto x = (const val_t)value;
-				/* paranoid */ assertf(x.cnt >= 1, "%d", x.cnt);
-				if( value.cas( x, x.cnt - 1, x.ver ) ) {
-					if( x.cnt == 1 ) {
-						/* paranoid */ assert(parent);
-						parent->depart();
-					}
-					return;
-				}
-			}
-		}
-
-		//--------------------------------------------------
-		// Root node
-		void arrive_r() {
-			__atomic_fetch_add(&value._all, 1, __ATOMIC_SEQ_CST);
-		}
-
-		void depart_r() {
-			__atomic_fetch_sub(&value._all, 1, __ATOMIC_SEQ_CST);
-		}
-
-	private:
-		volatile val_t value;
-		class node * parent = nullptr;
-
-		bool is_root() {
-			return parent == nullptr;
-		}
-
-	public:
-		void arrive() {
-			if(is_root()) arrive_r();
-			else arrive_h();
-		}
-
-		void depart() {
-			if(is_root()) depart_r();
-			else depart_h();
-		}
-
-		bool query() {
-			/* paranoid */ assert(is_root());
-			return value._all > 0;
-		}
-	};
-
-	class __attribute__((aligned(128))) node_aligned : public node {};
-};
-
-snzip_t::snzip_t(unsigned depth)
-	: mask( std::pow(2, depth) )
-	, root( ((std::pow(2, depth + 1) - 1) / (2 -1)) - 1 - mask )
-	, leafs(new node[ mask ]())
-	, nodes(new node_aligned[ root + 1 ]())
-{
-	int width = std::pow(2, depth);
-	int hwdith = width / 2;
-	std::cout << "SNZI: " << depth << "x" << width << "(" << mask - 1 << ") " << (sizeof(snzip_t::node) * (root + 1)) << " bytes" << std::endl;
-	for(int i = 0; i < width; i++) {
-		int idx = i % hwdith;
-		std::cout << i << " -> " << idx + width << std::endl;
-		leafs[i].parent = &nodes[ idx ];
-	}
-
-	for(int i = 0; i < root; i++) {
-		int idx = (i / 2) + hwdith;
-		std::cout << i + width << " -> " << idx + width << std::endl;
-		nodes[i].parent = &nodes[ idx ];
-	}
-}
Index: doc/theses/thierry_delisle_PhD/code/snzi.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/snzi.hpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,164 +1,0 @@
-#pragma once
-
-#include "utils.hpp"
-
-
-class snzi_t {
-	class node;
-public:
-	const unsigned mask;
-	const int root;
-	std::unique_ptr<snzi_t::node[]> nodes;
-
-	snzi_t(unsigned depth, unsigned base = 2);
-
-	void arrive(int idx) {
-		idx >>= 2;
-		idx %= mask;
-		nodes[idx].arrive();
-	}
-
-	void depart(int idx) {
-		idx >>= 2;
-		idx %= mask;
-		nodes[idx].depart();
-	}
-
-	bool query() const {
-		return nodes[root].query();
-	}
-
-
-private:
-	class __attribute__((aligned(128))) node {
-		friend class snzi_t;
-	private:
-
-		union val_t {
-			static constexpr char Half = -1;
-
-			uint64_t _all;
-			struct __attribute__((packed)) {
-				char cnt;
-				uint64_t ver:56;
-			};
-
-			bool cas(val_t & exp, char _cnt, uint64_t _ver) volatile {
-				val_t t;
-				t.ver = _ver;
-				t.cnt = _cnt;
-				/* paranoid */ assert(t._all == ((_ver << 8) | ((unsigned char)_cnt)));
-				return __atomic_compare_exchange_n(&this->_all, &exp._all, t._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
-			}
-
-			bool cas(val_t & exp, const val_t & tar) volatile {
-				return __atomic_compare_exchange_n(&this->_all, &exp._all, tar._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
-			}
-
-			val_t() : _all(0) {}
-			val_t(const volatile val_t & o) : _all(o._all) {}
-		};
-
-		//--------------------------------------------------
-		// Hierarchical node
-		void arrive_h() {
-			int undoArr = 0;
-			bool success = false;
-			while(!success) {
-				auto x{ value };
-				/* paranoid */ assert(x.cnt <= 120);
-				if( x.cnt >= 1 ) {
-					if( value.cas(x, x.cnt + 1, x.ver ) ) {
-						success = true;
-					}
-				}
-				/* paranoid */ assert(x.cnt <= 120);
-				if( x.cnt == 0 ) {
-					if( value.cas(x, val_t::Half, x.ver + 1) ) {
-						success = true;
-						x.cnt = val_t::Half;
-						x.ver = x.ver + 1;
-					}
-				}
-				/* paranoid */ assert(x.cnt <= 120);
-				if( x.cnt == val_t::Half ) {
-					/* paranoid */ assert(parent);
-					if(undoArr == 2) {
-						undoArr--;
-					} else {
-						parent->arrive();
-					}
-					if( !value.cas(x, 1, x.ver) ) {
-						undoArr = undoArr + 1;
-					}
-				}
-			}
-
-			for(int i = 0; i < undoArr; i++) {
-				/* paranoid */ assert(parent);
-				parent->depart();
-			}
-		}
-
-		void depart_h() {
-			while(true) {
-				auto x = (const val_t)value;
-				/* paranoid */ assertf(x.cnt >= 1, "%d", x.cnt);
-				if( value.cas( x, x.cnt - 1, x.ver ) ) {
-					if( x.cnt == 1 ) {
-						/* paranoid */ assert(parent);
-						parent->depart();
-					}
-					return;
-				}
-			}
-		}
-
-		//--------------------------------------------------
-		// Root node
-		void arrive_r() {
-			__atomic_fetch_add(&value._all, 1, __ATOMIC_SEQ_CST);
-		}
-
-		void depart_r() {
-			__atomic_fetch_sub(&value._all, 1, __ATOMIC_SEQ_CST);
-		}
-
-	private:
-		volatile val_t value;
-		class node * parent = nullptr;
-
-		bool is_root() {
-			return parent == nullptr;
-		}
-
-	public:
-		void arrive() {
-			if(is_root()) arrive_r();
-			else arrive_h();
-		}
-
-		void depart() {
-			if(is_root()) depart_r();
-			else depart_h();
-		}
-
-		bool query() {
-			/* paranoid */ assert(is_root());
-			return value._all > 0;
-		}
-	};
-};
-
-snzi_t::snzi_t(unsigned depth, unsigned base)
-	: mask( std::pow(base, depth) )
-	, root( ((std::pow(base, depth + 1) - 1) / (base -1)) - 1 )
-	, nodes(new node[ root + 1 ]())
-{
-	int width = std::pow(base, depth);
-	std::cout << "SNZI: " << depth << "x" << width << "(" << mask - 1 << ") " << (sizeof(snzi_t::node) * (root + 1)) << " bytes" << std::endl;
-	for(int i = 0; i < root; i++) {
-		std::cout << i << " -> " << (i / base) + width << std::endl;
-		nodes[i].parent = &nodes[(i / base) + width];
-	}
-}
Index: doc/theses/thierry_delisle_PhD/code/snzm.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/snzm.hpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,213 +1,0 @@
-#pragma once
-
-#include "utils.hpp"
-
-
-class snzm_t {
-	class node;
-public:
-	const unsigned depth;
-	const unsigned mask;
-	const int root;
-	std::unique_ptr<snzm_t::node[]> nodes;
-
-	#if defined(__BMI2__)
-		const uint64_t indexes = 0x0706050403020100;
-	#endif
-
-	snzm_t(unsigned numLists);
-
-	void arrive(int idx) {
-		int i = idx & mask;
-		nodes[i].arrive( idx >> depth);
-	}
-
-	void depart(int idx) {
-		int i = idx & mask;
-		nodes[i].depart( idx >> depth );
-	}
-
-	bool query() const {
-		return nodes[root].query();
-	}
-
-	uint64_t masks( unsigned node ) {
-		/* paranoid */ assert( (node & mask) == node );
-		#if defined(__BMI2__)
-			return nodes[node].mask_all;
-		#else
-			return nodes[node].mask;
-		#endif
-	}
-
-private:
-	class __attribute__((aligned(128))) node {
-		friend class snzm_t;
-	private:
-
-		union val_t {
-			static constexpr char Half = -1;
-
-			uint64_t _all;
-			struct __attribute__((packed)) {
-				char cnt;
-				uint64_t ver:56;
-			};
-
-			bool cas(val_t & exp, char _cnt, uint64_t _ver) volatile {
-				val_t t;
-				t.ver = _ver;
-				t.cnt = _cnt;
-				/* paranoid */ assert(t._all == ((_ver << 8) | ((unsigned char)_cnt)));
-				return __atomic_compare_exchange_n(&this->_all, &exp._all, t._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
-			}
-
-			bool cas(val_t & exp, const val_t & tar) volatile {
-				return __atomic_compare_exchange_n(&this->_all, &exp._all, tar._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
-			}
-
-			val_t() : _all(0) {}
-			val_t(const volatile val_t & o) : _all(o._all) {}
-		};
-
-		//--------------------------------------------------
-		// Hierarchical node
-		void arrive_h() {
-			int undoArr = 0;
-			bool success = false;
-			while(!success) {
-				auto x{ value };
-				/* paranoid */ assert(x.cnt <= 120);
-				if( x.cnt >= 1 ) {
-					if( value.cas(x, x.cnt + 1, x.ver ) ) {
-						success = true;
-					}
-				}
-				/* paranoid */ assert(x.cnt <= 120);
-				if( x.cnt == 0 ) {
-					if( value.cas(x, val_t::Half, x.ver + 1) ) {
-						success = true;
-						x.cnt = val_t::Half;
-						x.ver = x.ver + 1;
-					}
-				}
-				/* paranoid */ assert(x.cnt <= 120);
-				if( x.cnt == val_t::Half ) {
-					/* paranoid */ assert(parent);
-					parent->arrive();
-					if( !value.cas(x, 1, x.ver) ) {
-						undoArr = undoArr + 1;
-					}
-				}
-			}
-
-			for(int i = 0; i < undoArr; i++) {
-				/* paranoid */ assert(parent);
-				parent->depart();
-			}
-		}
-
-		void depart_h() {
-			while(true) {
-				auto x = (const val_t)value;
-				/* paranoid */ assertf(x.cnt >= 1, "%d", x.cnt);
-				if( value.cas( x, x.cnt - 1, x.ver ) ) {
-					if( x.cnt == 1 ) {
-						/* paranoid */ assert(parent);
-						parent->depart();
-					}
-					return;
-				}
-			}
-		}
-
-		//--------------------------------------------------
-		// Root node
-		void arrive_r() {
-			__atomic_fetch_add(&value._all, 1, __ATOMIC_SEQ_CST);
-		}
-
-		void depart_r() {
-			__atomic_fetch_sub(&value._all, 1, __ATOMIC_SEQ_CST);
-		}
-
-		//--------------------------------------------------
-		// Interface node
-		void arrive() {
-			/* paranoid */ assert(!is_leaf);
-			if(is_root()) arrive_r();
-			else arrive_h();
-		}
-
-		void depart() {
-			/* paranoid */ assert(!is_leaf);
-			if(is_root()) depart_r();
-			else depart_h();
-		}
-
-	private:
-		volatile val_t value;
-		#if defined(__BMI2__)
-			union __attribute__((packed)) {
-				volatile uint8_t mask[8];
-				volatile uint64_t mask_all;
-			};
-		#else
-			volatile size_t mask = 0;
-		#endif
-
-		class node * parent = nullptr;
-		bool is_leaf = false;
-
-		bool is_root() {
-			return parent == nullptr;
-		}
-
-	public:
-		void arrive( int bit ) {
-			/* paranoid */ assert( is_leaf );
-
-			arrive_h();
-			#if defined(__BMI2__)
-				/* paranoid */ assert( bit < 8 );
-				mask[bit] = 0xff;
-			#else
-				/* paranoid */ assert( (mask & ( 1 << bit )) == 0 );
-				__atomic_fetch_add( &mask, 1 << bit, __ATOMIC_RELAXED );
-			#endif
-
-		}
-
-		void depart( int bit ) {
-			/* paranoid */ assert( is_leaf );
-
-			#if defined(__BMI2__)
-				/* paranoid */ assert( bit < 8 );
-				mask[bit] = 0x00;
-			#else
-				/* paranoid */ assert( (mask & ( 1 << bit )) != 0 );
-				__atomic_fetch_sub( &mask, 1 << bit, __ATOMIC_RELAXED );
-			#endif
-			depart_h();
-		}
-
-		bool query() {
-			/* paranoid */ assert(is_root());
-			return value._all > 0;
-		}
-	};
-};
-
-snzm_t::snzm_t(unsigned numLists)
-	: depth( std::log2( numLists / 8 ) )
-	, mask( (1 << depth) - 1 )
-	, root( (1 << (depth + 1)) - 2 )
-	, nodes(new node[ root + 1 ]())
-{
-	int width = 1 << depth;
-	std::cout << "SNZI with Mask: " << depth << "x" << width << "(" << mask << ")" << std::endl;
-	for(int i = 0; i < root; i++) {
-		nodes[i].is_leaf = i < width;
-		nodes[i].parent = &nodes[(i / 2) + width ];
-	}
-}
Index: doc/theses/thierry_delisle_PhD/code/utils.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/utils.hpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,250 +1,0 @@
-#pragma once
-
-#include <cassert>
-#include <cstddef>
-#include <atomic>
-#include <chrono>
-#include <fstream>
-#include <iostream>
-
-#include <unistd.h>
-#include <sys/sysinfo.h>
-
-#include <x86intrin.h>
-
-// Barrier from
-class barrier_t {
-public:
-	barrier_t(size_t total)
-		: waiting(0)
-		, total(total)
-	{}
-
-	void wait(unsigned) {
-		size_t target = waiting++;
-		target = (target - (target % total)) + total;
-		while(waiting < target)
-			asm volatile("pause");
-
-		assert(waiting < (1ul << 60));
-    	}
-
-private:
-	std::atomic<size_t> waiting;
-	size_t total;
-};
-
-// class Random {
-// private:
-// 	unsigned int seed;
-// public:
-// 	Random(int seed) {
-// 		this->seed = seed;
-// 	}
-
-// 	/** returns pseudorandom x satisfying 0 <= x < n. **/
-// 	unsigned int next() {
-// 		seed ^= seed << 6;
-// 		seed ^= seed >> 21;
-// 		seed ^= seed << 7;
-// 		return seed;
-//     	}
-// };
-
-constexpr uint64_t extendedEuclidY(uint64_t a, uint64_t b);
-constexpr uint64_t extendedEuclidX(uint64_t a, uint64_t b){
-    return (b==0) ? 1 : extendedEuclidY(b, a - b * (a / b));
-}
-constexpr uint64_t extendedEuclidY(uint64_t a, uint64_t b){
-    return (b==0) ? 0 : extendedEuclidX(b, a - b * (a / b)) - (a / b) * extendedEuclidY(b, a - b * (a / b));
-}
-
-class Random {
-private:
-	uint64_t x;
-
-	static constexpr const uint64_t M  = 1ul << 48ul;
-	static constexpr const uint64_t A  = 25214903917;
-	static constexpr const uint64_t C  = 11;
-	static constexpr const uint64_t D  = 16;
-
-public:
-	static constexpr const uint64_t m  = M;
-	static constexpr const uint64_t a  = A;
-	static constexpr const uint64_t c  = C;
-	static constexpr const uint64_t d  = D;
-	static constexpr const uint64_t ai = extendedEuclidX(A, M);
-public:
-	Random(unsigned int seed) {
-		this->x = seed * a;
-	}
-
-	/** returns pseudorandom x satisfying 0 <= x < n. **/
-	unsigned int next() {
-		//nextx = (a * x + c) % m;
-		x = (A * x + C) & (M - 1);
-		return x >> D;
-	}
-	unsigned int prev() {
-		//prevx = (ainverse * (x - c)) mod m
-		unsigned int r = x >> D;
-		x = ai * (x - C) & (M - 1);
-		return r;
-	}
-
-	void set_raw_state(uint64_t _x) {
-		this->x = _x;
-	}
-
-	uint64_t get_raw_state() {
-		return this->x;
-	}
-};
-
-static inline long long rdtscl(void) {
-    unsigned int lo, hi;
-    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
-    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
-}
-
-static inline void affinity(int tid) {
-	static int cpus = get_nprocs();
-
-	cpu_set_t  mask;
-	CPU_ZERO(&mask);
-	int cpu = cpus - tid;  // Set CPU affinity to tid, starting from the end
-	CPU_SET(cpu, &mask);
-	auto result = sched_setaffinity(0, sizeof(mask), &mask);
-	if(result != 0) {
-		std::cerr << "Affinity set failed with " << result<< ", wanted " << cpu << std::endl;
-	}
-}
-
-static const constexpr std::size_t cache_line_size = 64;
-static inline void check_cache_line_size() {
-	std::cout << "Checking cache line size" << std::endl;
-	const std::string cache_file = "/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size";
-
-	std::ifstream ifs (cache_file, std::ifstream::in);
-
-	if(!ifs.good()) {
-		std::cerr << "Could not open file to check cache line size" << std::endl;
-		std::cerr << "Looking for: " << cache_file << std::endl;
-		std::exit(2);
-	}
-
-	size_t got;
-	ifs >> got;
-
-	ifs.close();
-
-	if(cache_line_size != got) {
-		std::cerr << "Cache line has incorrect size : " << got << std::endl;
-		std::exit(1);
-	}
-
-	std::cout << "Done" << std::endl;
-}
-
-using Clock = std::chrono::high_resolution_clock;
-using duration_t = std::chrono::duration<double>;
-using std::chrono::nanoseconds;
-
-template<typename Ratio, typename T>
-T duration_cast(T seconds) {
-	return std::chrono::duration_cast<std::chrono::duration<T, Ratio>>(std::chrono::duration<T>(seconds)).count();
-}
-
-static inline unsigned rand_bit(unsigned rnum, size_t mask) __attribute__((artificial));
-static inline unsigned rand_bit(unsigned rnum, size_t mask) {
-	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
-#if !defined(__BMI2__)
-	uint64_t v = mask;   // Input value to find position with rank r.
-	unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
-	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
-	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
-	unsigned int t;      // Bit count temporary.
-
-	// Do a normal parallel bit count for a 64-bit integer,
-	// but store all intermediate steps.
-	a =  v - ((v >> 1) & ~0UL/3);
-	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
-	c = (b + (b >> 4)) & ~0UL/0x11;
-	d = (c + (c >> 8)) & ~0UL/0x101;
-
-
-	t = (d >> 32) + (d >> 48);
-	// Now do branchless select!
-	s  = 64;
-	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
-	t  = (d >> (s - 16)) & 0xff;
-	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
-	t  = (c >> (s - 8)) & 0xf;
-	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
-	t  = (b >> (s - 4)) & 0x7;
-	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
-	t  = (a >> (s - 2)) & 0x3;
-	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
-	t  = (v >> (s - 1)) & 0x1;
-	s -= ((t - r) & 256) >> 8;
-	return s - 1;
-#else
-	uint64_t picked = _pdep_u64(1ul << bit, mask);
-	return picked ? __builtin_ctzl(picked) : 0;
-#endif
-}
-
-struct spinlock_t {
-	std::atomic_bool ll = { false };
-
-	inline void lock() {
-		while( __builtin_expect(ll.exchange(true),false) ) {
-			while(ll.load(std::memory_order_relaxed))
-				asm volatile("pause");
-		}
-	}
-
-	inline bool try_lock() {
-		return false == ll.exchange(true);
-	}
-
-	inline void unlock() {
-		ll.store(false, std::memory_order_release);
-	}
-
-	inline explicit operator bool() {
-		return ll.load(std::memory_order_relaxed);
-	}
-};
-
-static inline bool bts(std::atomic_size_t & target, size_t bit ) {
-	//*
-	int result = 0;
-	asm volatile(
-		"LOCK btsq %[bit], %[target]\n\t"
-		:"=@ccc" (result)
-		: [target] "m" (target), [bit] "r" (bit)
-	);
-	return result != 0;
-	/*/
-	size_t mask = 1ul << bit;
-	size_t ret = target.fetch_or(mask, std::memory_order_relaxed);
-	return (ret & mask) != 0;
-	//*/
-}
-
-static inline bool btr(std::atomic_size_t & target, size_t bit ) {
-	//*
-	int result = 0;
-	asm volatile(
-		"LOCK btrq %[bit], %[target]\n\t"
-		:"=@ccc" (result)
-		: [target] "m" (target), [bit] "r" (bit)
-	);
-	return result != 0;
-	/*/
-	size_t mask = 1ul << bit;
-	size_t ret = target.fetch_and(~mask, std::memory_order_relaxed);
-	return (ret & mask) != 0;
-	//*/
-}
Index: doc/theses/thierry_delisle_PhD/code/work_stealing.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/work_stealing.hpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,222 +1,0 @@
-#pragma once
-#define LIST_VARIANT work_stealing
-
-#include <cmath>
-#include <iomanip>
-#include <memory>
-#include <mutex>
-#include <type_traits>
-
-#include "assert.hpp"
-#include "utils.hpp"
-#include "links.hpp"
-#include "snzi.hpp"
-
-using namespace std;
-
-template<typename node_t>
-class __attribute__((aligned(128))) work_stealing {
-	static_assert(std::is_same<decltype(node_t::_links), _LinksFields_t<node_t>>::value, "Node must have a links field");
-
-public:
-	static const char * name() {
-		return "Work Stealing";
-	}
-
-	work_stealing(unsigned _numThreads, unsigned)
-		: numThreads(_numThreads)
-		, lists(new intrusive_queue_t<node_t>[numThreads])
-		, snzi( std::log2( numThreads / 2 ), 2 )
-
-	{
-		std::cout << "Constructing Work Stealer with " << numThreads << std::endl;
-	}
-
-	~work_stealing() {
-		std::cout << "Destroying Work Stealer" << std::endl;
-		lists.reset();
-	}
-
-	__attribute__((noinline, hot)) void push(node_t * node) {
-		node->_links.ts = rdtscl();
-		if( node->_links.hint > numThreads ) {
-			node->_links.hint = tls.rng.next() % numThreads;
-			tls.stat.push.nhint++;
-		}
-
-		unsigned i = node->_links.hint;
-		auto & list = lists[i];
-		list.lock.lock();
-
-		if(list.push( node )) {
-			snzi.arrive(i);
-		}
-
-		list.lock.unlock();
-	}
-
-	__attribute__((noinline, hot)) node_t * pop() {
-		node_t * node;
-		while(true) {
-			if(!snzi.query()) {
-				return nullptr;
-			}
-
-			{
-				unsigned i = tls.my_queue;
-				auto & list = lists[i];
-				if( list.ts() != 0 ) {
-					list.lock.lock();
-					if((node = try_pop(i))) {
-						tls.stat.pop.local.success++;
-						break;
-					}
-					else {
-						tls.stat.pop.local.elock++;
-					}
-				}
-				else {
-					tls.stat.pop.local.espec++;
-				}
-			}
-
-			tls.stat.pop.steal.tried++;
-
-			int i = tls.rng.next() % numThreads;
-			auto & list = lists[i];
-			if( list.ts() == 0 ) {
-				tls.stat.pop.steal.empty++;
-				continue;
-			}
-
-			if( !list.lock.try_lock() ) {
-				tls.stat.pop.steal.locked++;
-				continue;
-			}
-
-			if((node = try_pop(i))) {
-				tls.stat.pop.steal.success++;
-				break;
-			}
-		}
-
-		#if defined(READ)
-			const unsigned f = READ;
-			if(0 == (tls.it % f)) {
-				unsigned i = tls.it / f;
-				lists[i % numThreads].ts();
-			}
-			// lists[tls.it].ts();
-			tls.it++;
-		#endif
-
-
-		return node;
-	}
-
-private:
-	node_t * try_pop(unsigned i) {
-		auto & list = lists[i];
-
-		// If list is empty, unlock and retry
-		if( list.ts() == 0 ) {
-			list.lock.unlock();
-			return nullptr;
-		}
-
-			// Actually pop the list
-		node_t * node;
-		bool emptied;
-		std::tie(node, emptied) = list.pop();
-		assert(node);
-
-		if(emptied) {
-			snzi.depart(i);
-		}
-
-		// Unlock and return
-		list.lock.unlock();
-		return node;
-	}
-
-
-public:
-
-	static std::atomic_uint32_t ticket;
-	static __attribute__((aligned(128))) thread_local struct TLS {
-		Random     rng = { int(rdtscl()) };
-		unsigned   my_queue = ticket++;
-		#if defined(READ)
-			unsigned it = 0;
-		#endif
-		struct {
-			struct {
-				std::size_t nhint = { 0 };
-			} push;
-			struct {
-				struct {
-					std::size_t success = { 0 };
-					std::size_t espec = { 0 };
-					std::size_t elock = { 0 };
-				} local;
-				struct {
-					std::size_t tried   = { 0 };
-					std::size_t locked  = { 0 };
-					std::size_t empty   = { 0 };
-					std::size_t success = { 0 };
-				} steal;
-			} pop;
-		} stat;
-	} tls;
-
-private:
-	const unsigned numThreads;
-    	std::unique_ptr<intrusive_queue_t<node_t> []> lists;
-	__attribute__((aligned(64))) snzi_t snzi;
-
-#ifndef NO_STATS
-private:
-	static struct GlobalStats {
-		struct {
-			std::atomic_size_t nhint = { 0 };
-		} push;
-		struct {
-			struct {
-				std::atomic_size_t success = { 0 };
-				std::atomic_size_t espec = { 0 };
-				std::atomic_size_t elock = { 0 };
-			} local;
-			struct {
-				std::atomic_size_t tried   = { 0 };
-				std::atomic_size_t locked  = { 0 };
-				std::atomic_size_t empty   = { 0 };
-				std::atomic_size_t success = { 0 };
-			} steal;
-		} pop;
-	} global_stats;
-
-public:
-	static void stats_tls_tally() {
-		global_stats.push.nhint += tls.stat.push.nhint;
-		global_stats.pop.local.success += tls.stat.pop.local.success;
-		global_stats.pop.local.espec   += tls.stat.pop.local.espec  ;
-		global_stats.pop.local.elock   += tls.stat.pop.local.elock  ;
-		global_stats.pop.steal.tried   += tls.stat.pop.steal.tried  ;
-		global_stats.pop.steal.locked  += tls.stat.pop.steal.locked ;
-		global_stats.pop.steal.empty   += tls.stat.pop.steal.empty  ;
-		global_stats.pop.steal.success += tls.stat.pop.steal.success;
-	}
-
-	static void stats_print(std::ostream & os ) {
-		std::cout << "----- Work Stealing Stats -----" << std::endl;
-
-		double stealSucc = double(global_stats.pop.steal.success) / global_stats.pop.steal.tried;
-		os << "Push to new Q : " << std::setw(15) << global_stats.push.nhint << "\n";
-		os << "Local Pop     : " << std::setw(15) << global_stats.pop.local.success << "\n";
-		os << "Steal Pop     : " << std::setw(15) << global_stats.pop.steal.success << "(" << global_stats.pop.local.espec << "s, " << global_stats.pop.local.elock << "l)\n";
-		os << "Steal Success : " << std::setw(15) << stealSucc << "(" << global_stats.pop.steal.tried << " tries)\n";
-		os << "Steal Fails   : " << std::setw(15) << global_stats.pop.steal.empty << "e, " << global_stats.pop.steal.locked << "l\n";
-	}
-private:
-#endif
-};
