#pragma once #include #include #include #include #include #include #include #include #include // Barrier from class barrier_t { public: barrier_t(size_t total) : waiting(0) , total(total) {} void wait(unsigned) { size_t target = waiting++; target = (target - (target % total)) + total; while(waiting < target) asm volatile("pause"); assert(waiting < (1ul << 60)); } private: std::atomic waiting; size_t total; }; // class Random { // private: // unsigned int seed; // public: // Random(int seed) { // this->seed = seed; // } // /** returns pseudorandom x satisfying 0 <= x < n. **/ // unsigned int next() { // seed ^= seed << 6; // seed ^= seed >> 21; // seed ^= seed << 7; // return seed; // } // }; constexpr uint64_t extendedEuclidY(uint64_t a, uint64_t b); constexpr uint64_t extendedEuclidX(uint64_t a, uint64_t b){ return (b==0) ? 1 : extendedEuclidY(b, a - b * (a / b)); } constexpr uint64_t extendedEuclidY(uint64_t a, uint64_t b){ return (b==0) ? 0 : extendedEuclidX(b, a - b * (a / b)) - (a / b) * extendedEuclidY(b, a - b * (a / b)); } class Random { private: uint64_t x; static constexpr const uint64_t M = 1ul << 48ul; static constexpr const uint64_t A = 25214903917; static constexpr const uint64_t C = 11; static constexpr const uint64_t D = 16; public: static constexpr const uint64_t m = M; static constexpr const uint64_t a = A; static constexpr const uint64_t c = C; static constexpr const uint64_t d = D; static constexpr const uint64_t ai = extendedEuclidX(A, M); public: Random(unsigned int seed) { this->x = seed * a; } /** returns pseudorandom x satisfying 0 <= x < n. **/ unsigned int next() { //nextx = (a * x + c) % m; x = (A * x + C) & (M - 1); return x >> D; } unsigned int prev() { //prevx = (ainverse * (x - c)) mod m unsigned int r = x >> D; x = ai * (x - C) & (M - 1); return r; } void set_raw_state(uint64_t _x) { this->x = _x; } uint64_t get_raw_state() { return this->x; } }; static inline long long rdtscl(void) { unsigned int lo, hi; __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 ); } static inline void affinity(int tid) { static int cpus = get_nprocs(); cpu_set_t mask; CPU_ZERO(&mask); int cpu = cpus - tid; // Set CPU affinity to tid, starting from the end CPU_SET(cpu, &mask); auto result = sched_setaffinity(0, sizeof(mask), &mask); if(result != 0) { std::cerr << "Affinity set failed with " << result<< ", wanted " << cpu << std::endl; } } static const constexpr std::size_t cache_line_size = 64; static inline void check_cache_line_size() { std::cout << "Checking cache line size" << std::endl; const std::string cache_file = "/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size"; std::ifstream ifs (cache_file, std::ifstream::in); if(!ifs.good()) { std::cerr << "Could not open file to check cache line size" << std::endl; std::cerr << "Looking for: " << cache_file << std::endl; std::exit(2); } size_t got; ifs >> got; ifs.close(); if(cache_line_size != got) { std::cerr << "Cache line has incorrect size : " << got << std::endl; std::exit(1); } std::cout << "Done" << std::endl; } using Clock = std::chrono::high_resolution_clock; using duration_t = std::chrono::duration; using std::chrono::nanoseconds; template T duration_cast(T seconds) { return std::chrono::duration_cast>(std::chrono::duration(seconds)).count(); } static inline unsigned rand_bit(unsigned rnum, size_t mask) __attribute__((artificial)); static inline unsigned rand_bit(unsigned rnum, size_t mask) { unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0; #if !defined(__BMI2__) uint64_t v = mask; // Input value to find position with rank r. unsigned int r = bit + 1;// Input: bit's desired rank [1-64]. unsigned int s; // Output: Resulting position of bit with rank r [1-64] uint64_t a, b, c, d; // Intermediate temporaries for bit count. unsigned int t; // Bit count temporary. // Do a normal parallel bit count for a 64-bit integer, // but store all intermediate steps. a = v - ((v >> 1) & ~0UL/3); b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5); c = (b + (b >> 4)) & ~0UL/0x11; d = (c + (c >> 8)) & ~0UL/0x101; t = (d >> 32) + (d >> 48); // Now do branchless select! s = 64; s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8)); t = (d >> (s - 16)) & 0xff; s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8)); t = (c >> (s - 8)) & 0xf; s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8)); t = (b >> (s - 4)) & 0x7; s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8)); t = (a >> (s - 2)) & 0x3; s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8)); t = (v >> (s - 1)) & 0x1; s -= ((t - r) & 256) >> 8; return s - 1; #else uint64_t picked = _pdep_u64(1ul << bit, mask); return picked ? __builtin_ctzl(picked) : 0; #endif } struct spinlock_t { std::atomic_bool ll = { false }; inline void lock() { while( __builtin_expect(ll.exchange(true),false) ) { while(ll.load(std::memory_order_relaxed)) asm volatile("pause"); } } inline bool try_lock() { return false == ll.exchange(true); } inline void unlock() { ll.store(false, std::memory_order_release); } inline explicit operator bool() { return ll.load(std::memory_order_relaxed); } }; static inline bool bts(std::atomic_size_t & target, size_t bit ) { //* int result = 0; asm volatile( "LOCK btsq %[bit], %[target]\n\t" :"=@ccc" (result) : [target] "m" (target), [bit] "r" (bit) ); return result != 0; /*/ size_t mask = 1ul << bit; size_t ret = target.fetch_or(mask, std::memory_order_relaxed); return (ret & mask) != 0; //*/ } static inline bool btr(std::atomic_size_t & target, size_t bit ) { //* int result = 0; asm volatile( "LOCK btrq %[bit], %[target]\n\t" :"=@ccc" (result) : [target] "m" (target), [bit] "r" (bit) ); return result != 0; /*/ size_t mask = 1ul << bit; size_t ret = target.fetch_and(~mask, std::memory_order_relaxed); return (ret & mask) != 0; //*/ }