#pragma once #include #include #include #include #include #include #include #include #include // Barrier from class barrier_t { public: barrier_t(size_t total) : waiting(0) , total(total) {} void wait(unsigned) { size_t target = waiting++; target = (target - (target % total)) + total; while(waiting < target) asm volatile("pause"); assert(waiting < (1ul << 60)); } private: std::atomic waiting; size_t total; }; class Random { private: unsigned int seed; public: Random(int seed) { this->seed = seed; } /** returns pseudorandom x satisfying 0 <= x < n. **/ unsigned int next() { seed ^= seed << 6; seed ^= seed >> 21; seed ^= seed << 7; return seed; } }; static inline long long rdtscl(void) { unsigned int lo, hi; __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 ); } static inline void affinity(int tid) { static int cpus = get_nprocs(); cpu_set_t mask; CPU_ZERO(&mask); int cpu = cpus - tid; // Set CPU affinity to tid, starting from the end CPU_SET(cpu, &mask); auto result = sched_setaffinity(0, sizeof(mask), &mask); if(result != 0) { std::cerr << "Affinity set failed with " << result<< ", wanted " << cpu << std::endl; } } static const constexpr std::size_t cache_line_size = 64; static inline void check_cache_line_size() { std::cout << "Checking cache line size" << std::endl; const std::string cache_file = "/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size"; std::ifstream ifs (cache_file, std::ifstream::in); if(!ifs.good()) { std::cerr << "Could not open file to check cache line size" << std::endl; std::cerr << "Looking for: " << cache_file << std::endl; std::exit(2); } size_t got; ifs >> got; ifs.close(); if(cache_line_size != got) { std::cerr << "Cache line has incorrect size : " << got << std::endl; std::exit(1); } std::cout << "Done" << std::endl; } using Clock = std::chrono::high_resolution_clock; using duration_t = std::chrono::duration; using std::chrono::nanoseconds; template T duration_cast(T seconds) { return std::chrono::duration_cast>(std::chrono::duration(seconds)).count(); } static inline unsigned rand_bit(unsigned rnum, size_t mask) { unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0; #if !defined(__BMI2__) uint64_t v = mask; // Input value to find position with rank r. unsigned int r = bit + 1;// Input: bit's desired rank [1-64]. unsigned int s; // Output: Resulting position of bit with rank r [1-64] uint64_t a, b, c, d; // Intermediate temporaries for bit count. unsigned int t; // Bit count temporary. // Do a normal parallel bit count for a 64-bit integer, // but store all intermediate steps. a = v - ((v >> 1) & ~0UL/3); b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5); c = (b + (b >> 4)) & ~0UL/0x11; d = (c + (c >> 8)) & ~0UL/0x101; t = (d >> 32) + (d >> 48); // Now do branchless select! s = 64; s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8)); t = (d >> (s - 16)) & 0xff; s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8)); t = (c >> (s - 8)) & 0xf; s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8)); t = (b >> (s - 4)) & 0x7; s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8)); t = (a >> (s - 2)) & 0x3; s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8)); t = (v >> (s - 1)) & 0x1; s -= ((t - r) & 256) >> 8; return s - 1; #else uint64_t picked = _pdep_u64(1ul << bit, mask); return picked ? __builtin_ctzl(picked) : 0; #endif }