#pragma once #include #include #include #include #include #include #include #include #include // Barrier from class barrier_t { public: barrier_t(size_t total) : waiting(0) , total(total) {} void wait(unsigned) { size_t target = waiting++; target = (target - (target % total)) + total; while(waiting < target) asm volatile("pause"); assert(waiting < (1ul << 60)); } private: std::atomic waiting; size_t total; }; class Random { private: unsigned int seed; public: Random(int seed) { this->seed = seed; } /** returns pseudorandom x satisfying 0 <= x < n. **/ unsigned int next() { seed ^= seed << 6; seed ^= seed >> 21; seed ^= seed << 7; return seed; } }; static inline long long rdtscl(void) { unsigned int lo, hi; __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 ); } void affinity(int tid) { static int cpus = get_nprocs(); cpu_set_t mask; CPU_ZERO(&mask); int cpu = cpus - tid; // Set CPU affinity to tid, starting from the end CPU_SET(cpu, &mask); auto result = sched_setaffinity(0, sizeof(mask), &mask); if(result != 0) { std::cerr << "Affinity set failed with " << result<< ", wanted " << cpu << std::endl; } } static const constexpr std::size_t cache_line_size = 64; void check_cache_line_size() { std::cout << "Checking cache line size" << std::endl; const std::string cache_file = "/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size"; std::ifstream ifs (cache_file, std::ifstream::in); if(!ifs.good()) { std::cerr << "Could not open file to check cache line size" << std::endl; std::cerr << "Looking for: " << cache_file << std::endl; std::exit(2); } size_t got; ifs >> got; ifs.close(); if(cache_line_size != got) { std::cerr << "Cache line has incorrect size : " << got << std::endl; std::exit(1); } std::cout << "Done" << std::endl; } using Clock = std::chrono::high_resolution_clock; using duration_t = std::chrono::duration; using std::chrono::nanoseconds; template T duration_cast(T seconds) { return std::chrono::duration_cast>(std::chrono::duration(seconds)).count(); } // from geeksforgeeks const constexpr unsigned num_to_bits[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; /* Recursively get nibble of a given number and map them in the array */ __attribute__((noinline)) unsigned countSetBits(size_t num) { unsigned nibble = 0; if (0 == num) return num_to_bits[0]; // Find last nibble nibble = num & 0xf; // Use pre-stored values to find count // in last nibble plus recursively add // remaining nibbles. return num_to_bits[nibble] + countSetBits(num >> 4); } __attribute__((noinline)) unsigned nthSetBit(size_t mask, unsigned bit) { uint64_t v = mask; // Input value to find position with rank r. unsigned int r = bit;// Input: bit's desired rank [1-64]. unsigned int s; // Output: Resulting position of bit with rank r [1-64] uint64_t a, b, c, d; // Intermediate temporaries for bit count. unsigned int t; // Bit count temporary. // Do a normal parallel bit count for a 64-bit integer, // but store all intermediate steps. // a = (v & 0x5555...) + ((v >> 1) & 0x5555...); a = v - ((v >> 1) & ~0UL/3); // b = (a & 0x3333...) + ((a >> 2) & 0x3333...); b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5); // c = (b & 0x0f0f...) + ((b >> 4) & 0x0f0f...); c = (b + (b >> 4)) & ~0UL/0x11; // d = (c & 0x00ff...) + ((c >> 8) & 0x00ff...); d = (c + (c >> 8)) & ~0UL/0x101; t = (d >> 32) + (d >> 48); // Now do branchless select! s = 64; // if (r > t) {s -= 32; r -= t;} s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8)); t = (d >> (s - 16)) & 0xff; // if (r > t) {s -= 16; r -= t;} s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8)); t = (c >> (s - 8)) & 0xf; // if (r > t) {s -= 8; r -= t;} s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8)); t = (b >> (s - 4)) & 0x7; // if (r > t) {s -= 4; r -= t;} s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8)); t = (a >> (s - 2)) & 0x3; // if (r > t) {s -= 2; r -= t;} s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8)); t = (v >> (s - 1)) & 0x1; // if (r > t) s--; s -= ((t - r) & 256) >> 8; s = 65 - s; return s; } // __attribute__((noinline)) unsigned int nth_bit_set(uint64_t value, unsigned int n) // { // uint64_t mask = 0x00000000FFFFFFFFul; // unsigned int size = 32u; // unsigned int base = 0u; // if(value == 0) return 0; // assertf(n < (unsigned)__builtin_popcountl(value), "%u < %d (%zu)", n, __builtin_popcountl(value), value); // n++; // while (size > 0) { // const unsigned int count = __builtin_popcountl(value & mask); // if (n > count) { // base += size; // size >>= 1; // mask |= mask << size; // } else { // size >>= 1; // mask >>= size; // } // } // return base; // } static inline uint64_t rotl64 (uint64_t n, unsigned int c) { const unsigned int mask = (8*sizeof(n) - 1); // assumes width is a power of 2. c &= mask; return (n<>( (-c)&mask )); } static inline uint64_t rotr64 (uint64_t n, unsigned int c) { const unsigned int mask = (8*sizeof(n) - 1); // assert ( (c<=mask) &&"rotate by type width or more"); c &= mask; return (n>>c) | (n<<( (-c)&mask )); }