| 1 | #pragma once
 | 
|---|
| 2 | 
 | 
|---|
#include <cassert>
#include <cerrno>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <atomic>
#include <chrono>
#include <fstream>
#include <iostream>
#include <string>

#include <sched.h>
#include <unistd.h>
#include <sys/sysinfo.h>

#if defined(__BMI2__)
#include <immintrin.h>
#endif
| 12 | 
 | 
|---|
| 13 | // #include <x86intrin.h>
 | 
|---|
| 14 | 
 | 
|---|
| 15 | // class Random {
 | 
|---|
| 16 | // private:
 | 
|---|
| 17 | //      unsigned int seed;
 | 
|---|
| 18 | // public:
 | 
|---|
| 19 | //      Random(int seed) {
 | 
|---|
| 20 | //              this->seed = seed;
 | 
|---|
| 21 | //      }
 | 
|---|
| 22 | 
 | 
|---|
| 23 | //      /** returns pseudorandom x satisfying 0 <= x < n. **/
 | 
|---|
| 24 | //      unsigned int next() {
 | 
|---|
| 25 | //              seed ^= seed << 6;
 | 
|---|
| 26 | //              seed ^= seed >> 21;
 | 
|---|
| 27 | //              seed ^= seed << 7;
 | 
|---|
| 28 | //              return seed;
 | 
|---|
| 29 | //      }
 | 
|---|
| 30 | // };
 | 
|---|
| 31 | 
 | 
|---|
// Bezout coefficients (x, y) satisfying a*x + b*y == gcd(a, b).
// All arithmetic is unsigned, so negative coefficients are represented
// modulo 2^64.
struct ExtendedEuclidResult {
    uint64_t x;
    uint64_t y;
};

// Lifts the coefficients of the sub-problem (b, a % b) to those of (a, b),
// where q == a / b.
constexpr ExtendedEuclidResult extendedEuclidStep(uint64_t q, ExtendedEuclidResult inner) {
    return ExtendedEuclidResult{ inner.y, inner.x - q * inner.y };
}

// Extended Euclidean algorithm computing both coefficients in a single
// recursion, so the number of evaluations is linear in the number of
// division steps (the mutually recursive form re-evaluated every
// sub-problem twice per level).
constexpr ExtendedEuclidResult extendedEuclid(uint64_t a, uint64_t b) {
    return (b == 0) ? ExtendedEuclidResult{ 1, 0 }
                    : extendedEuclidStep(a / b, extendedEuclid(b, a % b));
}

constexpr uint64_t extendedEuclidY(uint64_t a, uint64_t b);

// Returns x such that a*x + b*y == gcd(a, b) (modulo 2^64).
constexpr uint64_t extendedEuclidX(uint64_t a, uint64_t b){
    return extendedEuclid(a, b).x;
}

// Returns y such that a*x + b*y == gcd(a, b) (modulo 2^64).
constexpr uint64_t extendedEuclidY(uint64_t a, uint64_t b){
    return extendedEuclid(a, b).y;
}
| 39 | 
 | 
|---|
| 40 | class Random {
 | 
|---|
| 41 | private:
 | 
|---|
| 42 |         uint64_t x;
 | 
|---|
| 43 | 
 | 
|---|
| 44 |         static constexpr const uint64_t M  = 1ul << 48ul;
 | 
|---|
| 45 |         static constexpr const uint64_t A  = 25214903917;
 | 
|---|
| 46 |         static constexpr const uint64_t C  = 11;
 | 
|---|
| 47 |         static constexpr const uint64_t D  = 16;
 | 
|---|
| 48 | 
 | 
|---|
| 49 | public:
 | 
|---|
| 50 |         static constexpr const uint64_t m  = M;
 | 
|---|
| 51 |         static constexpr const uint64_t a  = A;
 | 
|---|
| 52 |         static constexpr const uint64_t c  = C;
 | 
|---|
| 53 |         static constexpr const uint64_t d  = D;
 | 
|---|
| 54 |         static constexpr const uint64_t ai = extendedEuclidX(A, M);
 | 
|---|
| 55 | public:
 | 
|---|
| 56 |         Random(unsigned int seed) {
 | 
|---|
| 57 |                 this->x = seed * a;
 | 
|---|
| 58 |         }
 | 
|---|
| 59 | 
 | 
|---|
| 60 |         /** returns pseudorandom x satisfying 0 <= x < n. **/
 | 
|---|
| 61 |         unsigned int next() {
 | 
|---|
| 62 |                 //nextx = (a * x + c) % m;
 | 
|---|
| 63 |                 x = (A * x + C) & (M - 1);
 | 
|---|
| 64 |                 return x >> D;
 | 
|---|
| 65 |         }
 | 
|---|
| 66 |         unsigned int prev() {
 | 
|---|
| 67 |                 //prevx = (ainverse * (x - c)) mod m
 | 
|---|
| 68 |                 unsigned int r = x >> D;
 | 
|---|
| 69 |                 x = ai * (x - C) & (M - 1);
 | 
|---|
| 70 |                 return r;
 | 
|---|
| 71 |         }
 | 
|---|
| 72 | 
 | 
|---|
| 73 |         void set_raw_state(uint64_t _x) {
 | 
|---|
| 74 |                 this->x = _x;
 | 
|---|
| 75 |         }
 | 
|---|
| 76 | 
 | 
|---|
| 77 |         uint64_t get_raw_state() {
 | 
|---|
| 78 |                 return this->x;
 | 
|---|
| 79 |         }
 | 
|---|
| 80 | };
 | 
|---|
| 81 | 
 | 
|---|
/**
 * Reads a free-running hardware counter.
 *
 * x86 / x86-64: the 64-bit time-stamp counter read with `rdtsc`.
 * AArch64:      the virtual timer count register `cntvct_el0`.
 *
 * The AArch64 branch is no longer selected for 32-bit ARM: `cntvct_el0`
 * and a 64-bit "=r" operand only exist on AArch64, so that configuration
 * could not assemble. Unsupported targets fail with the #error below.
 */
static inline long long int rdtscl(void) {
    #if defined( __i386 ) || defined( __x86_64 )
        unsigned int lo, hi;
        // rdtsc returns the low half in EAX and the high half in EDX.
        __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
        return static_cast<long long int>(
            static_cast<unsigned long long>(lo) | (static_cast<unsigned long long>(hi) << 32)
        );
    #elif defined( __aarch64__ )
        // https://github.com/google/benchmark/blob/v1.1.0/src/cycleclock.h#L116
        long long int virtual_timer_value;
        asm volatile("mrs %0, cntvct_el0" : "=r"(virtual_timer_value));
        return virtual_timer_value;
    #else
        #error unsupported hardware architecture
    #endif
}
| 96 | 
 | 
|---|
// Pause(): hint instruction to place in the body of a busy-wait loop.
#if defined( __x86_64 ) || defined( __i386 )
    // x86 family: PAUSE
    #define Pause() __asm__ __volatile__ ( "pause" : : : )
#elif defined( __ARM_ARCH )
    // ARM family: YIELD
    #define Pause() __asm__ __volatile__ ( "YIELD" : : : )
#else
    #error unsupported architecture
#endif
| 104 | 
 | 
|---|
/**
 * Pins the calling thread to a single CPU chosen from the thread id.
 *
 * CPUs are assigned counting down from the processor count returned by
 * get_nprocs(): tid maps to CPU index (cpus - tid).
 * NOTE(review): tid == 0 maps to index `cpus` itself, which looks like one
 * past the last CPU - presumably callers pass 1-based ids; confirm.
 *
 * On failure a diagnostic is written to std::cerr and the function returns
 * normally. sched_setaffinity() reports every failure as -1, so the
 * message carries errno and its description instead of the return value.
 */
static inline void affinity(int tid) {
    static int cpus = get_nprocs();

    const int cpu = cpus - tid;  // Set CPU affinity to tid, starting from the end

    cpu_set_t mask;
    CPU_ZERO(&mask);
    CPU_SET(cpu, &mask);

    if(sched_setaffinity(0, sizeof(mask), &mask) != 0) {
        const int err = errno;  // capture before stream output can disturb it
        std::cerr << "Affinity set failed with " << err << " (" << std::strerror(err) << "), wanted " << cpu << std::endl;
    }
}
| 117 | 
 | 
|---|
// Cache line size, in bytes, that the rest of this file is built around.
static const constexpr std::size_t cache_line_size = 64;

/**
 * Compares cache_line_size with the coherency line size reported by sysfs
 * for cpu0 / cache index0.
 *
 * Terminates the process with exit code 2 when the sysfs file cannot be
 * opened and with exit code 1 when the reported size differs.
 */
static inline void check_cache_line_size() {
    std::cout << "Checking cache line size" << std::endl;

    const std::string sysfs_path = "/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size";
    std::ifstream input(sysfs_path, std::ifstream::in);

    if(!input.good()) {
        std::cerr << "Could not open file to check cache line size" << std::endl;
        std::cerr << "Looking for: " << sysfs_path << std::endl;
        std::exit(2);
    }

    // Read the single number stored in the file.
    size_t reported;
    input >> reported;
    input.close();

    if(reported != cache_line_size) {
        std::cerr << "Cache line has incorrect size : " << reported << std::endl;
        std::exit(1);
    }

    std::cout << "Done" << std::endl;
}
| 143 | 
 | 
|---|
// Timing vocabulary shared by users of this header.
using Clock = std::chrono::high_resolution_clock;
using duration_t = std::chrono::duration<double>;
using std::chrono::nanoseconds;

/**
 * Converts a plain number of seconds into the unit described by Ratio.
 *
 * @tparam Ratio  std::ratio giving the target unit in seconds
 *                (e.g. std::milli for milliseconds).
 * @tparam T      arithmetic representation used for input and output.
 * @param seconds amount of time expressed in seconds.
 * @return the same amount of time counted in units of Ratio.
 */
template<typename Ratio, typename T>
T duration_cast(T seconds) {
    using source_t = std::chrono::duration<T>;
    using target_t = std::chrono::duration<T, Ratio>;

    const source_t as_seconds(seconds);
    const target_t converted = std::chrono::duration_cast<target_t>(as_seconds);
    return converted.count();
}
| 152 | 
 | 
|---|
/**
 * Picks one set bit of `mask` using `rnum` and returns its bit index
 * (0 = least significant bit). Returns 0 when `mask` is empty.
 *
 * The rank of the chosen bit is rnum modulo the number of set bits.
 * With BMI2 the rank is counted from the least significant set bit
 * (via _pdep_u64, whose intrinsics header must be in scope); the portable
 * path counts the rank from the most significant set bit. Both always
 * return the index of a bit that is set in `mask`.
 */
static inline unsigned rand_bit(unsigned rnum, size_t mask) __attribute__((artificial));
static inline unsigned rand_bit(unsigned rnum, size_t mask) {
    // 0-based rank of the set bit to select; an empty mask uses rank 0.
    unsigned bit = 0;
    if(mask) {
        bit = rnum % __builtin_popcountl(mask);
    }
#if defined(__BMI2__)
    // Deposit a lone 1 onto the position of the selected set bit.
    const uint64_t picked = _pdep_u64(1ul << bit, mask);
    if(!picked) {
        return 0;
    }
    return __builtin_ctzl(picked);
#else
    const uint64_t word = mask;      // Value in which to find the bit with the wanted rank.
    unsigned int rank = bit + 1;     // 1-based rank still to be skipped/selected [1-64].
    unsigned int pos = 64;           // Candidate position, narrowed down step by step.
    unsigned int count;              // Set bits in the field currently examined.

    // Parallel bit count of a 64-bit integer, keeping every level:
    // per-2-bit, per-4-bit, per-8-bit and per-16-bit field sums.
    const uint64_t sums2  = word - ((word >> 1) & ~0UL/3);
    const uint64_t sums4  = (sums2 & ~0UL/5) + ((sums2 >> 2) & ~0UL/5);
    const uint64_t sums8  = (sums4 + (sums4 >> 4)) & ~0UL/0x11;
    const uint64_t sums16 = (sums8 + (sums8 >> 8)) & ~0UL/0x101;

    // Branchless narrowing step: when rank exceeds count, the unsigned
    // subtraction wraps and sets bit 8 of the difference. In that case the
    // position moves down by (256 >> shift) and count is removed from rank;
    // otherwise neither changes.
    auto narrow = [&](unsigned int shift) {
        const unsigned int diff = count - rank;
        pos  -= (diff & 256) >> shift;
        rank -= (count & (diff >> 8));
    };

    count = (sums16 >> 32) + (sums16 >> 48);
    narrow(3);                                  // step of 32
    count = (sums16 >> (pos - 16)) & 0xff;
    narrow(4);                                  // step of 16
    count = (sums8 >> (pos - 8)) & 0xf;
    narrow(5);                                  // step of 8
    count = (sums4 >> (pos - 4)) & 0x7;
    narrow(6);                                  // step of 4
    count = (sums2 >> (pos - 2)) & 0x3;
    narrow(7);                                  // step of 2
    count = (word >> (pos - 1)) & 0x1;
    pos -= ((count - rank) & 256) >> 8;         // final step of 1

    return pos - 1;
#endif
}
| 191 | 
 | 
|---|
| 192 | // Barrier from
 | 
|---|
| 193 | class barrier_t {
 | 
|---|
| 194 | public:
 | 
|---|
| 195 |         barrier_t(size_t total)
 | 
|---|
| 196 |                 : waiting(0)
 | 
|---|
| 197 |                 , total(total)
 | 
|---|
| 198 |         {}
 | 
|---|
| 199 | 
 | 
|---|
| 200 |         void wait(unsigned) {
 | 
|---|
| 201 |                 size_t target = waiting++;
 | 
|---|
| 202 |                 target = (target - (target % total)) + total;
 | 
|---|
| 203 |                 while(waiting < target)
 | 
|---|
| 204 |                         Pause();
 | 
|---|
| 205 | 
 | 
|---|
| 206 |                 assert(waiting < (1ul << 60));
 | 
|---|
| 207 |         }
 | 
|---|
| 208 | 
 | 
|---|
| 209 | private:
 | 
|---|
| 210 |         std::atomic<size_t> waiting;
 | 
|---|
| 211 |         size_t total;
 | 
|---|
| 212 | };
 | 
|---|
| 213 | 
 | 
|---|
| 214 | struct spinlock_t {
 | 
|---|
| 215 |         std::atomic_bool ll = { false };
 | 
|---|
| 216 | 
 | 
|---|
| 217 |         inline void lock() {
 | 
|---|
| 218 |                 while( __builtin_expect(ll.exchange(true),false) ) {
 | 
|---|
| 219 |                         while(ll.load(std::memory_order_relaxed))
 | 
|---|
| 220 |                                 Pause();
 | 
|---|
| 221 |                 }
 | 
|---|
| 222 |         }
 | 
|---|
| 223 | 
 | 
|---|
| 224 |         inline bool try_lock() {
 | 
|---|
| 225 |                 return false == ll.exchange(true);
 | 
|---|
| 226 |         }
 | 
|---|
| 227 | 
 | 
|---|
| 228 |         inline void unlock() {
 | 
|---|
| 229 |                 ll.store(false, std::memory_order_release);
 | 
|---|
| 230 |         }
 | 
|---|
| 231 | 
 | 
|---|
| 232 |         inline explicit operator bool() {
 | 
|---|
| 233 |                 return ll.load(std::memory_order_relaxed);
 | 
|---|
| 234 |         }
 | 
|---|
| 235 | };
 | 
|---|
| 236 | 
 | 
|---|
/**
 * Atomically sets bit `bit` of `target` and reports its previous value.
 *
 * @param target word to modify.
 * @param bit    index of the bit to set; expected to be below the width of
 *               size_t (the instruction form would otherwise address
 *               neighbouring memory, and the shift in the fallback would be
 *               undefined).
 * @return true if the bit was already set.
 *
 * On x86-64 this is a single `lock bts`. The word is declared as a
 * read-write ("+m") operand with a "memory" clobber so the compiler knows
 * the instruction modifies it. Other architectures use an equivalent
 * sequentially consistent fetch_or.
 */
static inline bool bts(std::atomic_size_t & target, size_t bit ) {
#if defined( __x86_64 ) || defined( __x86_64__ )
    int result = 0;
    asm volatile(
        "LOCK btsq %[bit], %[target]\n\t"
        : "=@ccc" (result), [target] "+m" (target)   // carry flag = old bit value
        : [bit] "r" (bit)
        : "memory"
    );
    return result != 0;
#else
    const size_t mask = static_cast<size_t>(1) << bit;
    const size_t previous = target.fetch_or(mask, std::memory_order_seq_cst);
    return (previous & mask) != 0;
#endif
}
| 252 | 
 | 
|---|
/**
 * Atomically clears bit `bit` of `target` and reports its previous value.
 *
 * @param target word to modify.
 * @param bit    index of the bit to clear; expected to be below the width
 *               of size_t (the instruction form would otherwise address
 *               neighbouring memory, and the shift in the fallback would be
 *               undefined).
 * @return true if the bit was set before the call.
 *
 * On x86-64 this is a single `lock btr`. The word is declared as a
 * read-write ("+m") operand with a "memory" clobber so the compiler knows
 * the instruction modifies it. Other architectures use an equivalent
 * sequentially consistent fetch_and.
 */
static inline bool btr(std::atomic_size_t & target, size_t bit ) {
#if defined( __x86_64 ) || defined( __x86_64__ )
    int result = 0;
    asm volatile(
        "LOCK btrq %[bit], %[target]\n\t"
        : "=@ccc" (result), [target] "+m" (target)   // carry flag = old bit value
        : [bit] "r" (bit)
        : "memory"
    );
    return result != 0;
#else
    const size_t mask = static_cast<size_t>(1) << bit;
    const size_t previous = target.fetch_and(~mask, std::memory_order_seq_cst);
    return (previous & mask) != 0;
#endif
}