[edb2fe0] | 1 | |
---|
| 2 | #include "../utils.hpp" |
---|
| 3 | |
---|
| 4 | void consume(int i, int j) __attribute__((noinline)); |
---|
| 5 | void consume(int i, int j) { |
---|
| 6 | asm volatile("":: "rm" (i), "rm" (i) ); |
---|
| 7 | } |
---|
| 8 | |
---|
| 9 | static inline unsigned rand_bit_sw(unsigned rnum, size_t mask) { |
---|
| 10 | unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0; |
---|
| 11 | uint64_t v = mask; // Input value to find position with rank r. |
---|
| 12 | unsigned int r = bit + 1;// Input: bit's desired rank [1-64]. |
---|
| 13 | unsigned int s; // Output: Resulting position of bit with rank r [1-64] |
---|
| 14 | uint64_t a, b, c, d; // Intermediate temporaries for bit count. |
---|
| 15 | unsigned int t; // Bit count temporary. |
---|
| 16 | |
---|
| 17 | // Do a normal parallel bit count for a 64-bit integer, |
---|
| 18 | // but store all intermediate steps. |
---|
| 19 | a = v - ((v >> 1) & ~0UL/3); |
---|
| 20 | b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5); |
---|
| 21 | c = (b + (b >> 4)) & ~0UL/0x11; |
---|
| 22 | d = (c + (c >> 8)) & ~0UL/0x101; |
---|
| 23 | |
---|
| 24 | |
---|
| 25 | t = (d >> 32) + (d >> 48); |
---|
| 26 | // Now do branchless select! |
---|
| 27 | s = 64; |
---|
| 28 | s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8)); |
---|
| 29 | t = (d >> (s - 16)) & 0xff; |
---|
| 30 | s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8)); |
---|
| 31 | t = (c >> (s - 8)) & 0xf; |
---|
| 32 | s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8)); |
---|
| 33 | t = (b >> (s - 4)) & 0x7; |
---|
| 34 | s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8)); |
---|
| 35 | t = (a >> (s - 2)) & 0x3; |
---|
| 36 | s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8)); |
---|
| 37 | t = (v >> (s - 1)) & 0x1; |
---|
| 38 | s -= ((t - r) & 256) >> 8; |
---|
| 39 | return s - 1; |
---|
| 40 | } |
---|
| 41 | |
---|
| 42 | static inline unsigned rand_bit_hw(unsigned rnum, size_t mask) { |
---|
| 43 | unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0; |
---|
| 44 | uint64_t picked = _pdep_u64(1ul << bit, mask); |
---|
| 45 | return picked ? __builtin_ctzl(picked) : 0; |
---|
| 46 | } |
---|
| 47 | |
---|
| 48 | struct TLS { |
---|
| 49 | Random rng = { 6 }; |
---|
| 50 | } tls; |
---|
| 51 | |
---|
| 52 | const unsigned numLists = 64; |
---|
| 53 | |
---|
| 54 | static inline void blind() { |
---|
| 55 | int i = tls.rng.next() % numLists; |
---|
| 56 | int j = tls.rng.next() % numLists; |
---|
| 57 | |
---|
| 58 | consume(i, j); |
---|
| 59 | } |
---|
| 60 | |
---|
| 61 | std::atomic_size_t list_mask[7]; |
---|
| 62 | static inline void bitmask_sw() { |
---|
| 63 | unsigned i, j; |
---|
| 64 | { |
---|
| 65 | // Pick two lists at random |
---|
| 66 | unsigned num = ((numLists - 1) >> 6) + 1; |
---|
| 67 | |
---|
| 68 | unsigned ri = tls.rng.next(); |
---|
| 69 | unsigned rj = tls.rng.next(); |
---|
| 70 | |
---|
| 71 | unsigned wdxi = (ri >> 6u) % num; |
---|
| 72 | unsigned wdxj = (rj >> 6u) % num; |
---|
| 73 | |
---|
| 74 | size_t maski = list_mask[wdxi].load(std::memory_order_relaxed); |
---|
| 75 | size_t maskj = list_mask[wdxj].load(std::memory_order_relaxed); |
---|
| 76 | |
---|
| 77 | unsigned bi = rand_bit_sw(ri, maski); |
---|
| 78 | unsigned bj = rand_bit_sw(rj, maskj); |
---|
| 79 | |
---|
| 80 | i = bi | (wdxi << 6); |
---|
| 81 | j = bj | (wdxj << 6); |
---|
| 82 | } |
---|
| 83 | |
---|
| 84 | consume(i, j); |
---|
| 85 | } |
---|
| 86 | |
---|
| 87 | static inline void bitmask_hw() { |
---|
| 88 | #if !defined(__BMI2__) |
---|
| 89 | #warning NO bmi2 for pdep rand_bit |
---|
| 90 | return; |
---|
| 91 | #endif |
---|
| 92 | unsigned i, j; |
---|
| 93 | { |
---|
| 94 | // Pick two lists at random |
---|
| 95 | unsigned num = ((numLists - 1) >> 6) + 1; |
---|
| 96 | |
---|
| 97 | unsigned ri = tls.rng.next(); |
---|
| 98 | unsigned rj = tls.rng.next(); |
---|
| 99 | |
---|
| 100 | unsigned wdxi = (ri >> 6u) % num; |
---|
| 101 | unsigned wdxj = (rj >> 6u) % num; |
---|
| 102 | |
---|
| 103 | size_t maski = list_mask[wdxi].load(std::memory_order_relaxed); |
---|
| 104 | size_t maskj = list_mask[wdxj].load(std::memory_order_relaxed); |
---|
| 105 | |
---|
| 106 | unsigned bi = rand_bit_hw(ri, maski); |
---|
| 107 | unsigned bj = rand_bit_hw(rj, maskj); |
---|
| 108 | |
---|
| 109 | i = bi | (wdxi << 6); |
---|
| 110 | j = bj | (wdxj << 6); |
---|
| 111 | } |
---|
| 112 | |
---|
| 113 | consume(i, j); |
---|
| 114 | } |
---|
| 115 | |
---|
| 116 | struct { |
---|
| 117 | const unsigned mask = 7; |
---|
| 118 | const unsigned depth = 3; |
---|
| 119 | const uint64_t indexes = 0x0706050403020100; |
---|
| 120 | uint64_t masks( unsigned node ) { |
---|
| 121 | return 0xff00ffff00ff; |
---|
| 122 | } |
---|
| 123 | } snzm; |
---|
| 124 | static inline void sparsemask() { |
---|
| 125 | #if !defined(__BMI2__) |
---|
| 126 | #warning NO bmi2 for sparse mask |
---|
| 127 | return; |
---|
| 128 | #endif |
---|
| 129 | unsigned i, j; |
---|
| 130 | { |
---|
| 131 | // Pick two random number |
---|
| 132 | unsigned ri = tls.rng.next(); |
---|
| 133 | unsigned rj = tls.rng.next(); |
---|
| 134 | |
---|
| 135 | // Pick two nodes from it |
---|
| 136 | unsigned wdxi = ri & snzm.mask; |
---|
| 137 | unsigned wdxj = rj & snzm.mask; |
---|
| 138 | |
---|
| 139 | // Get the masks from the nodes |
---|
| 140 | size_t maski = snzm.masks(wdxi); |
---|
| 141 | size_t maskj = snzm.masks(wdxj); |
---|
| 142 | |
---|
| 143 | uint64_t idxsi = _pext_u64(snzm.indexes, maski); |
---|
| 144 | uint64_t idxsj = _pext_u64(snzm.indexes, maskj); |
---|
| 145 | |
---|
| 146 | auto pi = __builtin_popcountll(maski); |
---|
| 147 | auto pj = __builtin_popcountll(maskj); |
---|
| 148 | |
---|
| 149 | ri = pi ? ri & ((pi >> 3) - 1) : 0; |
---|
| 150 | rj = pj ? rj & ((pj >> 3) - 1) : 0; |
---|
| 151 | |
---|
| 152 | unsigned bi = (idxsi >> (ri << 3)) & 0xff; |
---|
| 153 | unsigned bj = (idxsj >> (rj << 3)) & 0xff; |
---|
| 154 | |
---|
| 155 | i = (bi << snzm.depth) | wdxi; |
---|
| 156 | j = (bj << snzm.depth) | wdxj; |
---|
| 157 | } |
---|
| 158 | |
---|
| 159 | consume(i, j); |
---|
| 160 | } |
---|
| 161 | |
---|
| 162 | template<typename T> |
---|
| 163 | void benchmark( T func, const std::string & name ) { |
---|
| 164 | std::cout << "Starting " << name << std::endl; |
---|
| 165 | auto before = Clock::now(); |
---|
| 166 | const int N = 250'000'000; |
---|
| 167 | for(int i = 0; i < N; i++) { |
---|
| 168 | func(); |
---|
| 169 | } |
---|
| 170 | auto after = Clock::now(); |
---|
| 171 | duration_t durr = after - before; |
---|
| 172 | double duration = durr.count(); |
---|
| 173 | std::cout << "Duration(s) : " << duration << std::endl; |
---|
| 174 | std::cout << "Ops/sec : " << uint64_t(N / duration) << std::endl; |
---|
| 175 | std::cout << "ns/Op : " << double(duration * 1'000'000'000.0 / N) << std::endl; |
---|
| 176 | std::cout << std::endl; |
---|
| 177 | } |
---|
| 178 | |
---|
| 179 | int main() { |
---|
| 180 | std::cout.imbue(std::locale("")); |
---|
| 181 | |
---|
| 182 | benchmark(blind, "Blind guess"); |
---|
| 183 | benchmark(bitmask_sw, "Dense bitmask"); |
---|
| 184 | benchmark(bitmask_hw, "Dense bitmask with Parallel Deposit"); |
---|
| 185 | benchmark(sparsemask, "Parallel Extract bitmask"); |
---|
| 186 | } |
---|