| [edb2fe0] | 1 |
|
|---|
| 2 | #include "../utils.hpp"
|
|---|
| 3 |
|
|---|
| 4 | void consume(int i, int j) __attribute__((noinline));
|
|---|
| 5 | void consume(int i, int j) {
|
|---|
| 6 | asm volatile("":: "rm" (i), "rm" (i) );
|
|---|
| 7 | }
|
|---|
| 8 |
|
|---|
| 9 | static inline unsigned rand_bit_sw(unsigned rnum, size_t mask) {
|
|---|
| 10 | unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
|
|---|
| 11 | uint64_t v = mask; // Input value to find position with rank r.
|
|---|
| 12 | unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
|
|---|
| 13 | unsigned int s; // Output: Resulting position of bit with rank r [1-64]
|
|---|
| 14 | uint64_t a, b, c, d; // Intermediate temporaries for bit count.
|
|---|
| 15 | unsigned int t; // Bit count temporary.
|
|---|
| 16 |
|
|---|
| 17 | // Do a normal parallel bit count for a 64-bit integer,
|
|---|
| 18 | // but store all intermediate steps.
|
|---|
| 19 | a = v - ((v >> 1) & ~0UL/3);
|
|---|
| 20 | b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
|
|---|
| 21 | c = (b + (b >> 4)) & ~0UL/0x11;
|
|---|
| 22 | d = (c + (c >> 8)) & ~0UL/0x101;
|
|---|
| 23 |
|
|---|
| 24 |
|
|---|
| 25 | t = (d >> 32) + (d >> 48);
|
|---|
| 26 | // Now do branchless select!
|
|---|
| 27 | s = 64;
|
|---|
| 28 | s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
|
|---|
| 29 | t = (d >> (s - 16)) & 0xff;
|
|---|
| 30 | s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
|
|---|
| 31 | t = (c >> (s - 8)) & 0xf;
|
|---|
| 32 | s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
|
|---|
| 33 | t = (b >> (s - 4)) & 0x7;
|
|---|
| 34 | s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
|
|---|
| 35 | t = (a >> (s - 2)) & 0x3;
|
|---|
| 36 | s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
|
|---|
| 37 | t = (v >> (s - 1)) & 0x1;
|
|---|
| 38 | s -= ((t - r) & 256) >> 8;
|
|---|
| 39 | return s - 1;
|
|---|
| 40 | }
|
|---|
| 41 |
|
|---|
| 42 | static inline unsigned rand_bit_hw(unsigned rnum, size_t mask) {
|
|---|
| 43 | unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
|
|---|
| 44 | uint64_t picked = _pdep_u64(1ul << bit, mask);
|
|---|
| 45 | return picked ? __builtin_ctzl(picked) : 0;
|
|---|
| 46 | }
|
|---|
| 47 |
|
|---|
| 48 | struct TLS {
|
|---|
| 49 | Random rng = { 6 };
|
|---|
| 50 | } tls;
|
|---|
| 51 |
|
|---|
| 52 | const unsigned numLists = 64;
|
|---|
| 53 |
|
|---|
| 54 | static inline void blind() {
|
|---|
| 55 | int i = tls.rng.next() % numLists;
|
|---|
| 56 | int j = tls.rng.next() % numLists;
|
|---|
| 57 |
|
|---|
| 58 | consume(i, j);
|
|---|
| 59 | }
|
|---|
| 60 |
|
|---|
| 61 | std::atomic_size_t list_mask[7];
|
|---|
| 62 | static inline void bitmask_sw() {
|
|---|
| 63 | unsigned i, j;
|
|---|
| 64 | {
|
|---|
| 65 | // Pick two lists at random
|
|---|
| 66 | unsigned num = ((numLists - 1) >> 6) + 1;
|
|---|
| 67 |
|
|---|
| 68 | unsigned ri = tls.rng.next();
|
|---|
| 69 | unsigned rj = tls.rng.next();
|
|---|
| 70 |
|
|---|
| 71 | unsigned wdxi = (ri >> 6u) % num;
|
|---|
| 72 | unsigned wdxj = (rj >> 6u) % num;
|
|---|
| 73 |
|
|---|
| 74 | size_t maski = list_mask[wdxi].load(std::memory_order_relaxed);
|
|---|
| 75 | size_t maskj = list_mask[wdxj].load(std::memory_order_relaxed);
|
|---|
| 76 |
|
|---|
| 77 | unsigned bi = rand_bit_sw(ri, maski);
|
|---|
| 78 | unsigned bj = rand_bit_sw(rj, maskj);
|
|---|
| 79 |
|
|---|
| 80 | i = bi | (wdxi << 6);
|
|---|
| 81 | j = bj | (wdxj << 6);
|
|---|
| 82 | }
|
|---|
| 83 |
|
|---|
| 84 | consume(i, j);
|
|---|
| 85 | }
|
|---|
| 86 |
|
|---|
| 87 | static inline void bitmask_hw() {
|
|---|
| 88 | #if !defined(__BMI2__)
|
|---|
| 89 | #warning NO bmi2 for pdep rand_bit
|
|---|
| 90 | return;
|
|---|
| 91 | #endif
|
|---|
| 92 | unsigned i, j;
|
|---|
| 93 | {
|
|---|
| 94 | // Pick two lists at random
|
|---|
| 95 | unsigned num = ((numLists - 1) >> 6) + 1;
|
|---|
| 96 |
|
|---|
| 97 | unsigned ri = tls.rng.next();
|
|---|
| 98 | unsigned rj = tls.rng.next();
|
|---|
| 99 |
|
|---|
| 100 | unsigned wdxi = (ri >> 6u) % num;
|
|---|
| 101 | unsigned wdxj = (rj >> 6u) % num;
|
|---|
| 102 |
|
|---|
| 103 | size_t maski = list_mask[wdxi].load(std::memory_order_relaxed);
|
|---|
| 104 | size_t maskj = list_mask[wdxj].load(std::memory_order_relaxed);
|
|---|
| 105 |
|
|---|
| 106 | unsigned bi = rand_bit_hw(ri, maski);
|
|---|
| 107 | unsigned bj = rand_bit_hw(rj, maskj);
|
|---|
| 108 |
|
|---|
| 109 | i = bi | (wdxi << 6);
|
|---|
| 110 | j = bj | (wdxj << 6);
|
|---|
| 111 | }
|
|---|
| 112 |
|
|---|
| 113 | consume(i, j);
|
|---|
| 114 | }
|
|---|
| 115 |
|
|---|
| 116 | struct {
|
|---|
| 117 | const unsigned mask = 7;
|
|---|
| 118 | const unsigned depth = 3;
|
|---|
| 119 | const uint64_t indexes = 0x0706050403020100;
|
|---|
| 120 | uint64_t masks( unsigned node ) {
|
|---|
| 121 | return 0xff00ffff00ff;
|
|---|
| 122 | }
|
|---|
| 123 | } snzm;
|
|---|
| 124 | static inline void sparsemask() {
|
|---|
| 125 | #if !defined(__BMI2__)
|
|---|
| 126 | #warning NO bmi2 for sparse mask
|
|---|
| 127 | return;
|
|---|
| 128 | #endif
|
|---|
| 129 | unsigned i, j;
|
|---|
| 130 | {
|
|---|
| 131 | // Pick two random number
|
|---|
| 132 | unsigned ri = tls.rng.next();
|
|---|
| 133 | unsigned rj = tls.rng.next();
|
|---|
| 134 |
|
|---|
| 135 | // Pick two nodes from it
|
|---|
| 136 | unsigned wdxi = ri & snzm.mask;
|
|---|
| 137 | unsigned wdxj = rj & snzm.mask;
|
|---|
| 138 |
|
|---|
| 139 | // Get the masks from the nodes
|
|---|
| 140 | size_t maski = snzm.masks(wdxi);
|
|---|
| 141 | size_t maskj = snzm.masks(wdxj);
|
|---|
| 142 |
|
|---|
| 143 | uint64_t idxsi = _pext_u64(snzm.indexes, maski);
|
|---|
| 144 | uint64_t idxsj = _pext_u64(snzm.indexes, maskj);
|
|---|
| 145 |
|
|---|
| 146 | auto pi = __builtin_popcountll(maski);
|
|---|
| 147 | auto pj = __builtin_popcountll(maskj);
|
|---|
| 148 |
|
|---|
| 149 | ri = pi ? ri & ((pi >> 3) - 1) : 0;
|
|---|
| 150 | rj = pj ? rj & ((pj >> 3) - 1) : 0;
|
|---|
| 151 |
|
|---|
| 152 | unsigned bi = (idxsi >> (ri << 3)) & 0xff;
|
|---|
| 153 | unsigned bj = (idxsj >> (rj << 3)) & 0xff;
|
|---|
| 154 |
|
|---|
| 155 | i = (bi << snzm.depth) | wdxi;
|
|---|
| 156 | j = (bj << snzm.depth) | wdxj;
|
|---|
| 157 | }
|
|---|
| 158 |
|
|---|
| 159 | consume(i, j);
|
|---|
| 160 | }
|
|---|
| 161 |
|
|---|
| 162 | template<typename T>
|
|---|
| 163 | void benchmark( T func, const std::string & name ) {
|
|---|
| 164 | std::cout << "Starting " << name << std::endl;
|
|---|
| 165 | auto before = Clock::now();
|
|---|
| 166 | const int N = 250'000'000;
|
|---|
| 167 | for(int i = 0; i < N; i++) {
|
|---|
| 168 | func();
|
|---|
| 169 | }
|
|---|
| 170 | auto after = Clock::now();
|
|---|
| 171 | duration_t durr = after - before;
|
|---|
| 172 | double duration = durr.count();
|
|---|
| 173 | std::cout << "Duration(s) : " << duration << std::endl;
|
|---|
| 174 | std::cout << "Ops/sec : " << uint64_t(N / duration) << std::endl;
|
|---|
| 175 | std::cout << "ns/Op : " << double(duration * 1'000'000'000.0 / N) << std::endl;
|
|---|
| 176 | std::cout << std::endl;
|
|---|
| 177 | }
|
|---|
| 178 |
|
|---|
| 179 | int main() {
|
|---|
| 180 | std::cout.imbue(std::locale(""));
|
|---|
| 181 |
|
|---|
| 182 | benchmark(blind, "Blind guess");
|
|---|
| 183 | benchmark(bitmask_sw, "Dense bitmask");
|
|---|
| 184 | benchmark(bitmask_hw, "Dense bitmask with Parallel Deposit");
|
|---|
| 185 | benchmark(sparsemask, "Parallel Extract bitmask");
|
|---|
| 186 | }
|
|---|