1 |
|
---|
2 | #include "../utils.hpp"
|
---|
3 |
|
---|
4 | void consume(int i, int j) __attribute__((noinline));
|
---|
5 | void consume(int i, int j) {
|
---|
6 | asm volatile("":: "rm" (i), "rm" (i) );
|
---|
7 | }
|
---|
8 |
|
---|
9 | static inline unsigned rand_bit_sw(unsigned rnum, size_t mask) {
|
---|
10 | unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
|
---|
11 | uint64_t v = mask; // Input value to find position with rank r.
|
---|
12 | unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
|
---|
13 | unsigned int s; // Output: Resulting position of bit with rank r [1-64]
|
---|
14 | uint64_t a, b, c, d; // Intermediate temporaries for bit count.
|
---|
15 | unsigned int t; // Bit count temporary.
|
---|
16 |
|
---|
17 | // Do a normal parallel bit count for a 64-bit integer,
|
---|
18 | // but store all intermediate steps.
|
---|
19 | a = v - ((v >> 1) & ~0UL/3);
|
---|
20 | b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
|
---|
21 | c = (b + (b >> 4)) & ~0UL/0x11;
|
---|
22 | d = (c + (c >> 8)) & ~0UL/0x101;
|
---|
23 |
|
---|
24 |
|
---|
25 | t = (d >> 32) + (d >> 48);
|
---|
26 | // Now do branchless select!
|
---|
27 | s = 64;
|
---|
28 | s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
|
---|
29 | t = (d >> (s - 16)) & 0xff;
|
---|
30 | s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
|
---|
31 | t = (c >> (s - 8)) & 0xf;
|
---|
32 | s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
|
---|
33 | t = (b >> (s - 4)) & 0x7;
|
---|
34 | s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
|
---|
35 | t = (a >> (s - 2)) & 0x3;
|
---|
36 | s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
|
---|
37 | t = (v >> (s - 1)) & 0x1;
|
---|
38 | s -= ((t - r) & 256) >> 8;
|
---|
39 | return s - 1;
|
---|
40 | }
|
---|
41 |
|
---|
42 | static inline unsigned rand_bit_hw(unsigned rnum, size_t mask) {
|
---|
43 | unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
|
---|
44 | uint64_t picked = _pdep_u64(1ul << bit, mask);
|
---|
45 | return picked ? __builtin_ctzl(picked) : 0;
|
---|
46 | }
|
---|
47 |
|
---|
48 | struct TLS {
|
---|
49 | Random rng = { 6 };
|
---|
50 | } tls;
|
---|
51 |
|
---|
52 | const unsigned numLists = 64;
|
---|
53 |
|
---|
54 | static inline void blind() {
|
---|
55 | int i = tls.rng.next() % numLists;
|
---|
56 | int j = tls.rng.next() % numLists;
|
---|
57 |
|
---|
58 | consume(i, j);
|
---|
59 | }
|
---|
60 |
|
---|
61 | std::atomic_size_t list_mask[7];
|
---|
62 | static inline void bitmask_sw() {
|
---|
63 | unsigned i, j;
|
---|
64 | {
|
---|
65 | // Pick two lists at random
|
---|
66 | unsigned num = ((numLists - 1) >> 6) + 1;
|
---|
67 |
|
---|
68 | unsigned ri = tls.rng.next();
|
---|
69 | unsigned rj = tls.rng.next();
|
---|
70 |
|
---|
71 | unsigned wdxi = (ri >> 6u) % num;
|
---|
72 | unsigned wdxj = (rj >> 6u) % num;
|
---|
73 |
|
---|
74 | size_t maski = list_mask[wdxi].load(std::memory_order_relaxed);
|
---|
75 | size_t maskj = list_mask[wdxj].load(std::memory_order_relaxed);
|
---|
76 |
|
---|
77 | unsigned bi = rand_bit_sw(ri, maski);
|
---|
78 | unsigned bj = rand_bit_sw(rj, maskj);
|
---|
79 |
|
---|
80 | i = bi | (wdxi << 6);
|
---|
81 | j = bj | (wdxj << 6);
|
---|
82 | }
|
---|
83 |
|
---|
84 | consume(i, j);
|
---|
85 | }
|
---|
86 |
|
---|
87 | static inline void bitmask_hw() {
|
---|
88 | #if !defined(__BMI2__)
|
---|
89 | #warning NO bmi2 for pdep rand_bit
|
---|
90 | return;
|
---|
91 | #endif
|
---|
92 | unsigned i, j;
|
---|
93 | {
|
---|
94 | // Pick two lists at random
|
---|
95 | unsigned num = ((numLists - 1) >> 6) + 1;
|
---|
96 |
|
---|
97 | unsigned ri = tls.rng.next();
|
---|
98 | unsigned rj = tls.rng.next();
|
---|
99 |
|
---|
100 | unsigned wdxi = (ri >> 6u) % num;
|
---|
101 | unsigned wdxj = (rj >> 6u) % num;
|
---|
102 |
|
---|
103 | size_t maski = list_mask[wdxi].load(std::memory_order_relaxed);
|
---|
104 | size_t maskj = list_mask[wdxj].load(std::memory_order_relaxed);
|
---|
105 |
|
---|
106 | unsigned bi = rand_bit_hw(ri, maski);
|
---|
107 | unsigned bj = rand_bit_hw(rj, maskj);
|
---|
108 |
|
---|
109 | i = bi | (wdxi << 6);
|
---|
110 | j = bj | (wdxj << 6);
|
---|
111 | }
|
---|
112 |
|
---|
113 | consume(i, j);
|
---|
114 | }
|
---|
115 |
|
---|
116 | struct {
|
---|
117 | const unsigned mask = 7;
|
---|
118 | const unsigned depth = 3;
|
---|
119 | const uint64_t indexes = 0x0706050403020100;
|
---|
120 | uint64_t masks( unsigned node ) {
|
---|
121 | return 0xff00ffff00ff;
|
---|
122 | }
|
---|
123 | } snzm;
|
---|
124 | static inline void sparsemask() {
|
---|
125 | #if !defined(__BMI2__)
|
---|
126 | #warning NO bmi2 for sparse mask
|
---|
127 | return;
|
---|
128 | #endif
|
---|
129 | unsigned i, j;
|
---|
130 | {
|
---|
131 | // Pick two random number
|
---|
132 | unsigned ri = tls.rng.next();
|
---|
133 | unsigned rj = tls.rng.next();
|
---|
134 |
|
---|
135 | // Pick two nodes from it
|
---|
136 | unsigned wdxi = ri & snzm.mask;
|
---|
137 | unsigned wdxj = rj & snzm.mask;
|
---|
138 |
|
---|
139 | // Get the masks from the nodes
|
---|
140 | size_t maski = snzm.masks(wdxi);
|
---|
141 | size_t maskj = snzm.masks(wdxj);
|
---|
142 |
|
---|
143 | uint64_t idxsi = _pext_u64(snzm.indexes, maski);
|
---|
144 | uint64_t idxsj = _pext_u64(snzm.indexes, maskj);
|
---|
145 |
|
---|
146 | auto pi = __builtin_popcountll(maski);
|
---|
147 | auto pj = __builtin_popcountll(maskj);
|
---|
148 |
|
---|
149 | ri = pi ? ri & ((pi >> 3) - 1) : 0;
|
---|
150 | rj = pj ? rj & ((pj >> 3) - 1) : 0;
|
---|
151 |
|
---|
152 | unsigned bi = (idxsi >> (ri << 3)) & 0xff;
|
---|
153 | unsigned bj = (idxsj >> (rj << 3)) & 0xff;
|
---|
154 |
|
---|
155 | i = (bi << snzm.depth) | wdxi;
|
---|
156 | j = (bj << snzm.depth) | wdxj;
|
---|
157 | }
|
---|
158 |
|
---|
159 | consume(i, j);
|
---|
160 | }
|
---|
161 |
|
---|
162 | template<typename T>
|
---|
163 | void benchmark( T func, const std::string & name ) {
|
---|
164 | std::cout << "Starting " << name << std::endl;
|
---|
165 | auto before = Clock::now();
|
---|
166 | const int N = 250'000'000;
|
---|
167 | for(int i = 0; i < N; i++) {
|
---|
168 | func();
|
---|
169 | }
|
---|
170 | auto after = Clock::now();
|
---|
171 | duration_t durr = after - before;
|
---|
172 | double duration = durr.count();
|
---|
173 | std::cout << "Duration(s) : " << duration << std::endl;
|
---|
174 | std::cout << "Ops/sec : " << uint64_t(N / duration) << std::endl;
|
---|
175 | std::cout << "ns/Op : " << double(duration * 1'000'000'000.0 / N) << std::endl;
|
---|
176 | std::cout << std::endl;
|
---|
177 | }
|
---|
178 |
|
---|
179 | int main() {
|
---|
180 | std::cout.imbue(std::locale(""));
|
---|
181 |
|
---|
182 | benchmark(blind, "Blind guess");
|
---|
183 | benchmark(bitmask_sw, "Dense bitmask");
|
---|
184 | benchmark(bitmask_hw, "Dense bitmask with Parallel Deposit");
|
---|
185 | benchmark(sparsemask, "Parallel Extract bitmask");
|
---|
186 | }
|
---|