1 | |
---|
2 | #include "../utils.hpp" |
---|
3 | |
---|
4 | void consume(int i, int j) __attribute__((noinline)); |
---|
5 | void consume(int i, int j) { |
---|
6 | asm volatile("":: "rm" (i), "rm" (i) ); |
---|
7 | } |
---|
8 | |
---|
9 | static inline unsigned rand_bit_sw(unsigned rnum, size_t mask) { |
---|
10 | unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0; |
---|
11 | uint64_t v = mask; // Input value to find position with rank r. |
---|
12 | unsigned int r = bit + 1;// Input: bit's desired rank [1-64]. |
---|
13 | unsigned int s; // Output: Resulting position of bit with rank r [1-64] |
---|
14 | uint64_t a, b, c, d; // Intermediate temporaries for bit count. |
---|
15 | unsigned int t; // Bit count temporary. |
---|
16 | |
---|
17 | // Do a normal parallel bit count for a 64-bit integer, |
---|
18 | // but store all intermediate steps. |
---|
19 | a = v - ((v >> 1) & ~0UL/3); |
---|
20 | b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5); |
---|
21 | c = (b + (b >> 4)) & ~0UL/0x11; |
---|
22 | d = (c + (c >> 8)) & ~0UL/0x101; |
---|
23 | |
---|
24 | |
---|
25 | t = (d >> 32) + (d >> 48); |
---|
26 | // Now do branchless select! |
---|
27 | s = 64; |
---|
28 | s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8)); |
---|
29 | t = (d >> (s - 16)) & 0xff; |
---|
30 | s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8)); |
---|
31 | t = (c >> (s - 8)) & 0xf; |
---|
32 | s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8)); |
---|
33 | t = (b >> (s - 4)) & 0x7; |
---|
34 | s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8)); |
---|
35 | t = (a >> (s - 2)) & 0x3; |
---|
36 | s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8)); |
---|
37 | t = (v >> (s - 1)) & 0x1; |
---|
38 | s -= ((t - r) & 256) >> 8; |
---|
39 | return s - 1; |
---|
40 | } |
---|
41 | |
---|
42 | static inline unsigned rand_bit_hw(unsigned rnum, size_t mask) { |
---|
43 | unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0; |
---|
44 | uint64_t picked = _pdep_u64(1ul << bit, mask); |
---|
45 | return picked ? __builtin_ctzl(picked) : 0; |
---|
46 | } |
---|
47 | |
---|
48 | struct TLS { |
---|
49 | Random rng = { 6 }; |
---|
50 | } tls; |
---|
51 | |
---|
52 | const unsigned numLists = 64; |
---|
53 | |
---|
54 | static inline void blind() { |
---|
55 | int i = tls.rng.next() % numLists; |
---|
56 | int j = tls.rng.next() % numLists; |
---|
57 | |
---|
58 | consume(i, j); |
---|
59 | } |
---|
60 | |
---|
61 | std::atomic_size_t list_mask[7]; |
---|
62 | static inline void bitmask_sw() { |
---|
63 | unsigned i, j; |
---|
64 | { |
---|
65 | // Pick two lists at random |
---|
66 | unsigned num = ((numLists - 1) >> 6) + 1; |
---|
67 | |
---|
68 | unsigned ri = tls.rng.next(); |
---|
69 | unsigned rj = tls.rng.next(); |
---|
70 | |
---|
71 | unsigned wdxi = (ri >> 6u) % num; |
---|
72 | unsigned wdxj = (rj >> 6u) % num; |
---|
73 | |
---|
74 | size_t maski = list_mask[wdxi].load(std::memory_order_relaxed); |
---|
75 | size_t maskj = list_mask[wdxj].load(std::memory_order_relaxed); |
---|
76 | |
---|
77 | unsigned bi = rand_bit_sw(ri, maski); |
---|
78 | unsigned bj = rand_bit_sw(rj, maskj); |
---|
79 | |
---|
80 | i = bi | (wdxi << 6); |
---|
81 | j = bj | (wdxj << 6); |
---|
82 | } |
---|
83 | |
---|
84 | consume(i, j); |
---|
85 | } |
---|
86 | |
---|
87 | static inline void bitmask_hw() { |
---|
88 | #if !defined(__BMI2__) |
---|
89 | #warning NO bmi2 for pdep rand_bit |
---|
90 | return; |
---|
91 | #endif |
---|
92 | unsigned i, j; |
---|
93 | { |
---|
94 | // Pick two lists at random |
---|
95 | unsigned num = ((numLists - 1) >> 6) + 1; |
---|
96 | |
---|
97 | unsigned ri = tls.rng.next(); |
---|
98 | unsigned rj = tls.rng.next(); |
---|
99 | |
---|
100 | unsigned wdxi = (ri >> 6u) % num; |
---|
101 | unsigned wdxj = (rj >> 6u) % num; |
---|
102 | |
---|
103 | size_t maski = list_mask[wdxi].load(std::memory_order_relaxed); |
---|
104 | size_t maskj = list_mask[wdxj].load(std::memory_order_relaxed); |
---|
105 | |
---|
106 | unsigned bi = rand_bit_hw(ri, maski); |
---|
107 | unsigned bj = rand_bit_hw(rj, maskj); |
---|
108 | |
---|
109 | i = bi | (wdxi << 6); |
---|
110 | j = bj | (wdxj << 6); |
---|
111 | } |
---|
112 | |
---|
113 | consume(i, j); |
---|
114 | } |
---|
115 | |
---|
116 | struct { |
---|
117 | const unsigned mask = 7; |
---|
118 | const unsigned depth = 3; |
---|
119 | const uint64_t indexes = 0x0706050403020100; |
---|
120 | uint64_t masks( unsigned node ) { |
---|
121 | return 0xff00ffff00ff; |
---|
122 | } |
---|
123 | } snzm; |
---|
124 | static inline void sparsemask() { |
---|
125 | #if !defined(__BMI2__) |
---|
126 | #warning NO bmi2 for sparse mask |
---|
127 | return; |
---|
128 | #endif |
---|
129 | unsigned i, j; |
---|
130 | { |
---|
131 | // Pick two random number |
---|
132 | unsigned ri = tls.rng.next(); |
---|
133 | unsigned rj = tls.rng.next(); |
---|
134 | |
---|
135 | // Pick two nodes from it |
---|
136 | unsigned wdxi = ri & snzm.mask; |
---|
137 | unsigned wdxj = rj & snzm.mask; |
---|
138 | |
---|
139 | // Get the masks from the nodes |
---|
140 | size_t maski = snzm.masks(wdxi); |
---|
141 | size_t maskj = snzm.masks(wdxj); |
---|
142 | |
---|
143 | uint64_t idxsi = _pext_u64(snzm.indexes, maski); |
---|
144 | uint64_t idxsj = _pext_u64(snzm.indexes, maskj); |
---|
145 | |
---|
146 | auto pi = __builtin_popcountll(maski); |
---|
147 | auto pj = __builtin_popcountll(maskj); |
---|
148 | |
---|
149 | ri = pi ? ri & ((pi >> 3) - 1) : 0; |
---|
150 | rj = pj ? rj & ((pj >> 3) - 1) : 0; |
---|
151 | |
---|
152 | unsigned bi = (idxsi >> (ri << 3)) & 0xff; |
---|
153 | unsigned bj = (idxsj >> (rj << 3)) & 0xff; |
---|
154 | |
---|
155 | i = (bi << snzm.depth) | wdxi; |
---|
156 | j = (bj << snzm.depth) | wdxj; |
---|
157 | } |
---|
158 | |
---|
159 | consume(i, j); |
---|
160 | } |
---|
161 | |
---|
162 | template<typename T> |
---|
163 | void benchmark( T func, const std::string & name ) { |
---|
164 | std::cout << "Starting " << name << std::endl; |
---|
165 | auto before = Clock::now(); |
---|
166 | const int N = 250'000'000; |
---|
167 | for(int i = 0; i < N; i++) { |
---|
168 | func(); |
---|
169 | } |
---|
170 | auto after = Clock::now(); |
---|
171 | duration_t durr = after - before; |
---|
172 | double duration = durr.count(); |
---|
173 | std::cout << "Duration(s) : " << duration << std::endl; |
---|
174 | std::cout << "Ops/sec : " << uint64_t(N / duration) << std::endl; |
---|
175 | std::cout << "ns/Op : " << double(duration * 1'000'000'000.0 / N) << std::endl; |
---|
176 | std::cout << std::endl; |
---|
177 | } |
---|
178 | |
---|
179 | int main() { |
---|
180 | std::cout.imbue(std::locale("")); |
---|
181 | |
---|
182 | benchmark(blind, "Blind guess"); |
---|
183 | benchmark(bitmask_sw, "Dense bitmask"); |
---|
184 | benchmark(bitmask_hw, "Dense bitmask with Parallel Deposit"); |
---|
185 | benchmark(sparsemask, "Parallel Extract bitmask"); |
---|
186 | } |
---|