source: doc/theses/thierry_delisle_PhD/code/readyQ_proto/bitbench/select.cpp@ bb7422a

ADT ast-experimental
Last change on this file since bb7422a was f9f3775, checked in by Thierry Delisle <tdelisle@…>, 5 years ago

Moved phd code for the readQ prototype to it's own folder

  • Property mode set to 100644
File size: 4.7 KB
Line 
1
2#include "../utils.hpp"
3
4void consume(int i, int j) __attribute__((noinline));
5void consume(int i, int j) {
6 asm volatile("":: "rm" (i), "rm" (i) );
7}
8
9static inline unsigned rand_bit_sw(unsigned rnum, size_t mask) {
10 unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
11 uint64_t v = mask; // Input value to find position with rank r.
12 unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
13 unsigned int s; // Output: Resulting position of bit with rank r [1-64]
14 uint64_t a, b, c, d; // Intermediate temporaries for bit count.
15 unsigned int t; // Bit count temporary.
16
17 // Do a normal parallel bit count for a 64-bit integer,
18 // but store all intermediate steps.
19 a = v - ((v >> 1) & ~0UL/3);
20 b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
21 c = (b + (b >> 4)) & ~0UL/0x11;
22 d = (c + (c >> 8)) & ~0UL/0x101;
23
24
25 t = (d >> 32) + (d >> 48);
26 // Now do branchless select!
27 s = 64;
28 s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
29 t = (d >> (s - 16)) & 0xff;
30 s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
31 t = (c >> (s - 8)) & 0xf;
32 s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
33 t = (b >> (s - 4)) & 0x7;
34 s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
35 t = (a >> (s - 2)) & 0x3;
36 s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
37 t = (v >> (s - 1)) & 0x1;
38 s -= ((t - r) & 256) >> 8;
39 return s - 1;
40}
41
42static inline unsigned rand_bit_hw(unsigned rnum, size_t mask) {
43 unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
44 uint64_t picked = _pdep_u64(1ul << bit, mask);
45 return picked ? __builtin_ctzl(picked) : 0;
46}
47
48struct TLS {
49 Random rng = { 6 };
50} tls;
51
52const unsigned numLists = 64;
53
54static inline void blind() {
55 int i = tls.rng.next() % numLists;
56 int j = tls.rng.next() % numLists;
57
58 consume(i, j);
59}
60
61std::atomic_size_t list_mask[7];
62static inline void bitmask_sw() {
63 unsigned i, j;
64 {
65 // Pick two lists at random
66 unsigned num = ((numLists - 1) >> 6) + 1;
67
68 unsigned ri = tls.rng.next();
69 unsigned rj = tls.rng.next();
70
71 unsigned wdxi = (ri >> 6u) % num;
72 unsigned wdxj = (rj >> 6u) % num;
73
74 size_t maski = list_mask[wdxi].load(std::memory_order_relaxed);
75 size_t maskj = list_mask[wdxj].load(std::memory_order_relaxed);
76
77 unsigned bi = rand_bit_sw(ri, maski);
78 unsigned bj = rand_bit_sw(rj, maskj);
79
80 i = bi | (wdxi << 6);
81 j = bj | (wdxj << 6);
82 }
83
84 consume(i, j);
85}
86
87static inline void bitmask_hw() {
88 #if !defined(__BMI2__)
89 #warning NO bmi2 for pdep rand_bit
90 return;
91 #endif
92 unsigned i, j;
93 {
94 // Pick two lists at random
95 unsigned num = ((numLists - 1) >> 6) + 1;
96
97 unsigned ri = tls.rng.next();
98 unsigned rj = tls.rng.next();
99
100 unsigned wdxi = (ri >> 6u) % num;
101 unsigned wdxj = (rj >> 6u) % num;
102
103 size_t maski = list_mask[wdxi].load(std::memory_order_relaxed);
104 size_t maskj = list_mask[wdxj].load(std::memory_order_relaxed);
105
106 unsigned bi = rand_bit_hw(ri, maski);
107 unsigned bj = rand_bit_hw(rj, maskj);
108
109 i = bi | (wdxi << 6);
110 j = bj | (wdxj << 6);
111 }
112
113 consume(i, j);
114}
115
116struct {
117 const unsigned mask = 7;
118 const unsigned depth = 3;
119 const uint64_t indexes = 0x0706050403020100;
120 uint64_t masks( unsigned node ) {
121 return 0xff00ffff00ff;
122 }
123} snzm;
124static inline void sparsemask() {
125 #if !defined(__BMI2__)
126 #warning NO bmi2 for sparse mask
127 return;
128 #endif
129 unsigned i, j;
130 {
131 // Pick two random number
132 unsigned ri = tls.rng.next();
133 unsigned rj = tls.rng.next();
134
135 // Pick two nodes from it
136 unsigned wdxi = ri & snzm.mask;
137 unsigned wdxj = rj & snzm.mask;
138
139 // Get the masks from the nodes
140 size_t maski = snzm.masks(wdxi);
141 size_t maskj = snzm.masks(wdxj);
142
143 uint64_t idxsi = _pext_u64(snzm.indexes, maski);
144 uint64_t idxsj = _pext_u64(snzm.indexes, maskj);
145
146 auto pi = __builtin_popcountll(maski);
147 auto pj = __builtin_popcountll(maskj);
148
149 ri = pi ? ri & ((pi >> 3) - 1) : 0;
150 rj = pj ? rj & ((pj >> 3) - 1) : 0;
151
152 unsigned bi = (idxsi >> (ri << 3)) & 0xff;
153 unsigned bj = (idxsj >> (rj << 3)) & 0xff;
154
155 i = (bi << snzm.depth) | wdxi;
156 j = (bj << snzm.depth) | wdxj;
157 }
158
159 consume(i, j);
160}
161
162template<typename T>
163void benchmark( T func, const std::string & name ) {
164 std::cout << "Starting " << name << std::endl;
165 auto before = Clock::now();
166 const int N = 250'000'000;
167 for(int i = 0; i < N; i++) {
168 func();
169 }
170 auto after = Clock::now();
171 duration_t durr = after - before;
172 double duration = durr.count();
173 std::cout << "Duration(s) : " << duration << std::endl;
174 std::cout << "Ops/sec : " << uint64_t(N / duration) << std::endl;
175 std::cout << "ns/Op : " << double(duration * 1'000'000'000.0 / N) << std::endl;
176 std::cout << std::endl;
177}
178
179int main() {
180 std::cout.imbue(std::locale(""));
181
182 benchmark(blind, "Blind guess");
183 benchmark(bitmask_sw, "Dense bitmask");
184 benchmark(bitmask_hw, "Dense bitmask with Parallel Deposit");
185 benchmark(sparsemask, "Parallel Extract bitmask");
186}
Note: See TracBrowser for help on using the repository browser.