source: doc/theses/thierry_delisle_PhD/code/readyQ_proto/randbit.cpp@ bb7422a

ADT ast-experimental
Last change on this file since bb7422a was 7f5683e, checked in by Thierry Delisle <tdelisle@…>, 4 years ago

Several prototype fixes for arm

  • Property mode set to 100644
File size: 6.2 KB
Line 
1#include <cstddef>
2#include <cstdint>
3#include <x86intrin.h>
4
5__attribute__((noinline)) unsigned nthSetBit(size_t mask, unsigned bit) {
6 uint64_t v = mask; // Input value to find position with rank r.
7 unsigned int r = bit;// Input: bit's desired rank [1-64].
8 unsigned int s; // Output: Resulting position of bit with rank r [1-64]
9 uint64_t a, b, c, d; // Intermediate temporaries for bit count.
10 unsigned int t; // Bit count temporary.
11
12 // Do a normal parallel bit count for a 64-bit integer,
13 // but store all intermediate steps.
14 // a = (v & 0x5555...) + ((v >> 1) & 0x5555...);
15 a = v - ((v >> 1) & ~0UL/3);
16 // b = (a & 0x3333...) + ((a >> 2) & 0x3333...);
17 b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
18 // c = (b & 0x0f0f...) + ((b >> 4) & 0x0f0f...);
19 c = (b + (b >> 4)) & ~0UL/0x11;
20 // d = (c & 0x00ff...) + ((c >> 8) & 0x00ff...);
21 d = (c + (c >> 8)) & ~0UL/0x101;
22
23
24 t = (d >> 32) + (d >> 48);
25 // Now do branchless select!
26 s = 64;
27 // if (r > t) {s -= 32; r -= t;}
28 s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
29 t = (d >> (s - 16)) & 0xff;
30 // if (r > t) {s -= 16; r -= t;}
31 s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
32 t = (c >> (s - 8)) & 0xf;
33 // if (r > t) {s -= 8; r -= t;}
34 s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
35 t = (b >> (s - 4)) & 0x7;
36 // if (r > t) {s -= 4; r -= t;}
37 s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
38 t = (a >> (s - 2)) & 0x3;
39 // if (r > t) {s -= 2; r -= t;}
40 s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
41 t = (v >> (s - 1)) & 0x1;
42 // if (r > t) s--;
43 s -= ((t - r) & 256) >> 8;
44 // s = 65 - s;
45 return s;
46}
47
48unsigned rand_bit(unsigned rnum, uint64_t mask) {
49 unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
50#if defined(BRANCHLESS)
51 uint64_t v = mask; // Input value to find position with rank r.
52 unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
53 unsigned int s; // Output: Resulting position of bit with rank r [1-64]
54 uint64_t a, b, c, d; // Intermediate temporaries for bit count.
55 unsigned int t; // Bit count temporary.
56
57 // Do a normal parallel bit count for a 64-bit integer,
58 // but store all intermediate steps.
59 // a = (v & 0x5555...) + ((v >> 1) & 0x5555...);
60 a = v - ((v >> 1) & ~0UL/3);
61 // b = (a & 0x3333...) + ((a >> 2) & 0x3333...);
62 b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
63 // c = (b & 0x0f0f...) + ((b >> 4) & 0x0f0f...);
64 c = (b + (b >> 4)) & ~0UL/0x11;
65 // d = (c & 0x00ff...) + ((c >> 8) & 0x00ff...);
66 d = (c + (c >> 8)) & ~0UL/0x101;
67
68
69 t = (d >> 32) + (d >> 48);
70 // Now do branchless select!
71 s = 64;
72 // if (r > t) {s -= 32; r -= t;}
73 s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
74 t = (d >> (s - 16)) & 0xff;
75 // if (r > t) {s -= 16; r -= t;}
76 s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
77 t = (c >> (s - 8)) & 0xf;
78 // if (r > t) {s -= 8; r -= t;}
79 s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
80 t = (b >> (s - 4)) & 0x7;
81 // if (r > t) {s -= 4; r -= t;}
82 s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
83 t = (a >> (s - 2)) & 0x3;
84 // if (r > t) {s -= 2; r -= t;}
85 s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
86 t = (v >> (s - 1)) & 0x1;
87 // if (r > t) s--;
88 s -= ((t - r) & 256) >> 8;
89 // s = 65 - s;
90 return s - 1;
91#elif defined(LOOP)
92 for(unsigned i = 0; i < bit; i++) {
93 mask ^= (1ul << (__builtin_ffsl(mask) - 1ul));
94 }
95 return __builtin_ffsl(mask) - 1ul;
96#elif defined(PDEP)
97 uint64_t picked = _pdep_u64(1ul << bit, mask);
98 return __builtin_ffsl(picked) - 1ul;
99#else
100#error must define LOOP, PDEP or BRANCHLESS
101#endif
102}
103
104#include <cassert>
105#include <atomic>
106#include <chrono>
107#include <iomanip>
108#include <iostream>
109#include <locale>
110#include <thread>
111
112#include <unistd.h>
113
114class barrier_t {
115public:
116 barrier_t(size_t total)
117 : waiting(0)
118 , total(total)
119 {}
120
121 void wait(unsigned) {
122 size_t target = waiting++;
123 target = (target - (target % total)) + total;
124 while(waiting < target)
125 Pause();
126
127 assert(waiting < (1ul << 60));
128 }
129
130private:
131 std::atomic<size_t> waiting;
132 size_t total;
133};
134
135class Random {
136private:
137 unsigned int seed;
138public:
139 Random(int seed) {
140 this->seed = seed;
141 }
142
143 /** returns pseudorandom x satisfying 0 <= x < n. **/
144 unsigned int next() {
145 seed ^= seed << 6;
146 seed ^= seed >> 21;
147 seed ^= seed << 7;
148 return seed;
149 }
150};
151
152using Clock = std::chrono::high_resolution_clock;
153using duration_t = std::chrono::duration<double>;
154using std::chrono::nanoseconds;
155
156template<typename Ratio, typename T>
157T duration_cast(T seconds) {
158 return std::chrono::duration_cast<std::chrono::duration<T, Ratio>>(std::chrono::duration<T>(seconds)).count();
159}
160
161void waitfor(double & duration, barrier_t & barrier, std::atomic_bool & done) {
162
163
164 std::cout << "Starting" << std::endl;
165 auto before = Clock::now();
166 barrier.wait(0);
167
168 while(true) {
169 usleep(100000);
170 auto now = Clock::now();
171 duration_t durr = now - before;
172 if( durr.count() > duration ) {
173 done = true;
174 break;
175 }
176 std::cout << "\r" << std::setprecision(4) << durr.count();
177 std::cout.flush();
178 }
179
180 barrier.wait(0);
181 auto after = Clock::now();
182 duration_t durr = after - before;
183 duration = durr.count();
184 std::cout << "\rClosing down" << std::endl;
185}
186
187__attribute__((noinline)) void body(Random & rand) {
188 uint64_t mask = (uint64_t(rand.next()) << 32ul) | uint64_t(rand.next());
189 unsigned idx = rand.next();
190
191 unsigned bit = rand_bit(idx, mask);
192
193 if(__builtin_expect(((1ul << bit) & mask) == 0, false)) {
194 std::cerr << std::hex << "Rand " << idx << " from " << mask;
195 std::cerr << " gave " << (1ul << bit) << "(" << std::dec << bit << ")" << std::endl;
196 std::abort();
197 }
198}
199
200void runRandBit(double duration) {
201
202 std::atomic_bool done = { false };
203 barrier_t barrier(2);
204
205 size_t count = 0;
206 std::thread thread([&done, &barrier, &count]() {
207
208 Random rand(22);
209
210 barrier.wait(1);
211
212 for(;!done; count++) {
213 body(rand);
214 }
215
216 barrier.wait(1);
217 });
218
219 waitfor(duration, barrier, done);
220 thread.join();
221
222 size_t ops = count;
223 size_t ops_sec = size_t(double(ops) / duration);
224 auto dur_nano = duration_cast<std::nano>(1.0);
225
226 std::cout << "Duration : " << duration << "s\n";
227 std::cout << "ns/Op : " << ( dur_nano / ops )<< "\n";
228 std::cout << "Ops/sec : " << ops_sec << "\n";
229 std::cout << "Total ops : " << ops << std::endl;
230
231}
232
233int main() {
234 std::cout.imbue(std::locale(""));
235 runRandBit(5);
236}
Note: See TracBrowser for help on using the repository browser.