| 1 | #include <cstddef>
 | 
|---|
| 2 | #include <cstdint>
 | 
|---|
| 3 | #include <x86intrin.h>
 | 
|---|
| 4 | 
 | 
|---|
| 5 | __attribute__((noinline)) unsigned nthSetBit(size_t mask, unsigned bit) {
 | 
|---|
| 6 |         uint64_t v = mask;   // Input value to find position with rank r.
 | 
|---|
| 7 |         unsigned int r = bit;// Input: bit's desired rank [1-64].
 | 
|---|
| 8 |         unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
 | 
|---|
| 9 |         uint64_t a, b, c, d; // Intermediate temporaries for bit count.
 | 
|---|
| 10 |         unsigned int t;      // Bit count temporary.
 | 
|---|
| 11 | 
 | 
|---|
| 12 |         // Do a normal parallel bit count for a 64-bit integer,
 | 
|---|
| 13 |         // but store all intermediate steps.
 | 
|---|
| 14 |         // a = (v & 0x5555...) + ((v >> 1) & 0x5555...);
 | 
|---|
| 15 |         a =  v - ((v >> 1) & ~0UL/3);
 | 
|---|
| 16 |         // b = (a & 0x3333...) + ((a >> 2) & 0x3333...);
 | 
|---|
| 17 |         b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
 | 
|---|
| 18 |         // c = (b & 0x0f0f...) + ((b >> 4) & 0x0f0f...);
 | 
|---|
| 19 |         c = (b + (b >> 4)) & ~0UL/0x11;
 | 
|---|
| 20 |         // d = (c & 0x00ff...) + ((c >> 8) & 0x00ff...);
 | 
|---|
| 21 |         d = (c + (c >> 8)) & ~0UL/0x101;
 | 
|---|
| 22 | 
 | 
|---|
| 23 | 
 | 
|---|
| 24 |         t = (d >> 32) + (d >> 48);
 | 
|---|
| 25 |         // Now do branchless select!
 | 
|---|
| 26 |         s  = 64;
 | 
|---|
| 27 |         // if (r > t) {s -= 32; r -= t;}
 | 
|---|
| 28 |         s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
 | 
|---|
| 29 |         t  = (d >> (s - 16)) & 0xff;
 | 
|---|
| 30 |         // if (r > t) {s -= 16; r -= t;}
 | 
|---|
| 31 |         s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
 | 
|---|
| 32 |         t  = (c >> (s - 8)) & 0xf;
 | 
|---|
| 33 |         // if (r > t) {s -= 8; r -= t;}
 | 
|---|
| 34 |         s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
 | 
|---|
| 35 |         t  = (b >> (s - 4)) & 0x7;
 | 
|---|
| 36 |         // if (r > t) {s -= 4; r -= t;}
 | 
|---|
| 37 |         s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
 | 
|---|
| 38 |         t  = (a >> (s - 2)) & 0x3;
 | 
|---|
| 39 |         // if (r > t) {s -= 2; r -= t;}
 | 
|---|
| 40 |         s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
 | 
|---|
| 41 |         t  = (v >> (s - 1)) & 0x1;
 | 
|---|
| 42 |         // if (r > t) s--;
 | 
|---|
| 43 |         s -= ((t - r) & 256) >> 8;
 | 
|---|
| 44 |         // s = 65 - s;
 | 
|---|
| 45 |         return s;
 | 
|---|
| 46 | }
 | 
|---|
| 47 | 
 | 
|---|
| 48 | unsigned rand_bit(unsigned rnum, uint64_t mask) {
 | 
|---|
| 49 |         unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
 | 
|---|
| 50 | #if defined(BRANCHLESS)
 | 
|---|
| 51 |         uint64_t v = mask;   // Input value to find position with rank r.
 | 
|---|
| 52 |         unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
 | 
|---|
| 53 |         unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
 | 
|---|
| 54 |         uint64_t a, b, c, d; // Intermediate temporaries for bit count.
 | 
|---|
| 55 |         unsigned int t;      // Bit count temporary.
 | 
|---|
| 56 | 
 | 
|---|
| 57 |         // Do a normal parallel bit count for a 64-bit integer,
 | 
|---|
| 58 |         // but store all intermediate steps.
 | 
|---|
| 59 |         // a = (v & 0x5555...) + ((v >> 1) & 0x5555...);
 | 
|---|
| 60 |         a =  v - ((v >> 1) & ~0UL/3);
 | 
|---|
| 61 |         // b = (a & 0x3333...) + ((a >> 2) & 0x3333...);
 | 
|---|
| 62 |         b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
 | 
|---|
| 63 |         // c = (b & 0x0f0f...) + ((b >> 4) & 0x0f0f...);
 | 
|---|
| 64 |         c = (b + (b >> 4)) & ~0UL/0x11;
 | 
|---|
| 65 |         // d = (c & 0x00ff...) + ((c >> 8) & 0x00ff...);
 | 
|---|
| 66 |         d = (c + (c >> 8)) & ~0UL/0x101;
 | 
|---|
| 67 | 
 | 
|---|
| 68 | 
 | 
|---|
| 69 |         t = (d >> 32) + (d >> 48);
 | 
|---|
| 70 |         // Now do branchless select!
 | 
|---|
| 71 |         s  = 64;
 | 
|---|
| 72 |         // if (r > t) {s -= 32; r -= t;}
 | 
|---|
| 73 |         s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
 | 
|---|
| 74 |         t  = (d >> (s - 16)) & 0xff;
 | 
|---|
| 75 |         // if (r > t) {s -= 16; r -= t;}
 | 
|---|
| 76 |         s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
 | 
|---|
| 77 |         t  = (c >> (s - 8)) & 0xf;
 | 
|---|
| 78 |         // if (r > t) {s -= 8; r -= t;}
 | 
|---|
| 79 |         s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
 | 
|---|
| 80 |         t  = (b >> (s - 4)) & 0x7;
 | 
|---|
| 81 |         // if (r > t) {s -= 4; r -= t;}
 | 
|---|
| 82 |         s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
 | 
|---|
| 83 |         t  = (a >> (s - 2)) & 0x3;
 | 
|---|
| 84 |         // if (r > t) {s -= 2; r -= t;}
 | 
|---|
| 85 |         s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
 | 
|---|
| 86 |         t  = (v >> (s - 1)) & 0x1;
 | 
|---|
| 87 |         // if (r > t) s--;
 | 
|---|
| 88 |         s -= ((t - r) & 256) >> 8;
 | 
|---|
| 89 |         // s = 65 - s;
 | 
|---|
| 90 |         return s - 1;
 | 
|---|
| 91 | #elif defined(LOOP)
 | 
|---|
| 92 |         for(unsigned i = 0; i < bit; i++) {
 | 
|---|
| 93 |                 mask ^= (1ul << (__builtin_ffsl(mask) - 1ul));
 | 
|---|
| 94 |         }
 | 
|---|
| 95 |         return __builtin_ffsl(mask) - 1ul;
 | 
|---|
| 96 | #elif defined(PDEP)
 | 
|---|
| 97 |         uint64_t picked = _pdep_u64(1ul << bit, mask);
 | 
|---|
| 98 |         return __builtin_ffsl(picked) - 1ul;
 | 
|---|
| 99 | #else
 | 
|---|
| 100 | #error must define LOOP, PDEP or BRANCHLESS
 | 
|---|
| 101 | #endif
 | 
|---|
| 102 | }
 | 
|---|
| 103 | 
 | 
|---|
| 104 | #include <cassert>
 | 
|---|
| 105 | #include <atomic>
 | 
|---|
| 106 | #include <chrono>
 | 
|---|
| 107 | #include <iomanip>
 | 
|---|
| 108 | #include <iostream>
 | 
|---|
| 109 | #include <locale>
 | 
|---|
| 110 | #include <thread>
 | 
|---|
| 111 | 
 | 
|---|
| 112 | #include <unistd.h>
 | 
|---|
| 113 | 
 | 
|---|
| 114 | class barrier_t {
 | 
|---|
| 115 | public:
 | 
|---|
| 116 |         barrier_t(size_t total)
 | 
|---|
| 117 |                 : waiting(0)
 | 
|---|
| 118 |                 , total(total)
 | 
|---|
| 119 |         {}
 | 
|---|
| 120 | 
 | 
|---|
| 121 |         void wait(unsigned) {
 | 
|---|
| 122 |                 size_t target = waiting++;
 | 
|---|
| 123 |                 target = (target - (target % total)) + total;
 | 
|---|
| 124 |                 while(waiting < target)
 | 
|---|
| 125 |                         asm volatile("pause");
 | 
|---|
| 126 | 
 | 
|---|
| 127 |                 assert(waiting < (1ul << 60));
 | 
|---|
| 128 |         }
 | 
|---|
| 129 | 
 | 
|---|
| 130 | private:
 | 
|---|
| 131 |         std::atomic<size_t> waiting;
 | 
|---|
| 132 |         size_t total;
 | 
|---|
| 133 | };
 | 
|---|
| 134 | 
 | 
|---|
| 135 | class Random {
 | 
|---|
| 136 | private:
 | 
|---|
| 137 |         unsigned int seed;
 | 
|---|
| 138 | public:
 | 
|---|
| 139 |         Random(int seed) {
 | 
|---|
| 140 |                 this->seed = seed;
 | 
|---|
| 141 |         }
 | 
|---|
| 142 | 
 | 
|---|
| 143 |         /** returns pseudorandom x satisfying 0 <= x < n. **/
 | 
|---|
| 144 |         unsigned int next() {
 | 
|---|
| 145 |                 seed ^= seed << 6;
 | 
|---|
| 146 |                 seed ^= seed >> 21;
 | 
|---|
| 147 |                 seed ^= seed << 7;
 | 
|---|
| 148 |                 return seed;
 | 
|---|
| 149 |         }
 | 
|---|
| 150 | };
 | 
|---|
| 151 | 
 | 
|---|
| 152 | using Clock = std::chrono::high_resolution_clock;
 | 
|---|
| 153 | using duration_t = std::chrono::duration<double>;
 | 
|---|
| 154 | using std::chrono::nanoseconds;
 | 
|---|
| 155 | 
 | 
|---|
| 156 | template<typename Ratio, typename T>
 | 
|---|
| 157 | T duration_cast(T seconds) {
 | 
|---|
| 158 |         return std::chrono::duration_cast<std::chrono::duration<T, Ratio>>(std::chrono::duration<T>(seconds)).count();
 | 
|---|
| 159 | }
 | 
|---|
| 160 | 
 | 
|---|
| 161 | void waitfor(double & duration, barrier_t & barrier, std::atomic_bool & done) {
 | 
|---|
| 162 | 
 | 
|---|
| 163 | 
 | 
|---|
| 164 |         std::cout << "Starting" << std::endl;
 | 
|---|
| 165 |         auto before = Clock::now();
 | 
|---|
| 166 |         barrier.wait(0);
 | 
|---|
| 167 | 
 | 
|---|
| 168 |         while(true) {
 | 
|---|
| 169 |                 usleep(100000);
 | 
|---|
| 170 |                 auto now = Clock::now();
 | 
|---|
| 171 |                 duration_t durr = now - before;
 | 
|---|
| 172 |                 if( durr.count() > duration ) {
 | 
|---|
| 173 |                         done = true;
 | 
|---|
| 174 |                         break;
 | 
|---|
| 175 |                 }
 | 
|---|
| 176 |                 std::cout << "\r" << std::setprecision(4) << durr.count();
 | 
|---|
| 177 |                 std::cout.flush();
 | 
|---|
| 178 |         }
 | 
|---|
| 179 | 
 | 
|---|
| 180 |         barrier.wait(0);
 | 
|---|
| 181 |         auto after = Clock::now();
 | 
|---|
| 182 |         duration_t durr = after - before;
 | 
|---|
| 183 |         duration = durr.count();
 | 
|---|
| 184 |         std::cout << "\rClosing down" << std::endl;
 | 
|---|
| 185 | }
 | 
|---|
| 186 | 
 | 
|---|
| 187 | __attribute__((noinline)) void body(Random & rand) {
 | 
|---|
| 188 |         uint64_t mask = (uint64_t(rand.next()) << 32ul) | uint64_t(rand.next());
 | 
|---|
| 189 |         unsigned idx = rand.next();
 | 
|---|
| 190 | 
 | 
|---|
| 191 |         unsigned bit = rand_bit(idx, mask);
 | 
|---|
| 192 | 
 | 
|---|
| 193 |         if(__builtin_expect(((1ul << bit) & mask) == 0, false)) {
 | 
|---|
| 194 |                 std::cerr << std::hex <<  "Rand " << idx << " from " << mask;
 | 
|---|
| 195 |                 std::cerr << " gave " << (1ul << bit) << "(" << std::dec << bit << ")" << std::endl;
 | 
|---|
| 196 |                 std::abort();
 | 
|---|
| 197 |         }
 | 
|---|
| 198 | }
 | 
|---|
| 199 | 
 | 
|---|
| 200 | void runRandBit(double duration) {
 | 
|---|
| 201 | 
 | 
|---|
| 202 |         std::atomic_bool done  = { false };
 | 
|---|
| 203 |         barrier_t barrier(2);
 | 
|---|
| 204 | 
 | 
|---|
| 205 |         size_t count = 0;
 | 
|---|
| 206 |         std::thread thread([&done, &barrier, &count]() {
 | 
|---|
| 207 | 
 | 
|---|
| 208 |                 Random rand(22);
 | 
|---|
| 209 | 
 | 
|---|
| 210 |                 barrier.wait(1);
 | 
|---|
| 211 | 
 | 
|---|
| 212 |                 for(;!done; count++) {
 | 
|---|
| 213 |                         body(rand);
 | 
|---|
| 214 |                 }
 | 
|---|
| 215 | 
 | 
|---|
| 216 |                 barrier.wait(1);
 | 
|---|
| 217 |         });
 | 
|---|
| 218 | 
 | 
|---|
| 219 |         waitfor(duration, barrier, done);
 | 
|---|
| 220 |         thread.join();
 | 
|---|
| 221 | 
 | 
|---|
| 222 |         size_t ops = count;
 | 
|---|
| 223 |         size_t ops_sec = size_t(double(ops) / duration);
 | 
|---|
| 224 |         auto dur_nano = duration_cast<std::nano>(1.0);
 | 
|---|
| 225 | 
 | 
|---|
| 226 |         std::cout << "Duration      : " << duration << "s\n";
 | 
|---|
| 227 |         std::cout << "ns/Op         : " << ( dur_nano / ops )<< "\n";
 | 
|---|
| 228 |         std::cout << "Ops/sec       : " << ops_sec << "\n";
 | 
|---|
| 229 |         std::cout << "Total ops     : " << ops << std::endl;
 | 
|---|
| 230 | 
 | 
|---|
| 231 | }
 | 
|---|
| 232 | 
 | 
|---|
| 233 | int main() {
 | 
|---|
| 234 |         std::cout.imbue(std::locale(""));
 | 
|---|
| 235 |         runRandBit(5);
 | 
|---|
| 236 | }
 | 
|---|