Index: doc/theses/thierry_delisle_PhD/code/utils.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/utils.hpp	(revision 9421f3d8aa85bb55de754037f302d2e96c392242)
+++ doc/theses/thierry_delisle_PhD/code/utils.hpp	(revision 5c9b20cdbd8f11c2e3b1817c4e65a16c5f1e9a65)
@@ -58,5 +58,5 @@
 }
 
-void affinity(int tid) {
+static inline void affinity(int tid) {
 	static int cpus = get_nprocs();
 
@@ -72,5 +72,5 @@
 
 static const constexpr std::size_t cache_line_size = 64;
-void check_cache_line_size() {
+static inline void check_cache_line_size() {
 	std::cout << "Checking cache line size" << std::endl;
 	const std::string cache_file = "/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size";
@@ -106,26 +106,9 @@
 }
 
-// from geeksforgeeks
-const constexpr unsigned num_to_bits[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
-
-/* Recursively get nibble of a given number
-and map them in the array */
-__attribute__((noinline)) unsigned countSetBits(size_t num) {
-	unsigned nibble = 0;
-	if (0 == num)
-		return num_to_bits[0];
-
-	// Find last nibble
-	nibble = num & 0xf;
-
-	// Use pre-stored values to find count
-	// in last nibble plus recursively add
-	// remaining nibbles.
-	return num_to_bits[nibble] + countSetBits(num >> 4);
-}
-
-__attribute__((noinline)) unsigned nthSetBit(size_t mask, unsigned bit) {
+static inline unsigned rand_bit(unsigned rnum, size_t mask) {
+	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
+#if !defined(__BMI2__)
 	uint64_t v = mask;   // Input value to find position with rank r.
-	unsigned int r = bit;// Input: bit's desired rank [1-64].
+	unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
 	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
 	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
@@ -134,11 +117,7 @@
 	// Do a normal parallel bit count for a 64-bit integer,
 	// but store all intermediate steps.
-	// a = (v & 0x5555...) + ((v >> 1) & 0x5555...);
 	a =  v - ((v >> 1) & ~0UL/3);
-	// b = (a & 0x3333...) + ((a >> 2) & 0x3333...);
 	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
-	// c = (b & 0x0f0f...) + ((b >> 4) & 0x0f0f...);
 	c = (b + (b >> 4)) & ~0UL/0x11;
-	// d = (c & 0x00ff...) + ((c >> 8) & 0x00ff...);
 	d = (c + (c >> 8)) & ~0UL/0x101;
 
@@ -147,63 +126,19 @@
 	// Now do branchless select!
 	s  = 64;
-	// if (r > t) {s -= 32; r -= t;}
 	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
 	t  = (d >> (s - 16)) & 0xff;
-	// if (r > t) {s -= 16; r -= t;}
 	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
 	t  = (c >> (s - 8)) & 0xf;
-	// if (r > t) {s -= 8; r -= t;}
 	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
 	t  = (b >> (s - 4)) & 0x7;
-	// if (r > t) {s -= 4; r -= t;}
 	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
 	t  = (a >> (s - 2)) & 0x3;
-	// if (r > t) {s -= 2; r -= t;}
 	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
 	t  = (v >> (s - 1)) & 0x1;
-	// if (r > t) s--;
 	s -= ((t - r) & 256) >> 8;
-	s = 65 - s;
-	return s;
+	return s - 1;
+#else
+	uint64_t picked = _pdep_u64(1ul << bit, mask);
+	return picked ? __builtin_ctzl(picked) : 0;
+#endif
 }
-
-// __attribute__((noinline)) unsigned int nth_bit_set(uint64_t value, unsigned int n)
-// {
-// 	uint64_t      mask = 0x00000000FFFFFFFFul;
-// 	unsigned int  size = 32u;
-// 	unsigned int  base = 0u;
-
-// 	if(value == 0) return 0;
-// 	assertf(n < (unsigned)__builtin_popcountl(value), "%u < %d (%zu)", n, __builtin_popcountl(value), value);
-// 	n++;
-// 	while (size > 0) {
-// 		const unsigned int  count = __builtin_popcountl(value & mask);
-// 		if (n > count) {
-// 			base += size;
-// 			size >>= 1;
-// 			mask |= mask << size;
-// 		} else {
-// 			size >>= 1;
-// 			mask >>= size;
-// 		}
-//     }
-
-//     return base;
-// }
-
-
-static inline uint64_t rotl64 (uint64_t n, unsigned int c) {
-	const unsigned int mask = (8*sizeof(n) - 1);  // assumes width is a power of 2.
-
-	c &= mask;
-	return (n<<c) | (n>>( (-c)&mask ));
-}
-
-static inline uint64_t rotr64 (uint64_t n, unsigned int c)
-{
-	const unsigned int mask = (8*sizeof(n) - 1);
-
-	// assert ( (c<=mask) &&"rotate by type width or more");
-	c &= mask;
-	return (n>>c) | (n<<( (-c)&mask ));
-}
