Index: doc/theses/thierry_delisle_PhD/code/Makefile
===================================================================
--- doc/theses/thierry_delisle_PhD/code/Makefile	(revision df75fe974af930420e0dc45e8117b8e9434c224c)
+++ doc/theses/thierry_delisle_PhD/code/Makefile	(revision 40cac90c90ddfdd945a6730e54a6ae73d8629bfc)
@@ -1,12 +1,12 @@
 
 
-CXXFLAGS = -O3 -g -Wall -Wextra -std=c++17 -DNO_STATS
+CXXFLAGS = -O3 -g -Wall -Wextra -std=c++17
 LDFLAGS = -pthread -latomic
 
 push:
-	clang++ relaxed_list.cpp -g -Wall -Wextra -std=c++17 -fsyntax-only &&  rsync -av relaxed_list.cpp relaxed_list.hpp utils.hpp assert.hpp scale.sh plg7a:~/workspace/sched/.
+	clang++ relaxed_list.cpp -g -Wall -Wextra -std=c++17 -fsyntax-only &&  rsync -av relaxed_list.cpp relaxed_list.hpp utils.hpp assert.hpp scale.sh plg7b:~/workspace/sched/.
 
 relaxed_list: $(firstword $(MAKEFILE_LIST)) | build
-	clang++ relaxed_list.cpp $(CXXFLAGS) $(LDFLAGS) -MMD -MF build/$(@).d -o $(@)
+	clang++ relaxed_list.cpp $(CXXFLAGS) $(LDFLAGS) -lpng -MMD -MF build/$(@).d -o $(@)
 
 -include build/relaxed_list.d
Index: doc/theses/thierry_delisle_PhD/code/randbit.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/randbit.cpp	(revision 40cac90c90ddfdd945a6730e54a6ae73d8629bfc)
+++ doc/theses/thierry_delisle_PhD/code/randbit.cpp	(revision 40cac90c90ddfdd945a6730e54a6ae73d8629bfc)
@@ -0,0 +1,236 @@
+#include <cstddef>
+#include <cstdint>
+#include <x86intrin.h>
+
+__attribute__((noinline)) unsigned nthSetBit(size_t mask, unsigned bit) {
+	uint64_t v = mask;   // Input value to find position with rank r.
+	unsigned int r = bit;// Input: bit's desired rank [1-64].
+	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
+	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
+	unsigned int t;      // Bit count temporary.
+
+	// Do a normal parallel bit count for a 64-bit integer,
+	// but store all intermediate steps.
+	// a = (v & 0x5555...) + ((v >> 1) & 0x5555...);
+	a =  v - ((v >> 1) & ~0UL/3);
+	// b = (a & 0x3333...) + ((a >> 2) & 0x3333...);
+	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
+	// c = (b & 0x0f0f...) + ((b >> 4) & 0x0f0f...);
+	c = (b + (b >> 4)) & ~0UL/0x11;
+	// d = (c & 0x00ff...) + ((c >> 8) & 0x00ff...);
+	d = (c + (c >> 8)) & ~0UL/0x101;
+
+
+	t = (d >> 32) + (d >> 48);
+	// Now do branchless select!
+	s  = 64;
+	// if (r > t) {s -= 32; r -= t;}
+	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
+	t  = (d >> (s - 16)) & 0xff;
+	// if (r > t) {s -= 16; r -= t;}
+	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
+	t  = (c >> (s - 8)) & 0xf;
+	// if (r > t) {s -= 8; r -= t;}
+	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
+	t  = (b >> (s - 4)) & 0x7;
+	// if (r > t) {s -= 4; r -= t;}
+	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
+	t  = (a >> (s - 2)) & 0x3;
+	// if (r > t) {s -= 2; r -= t;}
+	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
+	t  = (v >> (s - 1)) & 0x1;
+	// if (r > t) s--;
+	s -= ((t - r) & 256) >> 8;
+	// s = 65 - s;
+	return s;
+}
+
+unsigned rand_bit(unsigned rnum, uint64_t mask) {
+	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
+#if defined(BRANCHLESS)
+	uint64_t v = mask;   // Input value to find position with rank r.
+	unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
+	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
+	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
+	unsigned int t;      // Bit count temporary.
+
+	// Do a normal parallel bit count for a 64-bit integer,
+	// but store all intermediate steps.
+	// a = (v & 0x5555...) + ((v >> 1) & 0x5555...);
+	a =  v - ((v >> 1) & ~0UL/3);
+	// b = (a & 0x3333...) + ((a >> 2) & 0x3333...);
+	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
+	// c = (b & 0x0f0f...) + ((b >> 4) & 0x0f0f...);
+	c = (b + (b >> 4)) & ~0UL/0x11;
+	// d = (c & 0x00ff...) + ((c >> 8) & 0x00ff...);
+	d = (c + (c >> 8)) & ~0UL/0x101;
+
+
+	t = (d >> 32) + (d >> 48);
+	// Now do branchless select!
+	s  = 64;
+	// if (r > t) {s -= 32; r -= t;}
+	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
+	t  = (d >> (s - 16)) & 0xff;
+	// if (r > t) {s -= 16; r -= t;}
+	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
+	t  = (c >> (s - 8)) & 0xf;
+	// if (r > t) {s -= 8; r -= t;}
+	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
+	t  = (b >> (s - 4)) & 0x7;
+	// if (r > t) {s -= 4; r -= t;}
+	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
+	t  = (a >> (s - 2)) & 0x3;
+	// if (r > t) {s -= 2; r -= t;}
+	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
+	t  = (v >> (s - 1)) & 0x1;
+	// if (r > t) s--;
+	s -= ((t - r) & 256) >> 8;
+	// s = 65 - s;
+	return s - 1;
+#elif defined(LOOP)
+	for(unsigned i = 0; i < bit; i++) {
+		mask ^= (1ul << (__builtin_ffsl(mask) - 1ul));
+	}
+	return __builtin_ffsl(mask) - 1ul;
+#elif defined(PDEP)
+	uint64_t picked = _pdep_u64(1ul << bit, mask);
+	return __builtin_ffsl(picked) - 1ul;
+#else
+#error must define LOOP, PDEP or BRANCHLESS
+#endif
+}
+
+#include <cassert>
+#include <atomic>
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <locale>
+#include <thread>
+
+#include <unistd.h>
+
+class barrier_t {
+public:
+	barrier_t(size_t total)
+		: waiting(0)
+		, total(total)
+	{}
+
+	void wait(unsigned) {
+		size_t target = waiting++;
+		target = (target - (target % total)) + total;
+		while(waiting < target)
+			asm volatile("pause");
+
+		assert(waiting < (1ul << 60));
+    	}
+
+private:
+	std::atomic<size_t> waiting;
+	size_t total;
+};
+
+class Random {
+private:
+	unsigned int seed;
+public:
+	Random(int seed) {
+		this->seed = seed;
+	}
+
+	/** returns pseudorandom x satisfying 0 <= x < n. **/
+	unsigned int next() {
+		seed ^= seed << 6;
+		seed ^= seed >> 21;
+		seed ^= seed << 7;
+		return seed;
+    	}
+};
+
+using Clock = std::chrono::high_resolution_clock;
+using duration_t = std::chrono::duration<double>;
+using std::chrono::nanoseconds;
+
+template<typename Ratio, typename T>
+T duration_cast(T seconds) {
+	return std::chrono::duration_cast<std::chrono::duration<T, Ratio>>(std::chrono::duration<T>(seconds)).count();
+}
+
+void waitfor(double & duration, barrier_t & barrier, std::atomic_bool & done) {
+
+
+	std::cout << "Starting" << std::endl;
+	auto before = Clock::now();
+	barrier.wait(0);
+
+	while(true) {
+		usleep(100000);
+		auto now = Clock::now();
+		duration_t durr = now - before;
+		if( durr.count() > duration ) {
+			done = true;
+			break;
+		}
+		std::cout << "\r" << std::setprecision(4) << durr.count();
+		std::cout.flush();
+	}
+
+	barrier.wait(0);
+	auto after = Clock::now();
+	duration_t durr = after - before;
+	duration = durr.count();
+	std::cout << "\rClosing down" << std::endl;
+}
+
+__attribute__((noinline)) void body(Random & rand) {
+	uint64_t mask = (uint64_t(rand.next()) << 32ul) | uint64_t(rand.next());
+	unsigned idx = rand.next();
+
+	unsigned bit = rand_bit(idx, mask);
+
+	if(__builtin_expect(((1ul << bit) & mask) == 0, false)) {
+		std::cerr << std::hex <<  "Rand " << idx << " from " << mask;
+		std::cerr << " gave " << (1ul << bit) << "(" << std::dec << bit << ")" << std::endl;
+		std::abort();
+	}
+}
+
+void runRandBit(double duration) {
+
+	std::atomic_bool done  = { false };
+	barrier_t barrier(2);
+
+	size_t count = 0;
+	std::thread thread([&done, &barrier, &count]() {
+
+		Random rand(22);
+
+		barrier.wait(1);
+
+		for(;!done; count++) {
+			body(rand);
+		}
+
+		barrier.wait(1);
+	});
+
+	waitfor(duration, barrier, done);
+	thread.join();
+
+	size_t ops = count;
+	size_t ops_sec = size_t(double(ops) / duration);
+	auto dur_nano = duration_cast<std::nano>(1.0);
+
+	std::cout << "Duration      : " << duration << "s\n";
+	std::cout << "ns/Op         : " << ( dur_nano / ops )<< "\n";
+	std::cout << "Ops/sec       : " << ops_sec << "\n";
+	std::cout << "Total ops     : " << ops << std::endl;
+
+}
+
+int main() {
+	std::cout.imbue(std::locale(""));
+	runRandBit(5);
+}
Index: doc/theses/thierry_delisle_PhD/code/relaxed_list.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/relaxed_list.cpp	(revision df75fe974af930420e0dc45e8117b8e9434c224c)
+++ doc/theses/thierry_delisle_PhD/code/relaxed_list.cpp	(revision 40cac90c90ddfdd945a6730e54a6ae73d8629bfc)
@@ -22,4 +22,5 @@
 
 	int value;
+	int id;
 
 	Node() { creates++; }
@@ -37,5 +38,10 @@
 
 template<>
-relaxed_list<Node>::intrusive_queue_t::stat::Dif relaxed_list<Node>::intrusive_queue_t::stat::dif = {};
+relaxed_list<Node> * relaxed_list<Node>::head = nullptr;
+
+#ifndef NO_STATS
+template<>
+relaxed_list<Node>::GlobalStats relaxed_list<Node>::global_stats = {};
+#endif
 
 // ================================================================================================
@@ -49,4 +55,6 @@
 	size_t crc_in  = 0;
 	size_t crc_out = 0;
+	size_t valmax = 0;
+	size_t valmin = 100000000ul;
 };
 
@@ -57,28 +65,28 @@
 	std::atomic_size_t crc_in  = { 0 };
 	std::atomic_size_t crc_out = { 0 };
-	struct {
-		struct {
-			std::atomic_size_t attempt = { 0 };
-			std::atomic_size_t success = { 0 };
-		} push;
-		struct {
-			std::atomic_size_t attempt = { 0 };
-			std::atomic_size_t success = { 0 };
-			std::atomic_size_t mask_attempt = { 0 };
-		} pop;
-	} pick;
-	struct {
-		struct {
-			std::atomic_size_t value = { 0 };
-			std::atomic_size_t count = { 0 };
-		} push;
-		struct {
-			std::atomic_size_t value = { 0 };
-			std::atomic_size_t count = { 0 };
-		} pop;
-	} qstat;
+	std::atomic_size_t valmax = { 0 };
+	std::atomic_size_t valmin = { 100000000ul };
 };
 
+void atomic_max(std::atomic_size_t & target, size_t value) {
+	for(;;) {
+		size_t expect = target.load(std::memory_order_relaxed);
+		if(value <= expect) return;
+		bool success = target.compare_exchange_strong(expect, value);
+		if(success) return;
+	}
+}
+
+void atomic_min(std::atomic_size_t & target, size_t value) {
+	for(;;) {
+		size_t expect = target.load(std::memory_order_relaxed);
+		if(value >= expect) return;
+		bool success = target.compare_exchange_strong(expect, value);
+		if(success) return;
+	}
+}
+
 void tally_stats(global_stat_t & global, local_stat_t & local) {
+
 	global.in    += local.in;
 	global.out   += local.out;
@@ -88,14 +96,8 @@
 	global.crc_out += local.crc_out;
 
-	global.pick.push.attempt += relaxed_list<Node>::tls.pick.push.attempt;
-	global.pick.push.success += relaxed_list<Node>::tls.pick.push.success;
-	global.pick.pop .attempt += relaxed_list<Node>::tls.pick.pop.attempt;
-	global.pick.pop .success += relaxed_list<Node>::tls.pick.pop.success;
-	global.pick.pop .mask_attempt += relaxed_list<Node>::tls.pick.pop.mask_attempt;
-
-	global.qstat.push.value += relaxed_list<Node>::tls.empty.push.value;
-	global.qstat.push.count += relaxed_list<Node>::tls.empty.push.count;
-	global.qstat.pop .value += relaxed_list<Node>::tls.empty.pop .value;
-	global.qstat.pop .count += relaxed_list<Node>::tls.empty.pop .count;
+	atomic_max(global.valmax, local.valmax);
+	atomic_min(global.valmin, local.valmin);
+
+	relaxed_list<Node>::stats_tls_tally();
 }
 
@@ -124,4 +126,26 @@
 }
 
+void waitfor(double & duration, barrier_t & barrier, const std::atomic_size_t & count) {
+	std::cout << "Starting" << std::endl;
+	auto before = Clock::now();
+	barrier.wait(0);
+
+	while(true) {
+		usleep(100000);
+		size_t c = count.load();
+		if( c == 0 ) {
+			break;
+		}
+		std::cout << "\r" << c;
+		std::cout.flush();
+	}
+
+	barrier.wait(0);
+	auto after = Clock::now();
+	duration_t durr = after - before;
+	duration = durr.count();
+	std::cout << "\rClosing down" << std::endl;
+}
+
 void print_stats(double duration, unsigned nthread, global_stat_t & global) {
 	assert(Node::creates == Node::destroys);
@@ -140,20 +164,14 @@
 	std::cout << "Ops/sec       : " << ops_sec << "\n";
 	std::cout << "Total ops     : " << ops << "(" << global.in << "i, " << global.out << "o, " << global.empty << "e)\n";
+	if(global.valmax != 0) {
+		std::cout << "Max runs      : " << global.valmax << "\n";
+		std::cout << "Min runs      : " << global.valmin << "\n";
+	}
 	#ifndef NO_STATS
-		double push_sur = (100.0 * double(global.pick.push.success) / global.pick.push.attempt);
-		double pop_sur  = (100.0 * double(global.pick.pop .success) / global.pick.pop .attempt);
-
-		std::cout << "Push Pick %   : " << push_sur << "(" << global.pick.push.success << " / " << global.pick.push.attempt << ")\n";
-		std::cout << "Pop  Pick %   : " << pop_sur  << "(" << global.pick.pop .success << " / " << global.pick.pop .attempt << ")\n";
-		std::cout << "Pop mask trys : " << global.pick.pop.mask_attempt << std::endl;
-
-		double avgQ_push = double(global.qstat.push.value) / global.qstat.push.count;
-		double avgQ_pop  = double(global.qstat.pop .value) / global.qstat.pop .count;
-		double avgQ      = double(global.qstat.push.value + global.qstat.pop .value) / (global.qstat.push.count + global.qstat.pop .count);
-		std::cout << "Push   Avg Qs : " << avgQ_push << " (" << global.qstat.push.count << "ops)\n";
-		std::cout << "Pop    Avg Qs : " << avgQ_pop  << " (" << global.qstat.pop .count << "ops)\n";
-		std::cout << "Global Avg Qs : " << avgQ      << " (" << (global.qstat.push.count + global.qstat.pop .count) << "ops)\n";
+		relaxed_list<Node>::stats_print(std::cout);
 	#endif
 }
+
+void save_fairness(const int data[], int factor, unsigned nthreads, size_t columns, size_t rows, const std::string & output);
 
 // ================================================================================================
@@ -193,5 +211,4 @@
 	assert(nnodes <= nslots);
 	// List being tested
-	relaxed_list<Node> list = { nthread * nqueues };
 
 	// Barrier for synchronization
@@ -207,77 +224,79 @@
 	std::cout << "Initializing ";
 	size_t npushed = 0;
-
-	Node** all_nodes[nthread];
-	for(auto & nodes : all_nodes) {
-		nodes = new __attribute__((aligned(64))) Node*[nslots + 8];
-		Random rand(rdtscl());
-		for(unsigned i = 0; i < nnodes; i++) {
-			nodes[i] = new Node(rand.next() % 100);
-		}
-
-		for(unsigned i = nnodes; i < nslots; i++) {
-			nodes[i] = nullptr;
-		}
-
-		for(int i = 0; i < 10 && i < (int)nslots; i++) {
-			int idx = rand.next() % nslots;
-			if (auto node = nodes[idx]) {
-				global.crc_in += node->value;
-				list.push(node);
-				npushed++;
-				nodes[idx] = nullptr;
+	relaxed_list<Node> list = { nthread * nqueues };
+	{
+		Node** all_nodes[nthread];
+		for(auto & nodes : all_nodes) {
+			nodes = new __attribute__((aligned(64))) Node*[nslots + 8];
+			Random rand(rdtscl());
+			for(unsigned i = 0; i < nnodes; i++) {
+				nodes[i] = new Node(rand.next() % 100);
 			}
-		}
-	}
-
-	std::cout << nnodes << " nodes (" << nslots << " slots)" << std::endl;
-
-	enable_stats = true;
-
-	std::thread * threads[nthread];
-	unsigned i = 1;
-	for(auto & t : threads) {
-		auto & my_nodes = all_nodes[i - 1];
-		t = new std::thread([&done, &list, &barrier, &global, &my_nodes, nslots](unsigned tid) {
-			Random rand(tid + rdtscl());
-
-			local_stat_t local;
-
-			// affinity(tid);
-
-			barrier.wait(tid);
-
-			// EXPERIMENT START
-
-			runChurn_body(done, rand, my_nodes, nslots, local, list);
-
-			// EXPERIMENT END
-
-			barrier.wait(tid);
-
-			tally_stats(global, local);
-
-			for(unsigned i = 0; i < nslots; i++) {
-				delete my_nodes[i];
+
+			for(unsigned i = nnodes; i < nslots; i++) {
+				nodes[i] = nullptr;
 			}
-		}, i++);
-	}
-
-	waitfor(duration, barrier, done);
-
-	for(auto t : threads) {
-		t->join();
-		delete t;
-	}
-
-	enable_stats = false;
-
-	while(auto node = list.pop()) {
-		global.crc_out += node->value;
-		delete node;
-	}
-
-	for(auto nodes : all_nodes) {
-		delete[] nodes;
+
+			for(int i = 0; i < 10 && i < (int)nslots; i++) {
+				int idx = rand.next() % nslots;
+				if (auto node = nodes[idx]) {
+					global.crc_in += node->value;
+					list.push(node);
+					npushed++;
+					nodes[idx] = nullptr;
+				}
+			}
+		}
+
+		std::cout << nnodes << " nodes (" << nslots << " slots)" << std::endl;
+
+		enable_stats = true;
+
+		std::thread * threads[nthread];
+		unsigned i = 1;
+		for(auto & t : threads) {
+			auto & my_nodes = all_nodes[i - 1];
+			t = new std::thread([&done, &list, &barrier, &global, &my_nodes, nslots](unsigned tid) {
+				Random rand(tid + rdtscl());
+
+				local_stat_t local;
+
+				// affinity(tid);
+
+				barrier.wait(tid);
+
+				// EXPERIMENT START
+
+				runChurn_body(done, rand, my_nodes, nslots, local, list);
+
+				// EXPERIMENT END
+
+				barrier.wait(tid);
+
+				tally_stats(global, local);
+
+				for(unsigned i = 0; i < nslots; i++) {
+					delete my_nodes[i];
+				}
+			}, i++);
+		}
+
+		waitfor(duration, barrier, done);
+
+		for(auto t : threads) {
+			t->join();
+			delete t;
+		}
+
+		enable_stats = false;
+
+		while(auto node = list.pop()) {
+			global.crc_out += node->value;
+			delete node;
+		}
+
+		for(auto nodes : all_nodes) {
+			delete[] nodes;
+		}
 	}
 
@@ -323,6 +342,113 @@
 	std::cout << "PingPong Benchmark" << std::endl;
 
+
+	// Barrier for synchronization
+	barrier_t barrier(nthread + 1);
+
+	// Data to check everything is OK
+	global_stat_t global;
+
+	// Flag to signal termination
+	std::atomic_bool done  = { false };
+
+	std::cout << "Initializing ";
 	// List being tested
 	relaxed_list<Node> list = { nthread * nqueues };
+	{
+		enable_stats = true;
+
+		std::thread * threads[nthread];
+		unsigned i = 1;
+		for(auto & t : threads) {
+			t = new std::thread([&done, &list, &barrier, &global, nnodes](unsigned tid) {
+				Random rand(tid + rdtscl());
+
+				Node nodes[nnodes];
+				for(auto & n : nodes) {
+					n.value = (int)rand.next() % 100;
+				}
+
+				local_stat_t local;
+
+				// affinity(tid);
+
+				barrier.wait(tid);
+
+				// EXPERIMENT START
+
+				runPingPong_body(done, nodes, nnodes, local, list);
+
+				// EXPERIMENT END
+
+				barrier.wait(tid);
+
+				tally_stats(global, local);
+			}, i++);
+		}
+
+		waitfor(duration, barrier, done);
+
+		for(auto t : threads) {
+			t->join();
+			delete t;
+		}
+
+		enable_stats = false;
+	}
+
+	print_stats(duration, nthread, global);
+}
+
+// ================================================================================================
+__attribute__((noinline)) void runFairness_body(
+	unsigned tid,
+	size_t width,
+	size_t length,
+	int output[],
+	std::atomic_size_t & count,
+	Node initial_nodes[],
+	unsigned nnodes,
+	local_stat_t & local,
+	relaxed_list<Node> & list
+) {
+	Node * nodes[nnodes];
+	{
+		unsigned i = 0;
+		for(auto & n : nodes) {
+			n = &initial_nodes[i++];
+		}
+	}
+
+	while(__builtin_expect(0 != count.load(std::memory_order_relaxed), true)) {
+
+		for(Node * & node : nodes) {
+			local.crc_in += node->id;
+			list.push(node);
+			local.in++;
+		}
+
+		// -----
+
+		for(Node * & node : nodes) {
+			node = list.pop();
+			assert(node);
+
+			if (unsigned(node->value) < length) {
+				size_t idx = (node->value * width) + node->id;
+				assert(idx < (width * length));
+				output[idx] = tid;
+			}
+
+			node->value++;
+			if(unsigned(node->value) == length) count--;
+
+			local.crc_out += node->id;
+			local.out++;
+		}
+	}
+}
+
+void runFairness(unsigned nthread, unsigned nqueues, double duration, unsigned nnodes, const std::string & output) {
+	std::cout << "Fairness Benchmark, outputing to : " << output << std::endl;
 
 	// Barrier for synchronization
@@ -332,50 +458,71 @@
 	global_stat_t global;
 
+	std::cout << "Initializing ";
+
+	// Check fairness by creating a png of where the threads ran
+	size_t width = nthread * nnodes;
+	size_t length = 100000;
+
+	std::unique_ptr<int[]> data_out { new int[width * length] };
+
 	// Flag to signal termination
-	std::atomic_bool done  = { false };
-
-	std::cout << "Initializing ";
-	enable_stats = true;
-
-	std::thread * threads[nthread];
-	unsigned i = 1;
-	for(auto & t : threads) {
-		t = new std::thread([&done, &list, &barrier, &global, nnodes](unsigned tid) {
-			Random rand(tid + rdtscl());
-
-			Node nodes[nnodes];
-			for(auto & n : nodes) {
-				n.value = (int)rand.next() % 100;
-			}
-
-			local_stat_t local;
-
-			// affinity(tid);
-
-			barrier.wait(tid);
-
-			// EXPERIMENT START
-
-			runPingPong_body(done, nodes, nnodes, local, list);
-
-			// EXPERIMENT END
-
-			barrier.wait(tid);
-
-			tally_stats(global, local);
-		}, i++);
-	}
-
-	waitfor(duration, barrier, done);
-
-	for(auto t : threads) {
-		t->join();
-		delete t;
-	}
-
-	enable_stats = false;
+	std::atomic_size_t count = width;
+
+	// List being tested
+	relaxed_list<Node> list = { nthread * nqueues };
+	{
+		enable_stats = true;
+
+		std::thread * threads[nthread];
+		unsigned i = 1;
+		for(auto & t : threads) {
+			t = new std::thread([&count, &list, &barrier, &global, nnodes, width, length, data_out = data_out.get()](unsigned tid) {
+				unsigned int start = (tid - 1) * nnodes;
+				Node nodes[nnodes];
+				for(auto & n : nodes) {
+					n.id = start;
+					n.value = 0;
+					start++;
+				}
+
+				local_stat_t local;
+
+				// affinity(tid);
+
+				barrier.wait(tid);
+
+				// EXPERIMENT START
+
+				runFairness_body(tid, width, length, data_out, count, nodes, nnodes, local, list);
+
+				// EXPERIMENT END
+
+				barrier.wait(tid);
+
+				for(const auto & n : nodes) {
+					local.valmax = max(local.valmax, size_t(n.value));
+					local.valmin = min(local.valmin, size_t(n.value));
+				}
+
+				tally_stats(global, local);
+			}, i++);
+		}
+
+		waitfor(duration, barrier, count);
+
+		for(auto t : threads) {
+			t->join();
+			delete t;
+		}
+
+		enable_stats = false;
+	}
 
 	print_stats(duration, nthread, global);
-}
+
+	save_fairness(data_out.get(), 100, nthread, width, length, output);
+}
+
+// ================================================================================================
 
 bool iequals(const std::string& a, const std::string& b)
@@ -395,8 +542,10 @@
 	unsigned nnodes   = 100;
 	unsigned nslots   = 100;
+	std::string out   = "fairness.png";
 
 	enum {
 		Churn,
 		PingPong,
+		Fairness,
 		NONE
 	} benchmark = NONE;
@@ -485,4 +634,16 @@
 					}
 					break;
+				case Fairness:
+					nnodes = 1;
+					switch(argc - optind) {
+					case 0: break;
+					case 1:
+						arg = optarg = argv[optind];
+						out = arg;
+						break;
+					default:
+						std::cerr << "'Churn' benchmark doesn't accept more than 2 extra arguments" << std::endl;
+						goto usage;
+					}
 				}
 				goto run;
@@ -499,4 +660,8 @@
 				if(iequals(arg, "pingpong")) {
 					benchmark = PingPong;
+					break;
+				}
+				if(iequals(arg, "fairness")) {
+					benchmark = Fairness;
 					break;
 				}
@@ -556,4 +721,7 @@
 			runPingPong(nthreads, nqueues, duration, nnodes);
 			break;
+		case Fairness:
+			runFairness(nthreads, nqueues, duration, nnodes, out);
+			break;
 		default:
 			abort();
@@ -563,2 +731,193 @@
 
 const char * __my_progname = "Relaxed List";
+
+struct rgb_t {
+    double r;       // a fraction between 0 and 1
+    double g;       // a fraction between 0 and 1
+    double b;       // a fraction between 0 and 1
+};
+
+struct hsv_t {
+    double h;       // angle in degrees
+    double s;       // a fraction between 0 and 1
+    double v;       // a fraction between 0 and 1
+};
+
+rgb_t hsv2rgb(hsv_t in) {
+	double hh, p, q, t, ff;
+	long   i;
+	rgb_t  out;
+
+	if(in.s <= 0.0) {       // < is bogus, just shuts up warnings
+		out.r = in.v;
+		out.g = in.v;
+		out.b = in.v;
+		return out;
+	}
+	hh = in.h;
+	if(hh >= 360.0) hh = 0.0;
+	hh /= 60.0;
+	i = (long)hh;
+	ff = hh - i;
+	p = in.v * (1.0 - in.s);
+	q = in.v * (1.0 - (in.s * ff));
+	t = in.v * (1.0 - (in.s * (1.0 - ff)));
+
+	switch(i) {
+	case 0:
+		out.r = in.v;
+		out.g = t;
+		out.b = p;
+		break;
+	case 1:
+		out.r = q;
+		out.g = in.v;
+		out.b = p;
+		break;
+	case 2:
+		out.r = p;
+		out.g = in.v;
+		out.b = t;
+		break;
+
+	case 3:
+		out.r = p;
+		out.g = q;
+		out.b = in.v;
+		break;
+	case 4:
+		out.r = t;
+		out.g = p;
+		out.b = in.v;
+		break;
+	case 5:
+	default:
+		out.r = in.v;
+		out.g = p;
+		out.b = q;
+		break;
+	}
+	return out;
+}
+
+void save_fairness(const int data[], int factor, unsigned nthreads, size_t columns, size_t rows, const std::string & output) {
+	std::ofstream os(output);
+	os << "<html>\n";
+	os << "<head>\n";
+	os << "<style>\n";
+	os << "</style>\n";
+	os << "</head>\n";
+	os << "<body>\n";
+	os << "<table style=\"width=100%\">\n";
+
+	size_t idx = 0;
+	for(size_t r = 0ul; r < rows; r++) {
+		os << "<tr>\n";
+		for(size_t c = 0ul; c < columns; c++) {
+			os << "<td class=\"custom custom" << data[idx] << "\"></td>\n";
+			idx++;
+		}
+		os << "</tr>\n";
+	}
+
+	os << "</table>\n";
+	os << "</body>\n";
+	os << "</html>\n";
+	os << std::endl;
+}
+
+#include <png.h>
+#include <setjmp.h>
+
+/*
+void save_fairness(const int data[], int factor, unsigned nthreads, size_t columns, size_t rows, const std::string & output) {
+	int width  = columns * factor;
+	int height = rows / factor;
+
+	int code = 0;
+	int idx = 0;
+	FILE *fp = NULL;
+	png_structp png_ptr = NULL;
+	png_infop info_ptr = NULL;
+	png_bytep row = NULL;
+
+	// Open file for writing (binary mode)
+	fp = fopen(output.c_str(), "wb");
+	if (fp == NULL) {
+		fprintf(stderr, "Could not open file %s for writing\n", output.c_str());
+		code = 1;
+		goto finalise;
+	}
+
+	   // Initialize write structure
+	png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
+	if (png_ptr == NULL) {
+		fprintf(stderr, "Could not allocate write struct\n");
+		code = 1;
+		goto finalise;
+	}
+
+	// Initialize info structure
+	info_ptr = png_create_info_struct(png_ptr);
+	if (info_ptr == NULL) {
+		fprintf(stderr, "Could not allocate info struct\n");
+		code = 1;
+		goto finalise;
+	}
+
+	// Setup Exception handling
+	if (setjmp(png_jmpbuf(png_ptr))) {
+		fprintf(stderr, "Error during png creation\n");
+		code = 1;
+		goto finalise;
+	}
+
+	png_init_io(png_ptr, fp);
+
+	// Write header (8 bit colour depth)
+	png_set_IHDR(png_ptr, info_ptr, width, height,
+		8, PNG_COLOR_TYPE_RGB, PNG_INTERLACE_NONE,
+		PNG_COMPRESSION_TYPE_BASE, PNG_FILTER_TYPE_BASE);
+
+	png_write_info(png_ptr, info_ptr);
+
+	// Allocate memory for one row (3 bytes per pixel - RGB)
+	row = (png_bytep) malloc(3 * width * sizeof(png_byte));
+
+	// Write image data
+	int x, y;
+	for (y=0 ; y<height ; y++) {
+		for (x=0 ; x<width ; x++) {
+			auto & r = row[(x * 3) + 0];
+			auto & g = row[(x * 3) + 1];
+			auto & b = row[(x * 3) + 2];
+			assert(idx < (rows * columns));
+			int color = data[idx] - 1;
+			assert(color < nthreads);
+			assert(color >= 0);
+			idx++;
+
+			double angle = double(color) / double(nthreads);
+
+			auto c = hsv2rgb({ 360.0 * angle, 0.8, 0.8 });
+
+			r = char(c.r * 255.0);
+			g = char(c.g * 255.0);
+			b = char(c.b * 255.0);
+
+		}
+		png_write_row(png_ptr, row);
+	}
+
+	assert(idx == (rows * columns));
+
+	// End write
+	png_write_end(png_ptr, NULL);
+
+	finalise:
+	if (fp != NULL) fclose(fp);
+	if (info_ptr != NULL) png_free_data(png_ptr, info_ptr, PNG_FREE_ALL, -1);
+	if (png_ptr != NULL) png_destroy_write_struct(&png_ptr, (png_infopp)NULL);
+	if (row != NULL) free(row);
+}
+*/
Index: doc/theses/thierry_delisle_PhD/code/relaxed_list.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/relaxed_list.hpp	(revision df75fe974af930420e0dc45e8117b8e9434c224c)
+++ doc/theses/thierry_delisle_PhD/code/relaxed_list.hpp	(revision 40cac90c90ddfdd945a6730e54a6ae73d8629bfc)
@@ -105,5 +105,4 @@
 	static_assert(std::is_same<decltype(node_t::_links), _LinksFields_t<node_t>>::value, "Node must have a links field");
 
-
 public:
 	relaxed_list(unsigned numLists)
@@ -112,14 +111,16 @@
 	{
 		assertf(7 * 8 * 8 >= numLists, "List currently only supports 448 sublists");
-		assert(sizeof(*this) == 128);
+		// assert(sizeof(*this) == 128);
+		std::cout << "Constructing Relaxed List with " << numLists << std::endl;
+
+		#ifndef NO_STATS
+			if(head) this->next = head;
+			head = this;
+		#endif
 	}
 
 	~relaxed_list() {
+		std::cout << "Destroying Relaxed List" << std::endl;
 		lists.reset();
-		#ifndef NO_STATS
-			std::cout << "Difference   : "
-				<< ssize_t(double(intrusive_queue_t::stat::dif.value) / intrusive_queue_t::stat::dif.num  ) << " avg\t"
-				<< intrusive_queue_t::stat::dif.max << "max" << std::endl;
-		#endif
 	}
 
@@ -175,10 +176,12 @@
 			int nnempty;
 			while(0 != (nnempty = numNonEmpty)) {
+				tls.pick.pop.mask_attempt++;
 				unsigned i, j;
-				if( numLists < 4 || (numLists / nnempty) < 4 ) {
-					// Pick two lists at random
-					i = tls.rng.next() % numLists;
-					j = tls.rng.next() % numLists;
-				} else {
+				// if( numLists < 4 || (numLists / nnempty) < 4 ) {
+				// 	// Pick two lists at random
+				// 	i = tls.rng.next() % numLists;
+				// 	j = tls.rng.next() % numLists;
+				// } else
+				{
 					#ifndef NO_STATS
 						// tls.pick.push.mask_attempt++;
@@ -199,12 +202,9 @@
 					if(maski == 0 && maskj == 0) continue;
 
-					unsigned biti = maski ? ri % __builtin_popcountl(maski) : 0;
-					unsigned bitj = maskj ? rj % __builtin_popcountl(maskj) : 0;
-
-					unsigned bi = 64 - nthSetBit(maski, biti + 1);
-					unsigned bj = 64 - nthSetBit(maskj, bitj + 1);
-
-					assertf(bi < 64, "%zu %u %u", maski, biti, bi);
-					assertf(bj < 64, "%zu %u %u", maskj, bitj, bj);
+					unsigned bi = rand_bit(ri, maski);
+					unsigned bj = rand_bit(rj, maskj);
+
+					assertf(bi < 64, "%zu %u", maski, bi);
+					assertf(bj < 64, "%zu %u", maskj, bj);
 
 					i = bi | (wdxi << 6);
@@ -294,10 +294,8 @@
 		struct stat {
 			ssize_t diff = 0;
-
-			static struct Dif {
-				ssize_t value = 0;
-				size_t  num   = 0;
-				ssize_t max   = 0;
-			} dif;
+			size_t  push = 0;
+			size_t  pop  = 0;
+			// size_t value = 0;
+			// size_t count = 0;
 		};
 
@@ -314,5 +312,8 @@
 		#endif
 
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winvalid-offsetof"
 		static constexpr auto fields_offset = offsetof( node_t, _links );
+#pragma GCC diagnostic pop
 	public:
 		intrusive_queue_t()
@@ -330,11 +331,5 @@
 		}
 
-		~intrusive_queue_t() {
-			#ifndef NO_STATS
-				stat::dif.value+= s.diff;
-				stat::dif.num  ++;
-				stat::dif.max  = std::abs(stat::dif.max) > std::abs(s.diff) ? stat::dif.max : s.diff;
-			#endif
-		}
+		~intrusive_queue_t() = default;
 
 		inline node_t * head() const {
@@ -367,5 +362,8 @@
 			tail->_links.prev = node;
 			#ifndef NO_STATS
-				if(enable_stats) s.diff++;
+				if(enable_stats) {
+					s.diff++;
+					s.push++;
+				}
 			#endif
 			if(before._links.ts == 0l) {
@@ -390,5 +388,8 @@
 
 			#ifndef NO_STATS
-				if(enable_stats) s.diff--;
+				if(enable_stats) {
+					s.diff--;
+					s.pop ++;
+				}
 			#endif
 			if(next == tail) {
@@ -419,6 +420,6 @@
 
 public:
-	std::atomic_int numNonEmpty  = 0;  // number of non-empty lists
-	std::atomic_size_t list_mask[7] = { 0 }; // which queues are empty
+	std::atomic_int numNonEmpty  = { 0 };  // number of non-empty lists
+	std::atomic_size_t list_mask[7] = { {0}, {0}, {0}, {0}, {0}, {0}, {0} }; // which queues are empty
 private:
     	__attribute__((aligned(64))) std::unique_ptr<intrusive_queue_t []> lists;
@@ -427,3 +428,92 @@
 public:
 	static const constexpr size_t sizeof_queue = sizeof(intrusive_queue_t);
+
+#ifndef NO_STATS
+	static void stats_print(std::ostream & os) {
+		auto it = head;
+		while(it) {
+			it->stats_print_local(os);
+			it = it->next;
+		}
+	}
+
+	static void stats_tls_tally() {
+		global_stats.pick.push.attempt += tls.pick.push.attempt;
+		global_stats.pick.push.success += tls.pick.push.success;
+		global_stats.pick.pop .attempt += tls.pick.pop.attempt;
+		global_stats.pick.pop .success += tls.pick.pop.success;
+		global_stats.pick.pop .mask_attempt += tls.pick.pop.mask_attempt;
+
+		global_stats.qstat.push.value += tls.empty.push.value;
+		global_stats.qstat.push.count += tls.empty.push.count;
+		global_stats.qstat.pop .value += tls.empty.pop .value;
+		global_stats.qstat.pop .count += tls.empty.pop .count;
+	}
+
+private:
+	static struct GlobalStats {
+		struct {
+			struct {
+				std::atomic_size_t attempt = { 0 };
+				std::atomic_size_t success = { 0 };
+			} push;
+			struct {
+				std::atomic_size_t attempt = { 0 };
+				std::atomic_size_t success = { 0 };
+				std::atomic_size_t mask_attempt = { 0 };
+			} pop;
+		} pick;
+		struct {
+			struct {
+				std::atomic_size_t value = { 0 };
+				std::atomic_size_t count = { 0 };
+			} push;
+			struct {
+				std::atomic_size_t value = { 0 };
+				std::atomic_size_t count = { 0 };
+			} pop;
+		} qstat;
+	} global_stats;
+
+	// Link list of all lists for stats
+	__attribute__((aligned(64))) relaxed_list<node_t> * next = nullptr;
+
+	static relaxed_list<node_t> * head;
+
+	void stats_print_local(std::ostream & os ) {
+		std::cout << "----- Relaxed List Stats -----" << std::endl;
+		{
+			ssize_t diff = 0;
+			size_t  num  = 0;
+			ssize_t max  = 0;
+
+			for(size_t i = 0; i < numLists; i++) {
+				const auto & list = lists[i];
+				diff+= list.s.diff;
+				num ++;
+				max  = std::abs(max) > std::abs(list.s.diff) ? max : list.s.diff;
+				os << "Local Q ops   : " << (list.s.push + list.s.pop) << "(" << list.s.push << "i, " << list.s.pop << "o)\n";
+			}
+
+			os << "Difference   : " << ssize_t(double(diff) / num  ) << " avg\t" << max << "max" << std::endl;
+		}
+
+		const auto & global = global_stats;
+
+		double push_sur = (100.0 * double(global.pick.push.success) / global.pick.push.attempt);
+		double pop_sur  = (100.0 * double(global.pick.pop .success) / global.pick.pop .attempt);
+		double mpop_sur = (100.0 * double(global.pick.pop .success) / global.pick.pop .mask_attempt);
+
+		os << "Push   Pick % : " << push_sur << "(" << global.pick.push.success << " / " << global.pick.push.attempt << ")\n";
+		os << "Pop    Pick % : " << pop_sur  << "(" << global.pick.pop .success << " / " << global.pick.pop .attempt << ")\n";
+		os << "TryPop Pick % : " << mpop_sur << "(" << global.pick.pop .success << " / " << global.pick.pop .mask_attempt << ")\n";
+
+		double avgQ_push = double(global.qstat.push.value) / global.qstat.push.count;
+		double avgQ_pop  = double(global.qstat.pop .value) / global.qstat.pop .count;
+		double avgQ      = double(global.qstat.push.value + global.qstat.pop .value) / (global.qstat.push.count + global.qstat.pop .count);
+		os << "Push   Avg Qs : " << avgQ_push << " (" << global.qstat.push.count << "ops)\n";
+		os << "Pop    Avg Qs : " << avgQ_pop  << " (" << global.qstat.pop .count << "ops)\n";
+		os << "Global Avg Qs : " << avgQ      << " (" << (global.qstat.push.count + global.qstat.pop .count) << "ops)\n";
+	}
+#endif
 };
Index: doc/theses/thierry_delisle_PhD/code/utils.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/utils.hpp	(revision df75fe974af930420e0dc45e8117b8e9434c224c)
+++ doc/theses/thierry_delisle_PhD/code/utils.hpp	(revision 40cac90c90ddfdd945a6730e54a6ae73d8629bfc)
@@ -58,5 +58,5 @@
 }
 
-void affinity(int tid) {
+static inline void affinity(int tid) {
 	static int cpus = get_nprocs();
 
@@ -72,5 +72,5 @@
 
 static const constexpr std::size_t cache_line_size = 64;
-void check_cache_line_size() {
+static inline void check_cache_line_size() {
 	std::cout << "Checking cache line size" << std::endl;
 	const std::string cache_file = "/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size";
@@ -106,26 +106,9 @@
 }
 
-// from geeksforgeeks
-const constexpr unsigned num_to_bits[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
-
-/* Recursively get nibble of a given number
-and map them in the array */
-__attribute__((noinline)) unsigned countSetBits(size_t num) {
-	unsigned nibble = 0;
-	if (0 == num)
-		return num_to_bits[0];
-
-	// Find last nibble
-	nibble = num & 0xf;
-
-	// Use pre-stored values to find count
-	// in last nibble plus recursively add
-	// remaining nibbles.
-	return num_to_bits[nibble] + countSetBits(num >> 4);
-}
-
-__attribute__((noinline)) unsigned nthSetBit(size_t mask, unsigned bit) {
+static inline unsigned rand_bit(unsigned rnum, size_t mask) {
+	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
+#if !defined(__BMI2__)
 	uint64_t v = mask;   // Input value to find position with rank r.
-	unsigned int r = bit;// Input: bit's desired rank [1-64].
+	unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
 	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
 	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
@@ -134,11 +117,7 @@
 	// Do a normal parallel bit count for a 64-bit integer,
 	// but store all intermediate steps.
-	// a = (v & 0x5555...) + ((v >> 1) & 0x5555...);
 	a =  v - ((v >> 1) & ~0UL/3);
-	// b = (a & 0x3333...) + ((a >> 2) & 0x3333...);
 	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
-	// c = (b & 0x0f0f...) + ((b >> 4) & 0x0f0f...);
 	c = (b + (b >> 4)) & ~0UL/0x11;
-	// d = (c & 0x00ff...) + ((c >> 8) & 0x00ff...);
 	d = (c + (c >> 8)) & ~0UL/0x101;
 
@@ -147,63 +126,19 @@
 	// Now do branchless select!
 	s  = 64;
-	// if (r > t) {s -= 32; r -= t;}
 	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
 	t  = (d >> (s - 16)) & 0xff;
-	// if (r > t) {s -= 16; r -= t;}
 	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
 	t  = (c >> (s - 8)) & 0xf;
-	// if (r > t) {s -= 8; r -= t;}
 	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
 	t  = (b >> (s - 4)) & 0x7;
-	// if (r > t) {s -= 4; r -= t;}
 	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
 	t  = (a >> (s - 2)) & 0x3;
-	// if (r > t) {s -= 2; r -= t;}
 	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
 	t  = (v >> (s - 1)) & 0x1;
-	// if (r > t) s--;
 	s -= ((t - r) & 256) >> 8;
-	s = 65 - s;
-	return s;
+	return s - 1;
+#else
+	uint64_t picked = _pdep_u64(1ul << bit, mask);
+	return picked ? __builtin_ctzl(picked) : 0;
+#endif
 }
-
-// __attribute__((noinline)) unsigned int nth_bit_set(uint64_t value, unsigned int n)
-// {
-// 	uint64_t      mask = 0x00000000FFFFFFFFul;
-// 	unsigned int  size = 32u;
-// 	unsigned int  base = 0u;
-
-// 	if(value == 0) return 0;
-// 	assertf(n < (unsigned)__builtin_popcountl(value), "%u < %d (%zu)", n, __builtin_popcountl(value), value);
-// 	n++;
-// 	while (size > 0) {
-// 		const unsigned int  count = __builtin_popcountl(value & mask);
-// 		if (n > count) {
-// 			base += size;
-// 			size >>= 1;
-// 			mask |= mask << size;
-// 		} else {
-// 			size >>= 1;
-// 			mask >>= size;
-// 		}
-//     }
-
-//     return base;
-// }
-
-
-static inline uint64_t rotl64 (uint64_t n, unsigned int c) {
-	const unsigned int mask = (8*sizeof(n) - 1);  // assumes width is a power of 2.
-
-	c &= mask;
-	return (n<<c) | (n>>( (-c)&mask ));
-}
-
-static inline uint64_t rotr64 (uint64_t n, unsigned int c)
-{
-	const unsigned int mask = (8*sizeof(n) - 1);
-
-	// assert ( (c<=mask) &&"rotate by type width or more");
-	c &= mask;
-	return (n>>c) | (n<<( (-c)&mask ));
-}
