Index: doc/theses/thierry_delisle_PhD/code/Makefile
===================================================================
--- doc/theses/thierry_delisle_PhD/code/Makefile	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,22 +1,0 @@
-
-
-CXXFLAGS = -O3 -g -Wall -Wextra -std=c++17
-LDFLAGS = -pthread -latomic
-
-push:
-	clang++ relaxed_list.cpp -g -Wall -Wextra -std=c++17 -fsyntax-only &&  rsync -av relaxed_list.cpp relaxed_list.hpp utils.hpp assert.hpp scale.sh plg7b:~/workspace/sched/.
-
-relaxed_list: $(firstword $(MAKEFILE_LIST)) | build
-	clang++ relaxed_list.cpp $(CXXFLAGS) $(LDFLAGS) -lpng -MMD -MF build/$(@).d -o $(@)
-
--include build/relaxed_list.d
-
-layout.ast: $(firstword $(MAKEFILE_LIST)) | build
-	clang++ relaxed_list_layout.cpp $(CXXFLAGS) -MMD -MF build/$(@).d -MT $(@) -E -o build/$(@).ii
-	clang++ -Xclang -fdump-record-layouts -fsyntax-only $(CXXFLAGS) build/$(@).ii > build/layout.ast.raw
-	cat build/$(@).raw > $(@)
-
--include build/layout.ast.d
-
-build:
-	mkdir -p build
Index: doc/theses/thierry_delisle_PhD/code/assert.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/assert.hpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,22 +1,0 @@
-#pragma once
-
-#ifndef NDEBUG
-#include <cassert>
-#include <cstdlib>
-
-#define sstr(s) #s
-#define xstr(s) sstr(s)
-
-extern const char * __my_progname;
-
-#define assertf(cond, ...) ({             \
-	if(!(cond)) {                       \
-		fprintf(stderr, "%s: " __FILE__ ":" xstr(__LINE__) ": %s: Assertion '" xstr(cond) "' failed.\n", __my_progname, __PRETTY_FUNCTION__); \
-		fprintf(stderr, __VA_ARGS__); \
-		fprintf(stderr, "\n"); \
-		std::abort();                 \
-	}                                   \
-})
-#else
-#define assertf(cond, ...)
-#endif
Index: doc/theses/thierry_delisle_PhD/code/bitbench/select.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/bitbench/select.cpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,186 +1,0 @@
-
-#include "../utils.hpp"
-
-void consume(int i, int j) __attribute__((noinline));
-void consume(int i, int j) {
-	asm volatile("":: "rm" (i), "rm" (i) );
-}
-
-static inline unsigned rand_bit_sw(unsigned rnum, size_t mask) {
-	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
-	uint64_t v = mask;   // Input value to find position with rank r.
-	unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
-	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
-	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
-	unsigned int t;      // Bit count temporary.
-
-	// Do a normal parallel bit count for a 64-bit integer,
-	// but store all intermediate steps.
-	a =  v - ((v >> 1) & ~0UL/3);
-	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
-	c = (b + (b >> 4)) & ~0UL/0x11;
-	d = (c + (c >> 8)) & ~0UL/0x101;
-
-
-	t = (d >> 32) + (d >> 48);
-	// Now do branchless select!
-	s  = 64;
-	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
-	t  = (d >> (s - 16)) & 0xff;
-	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
-	t  = (c >> (s - 8)) & 0xf;
-	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
-	t  = (b >> (s - 4)) & 0x7;
-	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
-	t  = (a >> (s - 2)) & 0x3;
-	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
-	t  = (v >> (s - 1)) & 0x1;
-	s -= ((t - r) & 256) >> 8;
-	return s - 1;
-}
-
-static inline unsigned rand_bit_hw(unsigned rnum, size_t mask) {
-	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
-	uint64_t picked = _pdep_u64(1ul << bit, mask);
-	return picked ? __builtin_ctzl(picked) : 0;
-}
-
-struct TLS {
-	Random rng = { 6 };
-} tls;
-
-const unsigned numLists = 64;
-
-static inline void blind() {
-	int i = tls.rng.next() % numLists;
-	int j = tls.rng.next() % numLists;
-
-	consume(i, j);
-}
-
-std::atomic_size_t list_mask[7];
-static inline void bitmask_sw() {
-	unsigned i, j;
-	{
-		// Pick two lists at random
-		unsigned num = ((numLists - 1) >> 6) + 1;
-
-		unsigned ri = tls.rng.next();
-		unsigned rj = tls.rng.next();
-
-		unsigned wdxi = (ri >> 6u) % num;
-		unsigned wdxj = (rj >> 6u) % num;
-
-		size_t maski = list_mask[wdxi].load(std::memory_order_relaxed);
-		size_t maskj = list_mask[wdxj].load(std::memory_order_relaxed);
-
-		unsigned bi = rand_bit_sw(ri, maski);
-		unsigned bj = rand_bit_sw(rj, maskj);
-
-		i = bi | (wdxi << 6);
-		j = bj | (wdxj << 6);
-	}
-
-	consume(i, j);
-}
-
-static inline void bitmask_hw() {
-	#if !defined(__BMI2__)
-		#warning NO bmi2 for pdep rand_bit
-		return;
-	#endif
-	unsigned i, j;
-	{
-		// Pick two lists at random
-		unsigned num = ((numLists - 1) >> 6) + 1;
-
-		unsigned ri = tls.rng.next();
-		unsigned rj = tls.rng.next();
-
-		unsigned wdxi = (ri >> 6u) % num;
-		unsigned wdxj = (rj >> 6u) % num;
-
-		size_t maski = list_mask[wdxi].load(std::memory_order_relaxed);
-		size_t maskj = list_mask[wdxj].load(std::memory_order_relaxed);
-
-		unsigned bi = rand_bit_hw(ri, maski);
-		unsigned bj = rand_bit_hw(rj, maskj);
-
-		i = bi | (wdxi << 6);
-		j = bj | (wdxj << 6);
-	}
-
-	consume(i, j);
-}
-
-struct {
-	const unsigned mask = 7;
-	const unsigned depth = 3;
-	const uint64_t indexes = 0x0706050403020100;
-	uint64_t masks( unsigned node ) {
-		return 0xff00ffff00ff;
-	}
-} snzm;
-static inline void sparsemask() {
-	#if !defined(__BMI2__)
-		#warning NO bmi2 for sparse mask
-		return;
-	#endif
-	unsigned i, j;
-	{
-		// Pick two random number
-		unsigned ri = tls.rng.next();
-		unsigned rj = tls.rng.next();
-
-		// Pick two nodes from it
-		unsigned wdxi = ri & snzm.mask;
-		unsigned wdxj = rj & snzm.mask;
-
-		// Get the masks from the nodes
-		size_t maski = snzm.masks(wdxi);
-		size_t maskj = snzm.masks(wdxj);
-
-		uint64_t idxsi = _pext_u64(snzm.indexes, maski);
-		uint64_t idxsj = _pext_u64(snzm.indexes, maskj);
-
-		auto pi = __builtin_popcountll(maski);
-		auto pj = __builtin_popcountll(maskj);
-
-		ri = pi ? ri & ((pi >> 3) - 1) : 0;
-		rj = pj ? rj & ((pj >> 3) - 1) : 0;
-
-		unsigned bi = (idxsi >> (ri << 3)) & 0xff;
-		unsigned bj = (idxsj >> (rj << 3)) & 0xff;
-
-		i = (bi << snzm.depth) | wdxi;
-		j = (bj << snzm.depth) | wdxj;
-	}
-
-	consume(i, j);
-}
-
-template<typename T>
-void benchmark( T func, const std::string & name ) {
-	std::cout << "Starting " << name << std::endl;
-	auto before = Clock::now();
-	const int N = 250'000'000;
-	for(int i = 0; i < N; i++) {
-		func();
-	}
-	auto after = Clock::now();
-	duration_t durr = after - before;
-	double duration = durr.count();
-	std::cout << "Duration(s) : " << duration << std::endl;
-	std::cout << "Ops/sec     : " << uint64_t(N / duration) << std::endl;
-	std::cout << "ns/Op       : " << double(duration * 1'000'000'000.0 / N) << std::endl;
-	std::cout << std::endl;
-}
-
-int main() {
-	std::cout.imbue(std::locale(""));
-
-	benchmark(blind, "Blind guess");
-	benchmark(bitmask_sw, "Dense bitmask");
-	benchmark(bitmask_hw, "Dense bitmask with Parallel Deposit");
-	benchmark(sparsemask, "Parallel Extract bitmask");
-}
Index: doc/theses/thierry_delisle_PhD/code/bts.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/bts.cpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,279 +1,0 @@
-#include <array>
-#include <iomanip>
-#include <iostream>
-#include <locale>
-#include <string>
-#include <thread>
-#include <vector>
-
-#include <getopt.h>
-#include <unistd.h>
-#include <sys/sysinfo.h>
-
-#include "utils.hpp"
-
-// ================================================================================================
-//                        UTILS
-// ================================================================================================
-
-struct local_stat_t {
-	size_t cnt = 0;
-};
-
-struct global_stat_t {
-	std::atomic_size_t cnt = { 0 };
-};
-
-void atomic_max(std::atomic_size_t & target, size_t value) {
-	for(;;) {
-		size_t expect = target.load(std::memory_order_relaxed);
-		if(value <= expect) return;
-		bool success = target.compare_exchange_strong(expect, value);
-		if(success) return;
-	}
-}
-
-void atomic_min(std::atomic_size_t & target, size_t value) {
-	for(;;) {
-		size_t expect = target.load(std::memory_order_relaxed);
-		if(value >= expect) return;
-		bool success = target.compare_exchange_strong(expect, value);
-		if(success) return;
-	}
-}
-
-void tally_stats(global_stat_t & global, local_stat_t & local) {
-	global.cnt   += local.cnt;
-}
-
-void waitfor(double & duration, barrier_t & barrier, std::atomic_bool & done) {
-	std::cout << "Starting" << std::endl;
-	auto before = Clock::now();
-	barrier.wait(0);
-
-	while(true) {
-		usleep(100000);
-		auto now = Clock::now();
-		duration_t durr = now - before;
-		if( durr.count() > duration ) {
-			done = true;
-			break;
-		}
-		std::cout << "\r" << std::setprecision(4) << durr.count();
-		std::cout.flush();
-	}
-
-	barrier.wait(0);
-	auto after = Clock::now();
-	duration_t durr = after - before;
-	duration = durr.count();
-	std::cout << "\rClosing down" << std::endl;
-}
-
-void waitfor(double & duration, barrier_t & barrier, const std::atomic_size_t & count) {
-	std::cout << "Starting" << std::endl;
-	auto before = Clock::now();
-	barrier.wait(0);
-
-	while(true) {
-		usleep(100000);
-		size_t c = count.load();
-		if( c == 0 ) {
-			break;
-		}
-		std::cout << "\r" << c;
-		std::cout.flush();
-	}
-
-	barrier.wait(0);
-	auto after = Clock::now();
-	duration_t durr = after - before;
-	duration = durr.count();
-	std::cout << "\rClosing down" << std::endl;
-}
-
-void print_stats(double duration, unsigned nthread, global_stat_t & global) {
-	std::cout << "Done" << std::endl;
-
-	size_t ops = global.cnt;
-	size_t ops_sec = size_t(double(ops) / duration);
-	size_t ops_thread = ops_sec / nthread;
-	auto dur_nano = duration_cast<std::nano>(1.0);
-
-	std::cout << "Duration      : " << duration << "s\n";
-	std::cout << "ns/Op         : " << ( dur_nano / ops_thread )<< "\n";
-	std::cout << "Ops/sec/thread: " << ops_thread << "\n";
-	std::cout << "Ops/sec       : " << ops_sec << "\n";
-	std::cout << "Total ops     : " << ops << "\n";
-}
-
-static inline bool bts(std::atomic_size_t & target, size_t bit ) {
-	/*
-	int result = 0;
-	asm volatile(
-		"LOCK btsq %[bit], %[target]\n\t"
-		:"=@ccc" (result)
-		: [target] "m" (target), [bit] "r" (bit)
-	);
- 	return result != 0;
-	/*/
-	size_t mask = 1ul << bit;
-	size_t ret = target.fetch_or(mask, std::memory_order_relaxed);
-	return (ret & mask) != 0;
-	//*/
-}
-
-static inline bool btr(std::atomic_size_t & target, size_t bit ) {
-	/*
-	int result = 0;
-	asm volatile(
-		"LOCK btrq %[bit], %[target]\n\t"
-		:"=@ccc" (result)
-		: [target] "m" (target), [bit] "r" (bit)
-	);
- 	return result != 0;
-	/*/
-	size_t mask = 1ul << bit;
-	size_t ret = target.fetch_and(~mask, std::memory_order_relaxed);
-	return (ret & mask) != 0;
-	//*/
-}
-
-// ================================================================================================
-//                        EXPERIMENTS
-// ================================================================================================
-
-// ================================================================================================
-__attribute__((noinline)) void runPingPong_body(
-	std::atomic<bool>& done,
-	local_stat_t & local,
-	std::atomic_size_t & target,
-	size_t id
-) {
-	while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) {
-
-		bool ret;
-		ret = bts(target, id);
-		assert(!ret);
-
-		// -----
-
-		ret = btr(target, id);
-		assert(ret);
-		local.cnt++;
-	}
-}
-
-void run(unsigned nthread, double duration) {
-	// Barrier for synchronization
-	barrier_t barrier(nthread + 1);
-
-	// Data to check everything is OK
-	global_stat_t global;
-
-	// Flag to signal termination
-	std::atomic_bool done  = { false };
-
-	std::cout << "Initializing ";
-	// List being tested
-	std::atomic_size_t word = { 0 };
-	{
-		std::thread * threads[nthread];
-		unsigned i = 1;
-		for(auto & t : threads) {
-			t = new std::thread([&done, &word, &barrier, &global](unsigned tid) {
-				local_stat_t local;
-
-				// affinity(tid);
-
-				barrier.wait(tid);
-
-				// EXPERIMENT START
-
-				runPingPong_body(done, local, word, tid - 1);
-
-				// EXPERIMENT END
-
-				barrier.wait(tid);
-
-				tally_stats(global, local);
-			}, i++);
-		}
-
-		waitfor(duration, barrier, done);
-
-		for(auto t : threads) {
-			t->join();
-			delete t;
-		}
-	}
-
-	print_stats(duration, nthread, global);
-}
-
-// ================================================================================================
-
-int main(int argc, char * argv[]) {
-
-	double duration   = 5.0;
-	unsigned nthreads = 2;
-
-	std::cout.imbue(std::locale(""));
-
-	for(;;) {
-		static struct option options[] = {
-			{"duration",  required_argument, 0, 'd'},
-			{"nthreads",  required_argument, 0, 't'},
-			{0, 0, 0, 0}
-		};
-
-		int idx = 0;
-		int opt = getopt_long(argc, argv, "d:t:", options, &idx);
-
-		std::string arg = optarg ? optarg : "";
-		size_t len = 0;
-		switch(opt) {
-			case -1:
-				if(optind != argc) {
-					std::cerr << "Too many arguments " << argc << " " << idx << std::endl;
-					goto usage;
-				}
-				goto run;
-			// Numeric Arguments
-			case 'd':
-				try {
-					duration = std::stod(optarg, &len);
-					if(len != arg.size()) { throw std::invalid_argument(""); }
-				} catch(std::invalid_argument &) {
-					std::cerr << "Duration must be a valid double, was " << arg << std::endl;
-					goto usage;
-				}
-				break;
-			case 't':
-				try {
-					nthreads = std::stoul(optarg, &len);
-					if(len != arg.size() || nthreads > (8 * sizeof(size_t))) { throw std::invalid_argument(""); }
-				} catch(std::invalid_argument &) {
-					std::cerr << "Number of threads must be a positive integer less than or equal to " << sizeof(size_t) * 8 << ", was " << arg << std::endl;
-					goto usage;
-				}
-				break;
-			// Other cases
-			default: /* ? */
-				std::cerr << opt << std::endl;
-			usage:
-				std::cerr << "Usage: " << argv[0] << ": [options]" << std::endl;
-				std::cerr << std::endl;
-				std::cerr << "  -d, --duration=DURATION  Duration of the experiment, in seconds" << std::endl;
-				std::cerr << "  -t, --nthreads=NTHREADS  Number of kernel threads" << std::endl;
-				std::exit(1);
-		}
-	}
-	run:
-
-	check_cache_line_size();
-
-	std::cout << "Running " << nthreads << " threads for " << duration << " seconds" << std::endl;
-	run(nthreads, duration);
-	return 0;
-}
Index: doc/theses/thierry_delisle_PhD/code/bts_test.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/bts_test.cpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,32 +1,0 @@
-#include <cassert>
-#include <iostream>
-
-bool bts(volatile size_t & target, size_t bit ) {
-	bool result = false;
-	asm volatile(
-		"LOCK btsq %[bit], %[target]\n\t"
-		:"=c" (result)
-		: [target] "m" (target), [bit] "r" (bit)
-	);
- 	return result;
-}
-
-bool btr(volatile size_t & target, size_t bit ) {
-	bool result = false;
-	asm volatile(
-		"LOCK btrq %[bit], %[target]\n\t"
-		:"=c" (result)
-		: [target] "m" (target), [bit] "r" (bit)
-	);
- 	return result;
-}
-
-int main() {
-	volatile size_t i = 0;
-	std::cout << std::hex << i << std::endl;
-	assert(bts(i, 31));
-	std::cout << std::hex << i << std::endl;
-	assert(btr(i, 31));
-	std::cout << std::hex << i << std::endl;
-	return 0;
-}
Index: doc/theses/thierry_delisle_PhD/code/links.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/links.hpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,122 +1,0 @@
-#pragma once
-
-#include "assert.hpp"
-#include "utils.hpp"
-
-template<typename node_t>
-struct _LinksFields_t {
-	node_t * prev = nullptr;
-	node_t * next = nullptr;
-	volatile unsigned long long ts = 0;
-	unsigned hint = (unsigned)-1;
-};
-
-template<typename node_t>
-class __attribute__((aligned(128))) intrusive_queue_t {
-public:
-	typedef spinlock_t lock_t;
-
-	struct stat {
-		ssize_t diff = 0;
-		size_t  push = 0;
-		size_t  pop  = 0;
-	};
-
-private:
-	struct sentinel_t {
-		_LinksFields_t<node_t> _links;
-	};
-
-public:
-	lock_t lock;
-
-private:
-	sentinel_t before;
-	sentinel_t after;
-
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Winvalid-offsetof"
-	static constexpr auto fields_offset = offsetof( node_t, _links );
-#pragma GCC diagnostic pop
-public:
-	intrusive_queue_t()
-		: before{{ nullptr, tail() }}
-		, after {{ head(), nullptr }}
-	{
-		/* paranoid */ assert((reinterpret_cast<uintptr_t>( head() ) + fields_offset) == reinterpret_cast<uintptr_t>(&before));
-		/* paranoid */ assert((reinterpret_cast<uintptr_t>( tail() ) + fields_offset) == reinterpret_cast<uintptr_t>(&after ));
-		/* paranoid */ assert(head()->_links.prev == nullptr);
-		/* paranoid */ assert(head()->_links.next == tail() );
-		/* paranoid */ assert(tail()->_links.next == nullptr);
-		/* paranoid */ assert(tail()->_links.prev == head() );
-		/* paranoid */ assert(sizeof(*this) == 128);
-		/* paranoid */ assert((intptr_t(this) % 128) == 0);
-	}
-
-	~intrusive_queue_t() = default;
-
-	inline node_t * head() const {
-		node_t * rhead = reinterpret_cast<node_t *>(
-			reinterpret_cast<uintptr_t>( &before ) - fields_offset
-		);
-		assert(rhead);
-		return rhead;
-	}
-
-	inline node_t * tail() const {
-		node_t * rtail = reinterpret_cast<node_t *>(
-			reinterpret_cast<uintptr_t>( &after ) - fields_offset
-		);
-		assert(rtail);
-		return rtail;
-	}
-
-	inline bool push(node_t * node) {
-		assert(lock);
-		assert(node->_links.ts != 0);
-		node_t * tail = this->tail();
-
-		node_t * prev = tail->_links.prev;
-		// assertf(node->_links.ts >= prev->_links.ts,
-		// 	"New node has smaller timestamp: %llu < %llu", node->_links.ts, prev->_links.ts);
-		node->_links.next = tail;
-		node->_links.prev = prev;
-		prev->_links.next = node;
-		tail->_links.prev = node;
-
-		if(before._links.ts == 0l) {
-			before._links.ts = node->_links.ts;
-			assert(node->_links.prev == this->head());
-			return true;
-		}
-		return false;
-	}
-
-	inline std::pair<node_t *, bool> pop() {
-		assert(lock);
-		node_t * head = this->head();
-		node_t * tail = this->tail();
-
-		node_t * node = head->_links.next;
-		node_t * next = node->_links.next;
-		if(node == tail) return {nullptr, false};
-
-		head->_links.next = next;
-		next->_links.prev = head;
-
-		if(next == tail) {
-			before._links.ts = 0l;
-			return {node, true};
-		}
-		else {
-			assert(next->_links.ts != 0);
-			before._links.ts = next->_links.ts;
-			assert(before._links.ts != 0);
-			return {node, false};
-		}
-	}
-
-	long long ts() const {
-		return before._links.ts;
-	}
-};
Index: doc/theses/thierry_delisle_PhD/code/prefetch.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/prefetch.cpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,106 +1,0 @@
-#include <algorithm>
-#include <array>
-#include <chrono>
-#include <iostream>
-#include <locale>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include <cassert>
-
-struct __attribute__((aligned(64))) element {
-	size_t value;
-};
-
-using block = std::array<element, 100>;
-
-block * create() {
-	block * b = new block();
-	for(auto & e : *b) {
-		e.value = rand();
-	}
-	b->back().value = b->size();
-
-	return b;
-}
-
-static inline size_t find(const block & b) {
-	size_t r = 0;
-	for(; r < b.size(); r++) {
-		if(__builtin_expect(b[r].value == b.size(), false)) break;
-	}
-
-	return r;
-}
-
-void usage(char * argv[]) {
-	std::cerr << argv[0] << ": [DURATION (FLOAT:SEC)] [NBLOCKS]" << std::endl;;
-	std::exit(1);
-}
-
-int main(int argc, char * argv[]) {
-	size_t nblocks = 1000;
-	double duration = 5;
-
-	std::cout.imbue(std::locale(""));
-
-	switch (argc)
-	{
-	case 3:
-		nblocks = std::stoul(argv[2]);
-		[[fallthrough]];
-	case 2:
-		duration = std::stod(argv[1]);
-		if( duration <= 0.0 ) {
-			std::cerr << "Duration must be positive, was " << argv[1] << "(" << duration << ")" << std::endl;
-			usage(argv);
-		}
-		[[fallthrough]];
-	case 1:
-		break;
-	default:
-		usage(argv);
-		break;
-	}
-
-	std::vector<std::unique_ptr<block>> blocks;
-	for(size_t i = 0; i < nblocks; i++) {
-		blocks.emplace_back( create() );
-	}
-	std::random_shuffle(blocks.begin(), blocks.end());
-
-	size_t CRC = 0;
-	size_t count = 0;
-
-	using clock = std::chrono::high_resolution_clock;
-	auto before = clock::now();
-
-	while(true) {
-		for(const auto & b : blocks) {
-			CRC += find(*b);
-			count++;
-		}
-		auto now = clock::now();
-		std::chrono::duration<double> durr = now - before;
-		if( durr.count() > duration ) {
-			break;
-		}
-	}
-
-	auto after = clock::now();
-	std::chrono::duration<double> durr = after - before;
-	duration = durr.count();
-
-	using std::chrono::duration_cast;
-	using std::chrono::nanoseconds;
-
-	size_t ops_sec = size_t(double(count) / duration);
-	auto dur_nano = duration_cast<nanoseconds>(std::chrono::duration<double>(1.0)).count();
-
-	std::cout << "CRC           : " << CRC << "\n";
-	std::cout << "Duration      : " << duration << "s\n";
-	std::cout << "Total ops     : " << count << "\n";
-	std::cout << "Ops/sec       : " << ops_sec << "\n";
-	std::cout << "ns/Op         : " << ( dur_nano / ops_sec )<< "\n";
-}
Index: doc/theses/thierry_delisle_PhD/code/process.sh
===================================================================
--- doc/theses/thierry_delisle_PhD/code/process.sh	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,42 +1,0 @@
-#!/bin/bash
-
-NAME=$1
-
-if [ ! -f "raw/${NAME}.out" ]; then
-    echo "Not output for ${NAME}"
-    exit 1
-fi
-
-if [ ! -f "raw/${NAME}.data" ]; then
-    echo "Not perf record for ${NAME}"
-    exit 1
-fi
-
-echo "Processing perf data for ${NAME}"
-
-OPS=$(grep -e 'Total ops' raw/${NAME}.out)
-CPOP=$( echo "Hello $OPS" | \grep -oP ", \K[0-9,]+(?=o)" --color | tr -d ',')
-CPUSH=$(echo "Hello $OPS" | \grep -oP "\(\K[0-9,]+(?=i)" --color | tr -d ',')
-
-REPORT=''
-perf report -n --percent-limit 5 --stdio --no-children -i raw/${NAME}.data > raw/.temp
-EVENT=$(cat raw/.temp | grep -e '^# Samples'| cut -d ' ' -f 6)
-SPOP=$( cat raw/.temp | grep -e '] relaxed_list<Node>::pop'  | tr -s ' ' | cut -d ' ' -f 3)
-SPUSH=$(cat raw/.temp | grep -e '] relaxed_list<Node>::push' | tr -s ' ' | cut -d ' ' -f 3)
-SARR=$( cat raw/.temp | grep -e '] snz[i|m]_t::node::arrive_h'   | tr -s ' ' | cut -d ' ' -f 3)
-
-echo "$OPS"
-echo "Push count: $CPUSH"
-echo "Pop  count: $CPOP"
-
-echo "Pop    samples: $SPOP"
-echo "Push   samples: $SPUSH"
-echo "Arrive samples: $SARR"
-
-SpPUSH=$(bc -l <<< "scale=9; $SPUSH / $CPUSH")
-SpPOP=$( bc -l <<< "scale=9; $SPOP  / $CPOP" )
-SpARR=$( bc -l <<< "scale=9; $SARR  / $CPUSH")
-
-printf "%s per push()  : %.9f\n" $EVENT $SpPUSH | sed ':a;s/\B[0-9]\{3\}\>/,&/;ta'
-printf "%s per pop()   : %.9f\n" $EVENT $SpPOP  | sed ':a;s/\B[0-9]\{3\}\>/,&/;ta'
-printf "%s per arrive(): %.9f\n" $EVENT $SpARR  | sed ':a;s/\B[0-9]\{3\}\>/,&/;ta'
Index: doc/theses/thierry_delisle_PhD/code/processor.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/processor.hpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,53 +1,0 @@
-#include <atomic>
-
-struct thread {};
-
-struct cluster {
-	void add();
-	void remove();
-	thread * next();
-};
-
-struct processor {
-
-	cluster cluster;
-	std::atomic<bool> stop;
-	volatile bool idle;
-};
-
-
-void run(thread * ) {
-	// verify preemption
-
-	// run Thread
-
-	// verify preemption
-
-	// finish Running
-}
-
-void main(processor & self) {
-
-	self.cluster.add();
-
-	while(!self.stop) {
-		if(thread * t = self.cluster.next()) {
-			run(t);
-			continue;
-		}
-
-		self.set_idle();
-		std::atomic_thread_fence();
-
-		if(thread * t = self.cluster.next()) {
-			self.idle = false;
-			run(t);
-			continue;
-		}
-
-		halt();
-	}
-
-	self.cluster.remove();
-
-}
Index: doc/theses/thierry_delisle_PhD/code/processor_list.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/processor_list.hpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,215 +1,0 @@
-#include <cassert>
-
-#include <atomic>
-#include <new>
-#include <type_traits>
-
-struct processor;
-
-struct __attribute__((aligned(64))) processor_id {
-	std::atomic<processor *> handle;
-	std::atomic<bool> lock;
-
-	processor_id() = default;
-	processor_id(processor * proc) : handle(proc), lock() {
-		/*paranoid*/ assert(std::atomic_is_lock_free(&lock));
-	}
-};
-
-extern unsigned num();
-
-#define ERROR throw 1
-
-class processor_list {
-private:
-
-	static const constexpr std::size_t cache_line_size = 64;
-
-	static_assert(sizeof (processor_id) <= cache_line_size, "ERROR: Instances must fit in one cache line" );
-	static_assert(alignof(processor_id) == cache_line_size, "ERROR: Instances must aligned to one cache line" );
-
-	const unsigned max;     // total cachelines allocated
-	std::atomic_uint alloc; // cachelines currently in use
-	std::atomic_uint ready; // cachelines ready to iterate over (!= to alloc when thread is in second half of doregister)
-	std::atomic<bool> lock; // writerlock
-	processor_id * data;    // data pointer
-
-private:
-	inline void acquire(std::atomic<bool> & ll) {
-		while( __builtin_expect(ll.exchange(true),false) ) {
-			while(ll.load(std::memory_order_relaxed))
-				asm volatile("pause");
-		}
-		/* paranoid */ assert(ll);
-	}
-
-public:
-	processor_list()
-		: max(num())
-		, alloc(0)
-		, ready(0)
-		, lock{false}
-		, data( new processor_id[max] )
-	{
-		/*paranoid*/ assert(num() == max);
-		/*paranoid*/ assert(std::atomic_is_lock_free(&alloc));
-		/*paranoid*/ assert(std::atomic_is_lock_free(&ready));
-	}
-
-	~processor_list() {
-		delete[] data;
-	}
-
-	//=======================================================================
-	// Lock-Free registering/unregistering of threads
-	unsigned doregister(processor * proc) {
-		// Step - 1 : check if there is already space in the data
-		uint_fast32_t s = ready;
-
-		// Check among all the ready
-		for(uint_fast32_t i = 0; i < s; i++) {
-			processor * null = nullptr; // Re-write every loop since compare thrashes it
-			if( data[i].handle.load(std::memory_order_relaxed) == null
-			 && data[i].handle.compare_exchange_strong(null, proc)) {
-				/*paranoid*/ assert(i < ready);
-				/*paranoid*/ assert(alignof(decltype(data[i])) == cache_line_size);
-				/*paranoid*/ assert((uintptr_t(&data[i]) % cache_line_size) == 0);
-				return i;
-			}
-		}
-
-		if(max <= alloc) ERROR;
-
-		// Step - 2 : F&A to get a new spot in the array.
-		uint_fast32_t n = alloc++;
-		if(max <= n) ERROR;
-
-		// Step - 3 : Mark space as used and then publish it.
-		void * storage = &data[n];
-		new (storage) processor_id( proc );
-		while(true) {
-			unsigned copy = n;
-			if( ready.load(std::memory_order_relaxed) == n
-			 && ready.compare_exchange_weak(copy, n + 1) )
-			 	break;
-			asm volatile("pause");
-		}
-
-		// Return new spot.
-		/*paranoid*/ assert(n < ready);
-		/*paranoid*/ assert(alignof(decltype(data[n])) == cache_line_size);
-		/*paranoid*/ assert((uintptr_t(&data[n]) % cache_line_size) == 0);
-		return n;
-	}
-
-	processor * unregister(unsigned iproc) {
-		/*paranoid*/ assert(iproc < ready);
-		auto ret = data[iproc].handle.load(std::memory_order_relaxed);
-		data[iproc].handle = nullptr;
-		return ret;
-	}
-
-	// Reset all registration
-	// Unsafe in most cases, use for testing only.
-	void reset() {
-		alloc = 0;
-		ready = 0;
-	}
-
-	processor * get(unsigned iproc) {
-		return data[iproc].handle.load(std::memory_order_relaxed);
-	}
-
-	//=======================================================================
-	// Reader-writer lock implementation
-	// Concurrent with doregister/unregister,
-	//    i.e., threads can be added at any point during or between the entry/exit
-
-	//-----------------------------------------------------------------------
-	// Reader side
-	void read_lock(unsigned iproc) {
-		/*paranoid*/ assert(iproc < ready);
-
-		// Step 1 : make sure no writer are in the middle of the critical section
-		while(lock.load(std::memory_order_relaxed))
-			asm volatile("pause");
-
-		// Fence needed because we don't want to start trying to acquire the lock
-		// before we read a false.
-		// Not needed on x86
-		// std::atomic_thread_fence(std::memory_order_seq_cst);
-
-		// Step 2 : acquire our local lock
-		acquire( data[iproc].lock );
-		/*paranoid*/ assert(data[iproc].lock);
-	}
-
-	void read_unlock(unsigned iproc) {
-		/*paranoid*/ assert(iproc < ready);
-		/*paranoid*/ assert(data[iproc].lock);
-		data[iproc].lock.store(false, std::memory_order_release);
-	}
-
-	//-----------------------------------------------------------------------
-	// Writer side
-	uint_fast32_t write_lock() {
-		// Step 1 : lock global lock
-		// It is needed to avoid processors that register mid Critical-Section
-		//   to simply lock their own lock and enter.
-		acquire(lock);
-
-		// Step 2 : lock per-proc lock
-		// Processors that are currently being registered aren't counted
-		//   but can't be in read_lock or in the critical section.
-		// All other processors are counted
-		uint_fast32_t s = ready;
-		for(uint_fast32_t i = 0; i < s; i++) {
-			acquire( data[i].lock );
-		}
-
-		return s;
-	}
-
-	void write_unlock(uint_fast32_t last_s) {
-		// Step 1 : release local locks
-		// This must be done while the global lock is held to avoid
-		//   threads that where created mid critical section
-		//   to race to lock their local locks and have the writer
-		//   immidiately unlock them
-		// Alternative solution : return s in write_lock and pass it to write_unlock
-		for(uint_fast32_t i = 0; i < last_s; i++) {
-			assert(data[i].lock);
-			data[i].lock.store(false, std::memory_order_release);
-		}
-
-		// Step 2 : release global lock
-		/*paranoid*/ assert(true == lock);
-		lock.store(false, std::memory_order_release);
-	}
-
-	//-----------------------------------------------------------------------
-	// Checking support
-	uint_fast32_t epoch_check() {
-		// Step 1 : lock global lock
-		// It is needed to avoid processors that register mid Critical-Section
-		//   to simply lock their own lock and enter.
-		while(lock.load(std::memory_order_relaxed))
-			asm volatile("pause");
-
-		// Step 2 : lock per-proc lock
-		// Processors that are currently being registered aren't counted
-		//   but can't be in read_lock or in the critical section.
-		// All other processors are counted
-		uint_fast32_t s = ready;
-		for(uint_fast32_t i = 0; i < s; i++) {
-			while(data[i].lock.load(std::memory_order_relaxed))
-				asm volatile("pause");
-		}
-
-		return s;
-	}
-
-public:
-};
-
-#undef ERROR
Index: doc/theses/thierry_delisle_PhD/code/processor_list_fast.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/processor_list_fast.cpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,173 +1,0 @@
-#include "processor_list.hpp"
-
-#include <array>
-#include <iomanip>
-#include <iostream>
-#include <locale>
-#include <string>
-#include <thread>
-
-#include "utils.hpp"
-
-unsigned num() {
-	return 0x1000000;
-}
-
-//-------------------
-
-struct processor {
-	unsigned id;
-};
-void run(unsigned nthread, double duration, unsigned writes, unsigned epochs) {
-	assert(writes < 100);
-
-	// List being tested
-	processor_list list = {};
-
-	// Barrier for synchronization
-	barrier_t barrier(nthread + 1);
-
-	// Data to check everything is OK
-	size_t write_committed = 0ul;
-	struct {
-		std::atomic_size_t write = { 0ul };
-		std::atomic_size_t read  = { 0ul };
-		std::atomic_size_t epoch = { 0ul };
-	} lock_cnt;
-
-	// Flag to signal termination
-	std::atomic_bool done = { false };
-
-	std::thread * threads[nthread];
-	unsigned i = 1;
-	for(auto & t : threads) {
-		t = new std::thread([&done, &list, &barrier, &write_committed, &lock_cnt, writes, epochs](unsigned tid) {
-			Random rand(tid + rdtscl());
-			processor proc;
-			proc.id = list.doregister(&proc);
-			size_t writes_cnt = 0;
-			size_t reads_cnt = 0;
-			size_t epoch_cnt = 0;
-
-			affinity(tid);
-
-			barrier.wait(tid);
-
-			while(__builtin_expect(!done, true)) {
-				auto r = rand.next() % 100;
-				if (r < writes) {
-					auto n = list.write_lock();
-					write_committed++;
-					writes_cnt++;
-					assert(writes_cnt < -2ul);
-					list.write_unlock(n);
-				}
-				else if(r < epochs) {
-					list.epoch_check();
-					epoch_cnt++;
-				}
-				else {
-					list.read_lock(proc.id);
-					reads_cnt++;
-					assert(reads_cnt < -2ul);
-					list.read_unlock(proc.id);
-				}
-			}
-
-			barrier.wait(tid);
-
-			auto p = list.unregister(proc.id);
-			assert(&proc == p);
-			lock_cnt.write += writes_cnt;
-			lock_cnt.read  += reads_cnt;
-			lock_cnt.epoch += epoch_cnt;
-		}, i++);
-	}
-
-	auto before = Clock::now();
-	barrier.wait(0);
-
-	while(true) {
-		usleep(1000);
-		auto now = Clock::now();
-		duration_t durr = now - before;
-		if( durr.count() > duration ) {
-			done = true;
-			break;
-		}
-	}
-
-	barrier.wait(0);
-	auto after = Clock::now();
-	duration_t durr = after - before;
-	duration = durr.count();
-
-	for(auto t : threads) {
-		t->join();
-		delete t;
-	}
-
-	assert(write_committed == lock_cnt.write);
-
-	size_t totalop = lock_cnt.read + lock_cnt.write + lock_cnt.epoch;
-	size_t ops_sec = size_t(double(totalop) / duration);
-	size_t ops_thread = ops_sec / nthread;
-	double dur_nano = duration_cast<std::nano>(1.0);
-
-	std::cout << "Duration      : " << duration << "s\n";
-	std::cout << "Total ops     : " << totalop << "(" << lock_cnt.read << "r, " << lock_cnt.write << "w, " << lock_cnt.epoch << "e)\n";
-	std::cout << "Ops/sec       : " << ops_sec << "\n";
-	std::cout << "Ops/sec/thread: " << ops_thread << "\n";
-	std::cout << "ns/Op         : " << ( dur_nano / ops_thread )<< "\n";
-}
-
-void usage(char * argv[]) {
-	std::cerr << argv[0] << ": [DURATION (FLOAT:SEC)] [NTHREADS] [%WRITES]" << std::endl;;
-	std::exit(1);
-}
-
-int main(int argc, char * argv[]) {
-
-	double duration   = 5.0;
-	unsigned nthreads = 2;
-	unsigned writes   = 0;
-	unsigned epochs   = 0;
-
-	std::cout.imbue(std::locale(""));
-
-	switch (argc)
-	{
-	case 5:
-		epochs = std::stoul(argv[4]);
-		[[fallthrough]];
-	case 4:
-		writes = std::stoul(argv[3]);
-		if( (writes + epochs) > 100 ) {
-			std::cerr << "Writes + Epochs must be valid percentage, was " << argv[3] << " + " << argv[4] << "(" << writes << " + " << epochs << ")" << std::endl;
-			usage(argv);
-		}
-		[[fallthrough]];
-	case 3:
-		nthreads = std::stoul(argv[2]);
-		[[fallthrough]];
-	case 2:
-		duration = std::stod(argv[1]);
-		if( duration <= 0.0 ) {
-			std::cerr << "Duration must be positive, was " << argv[1] << "(" << duration << ")" << std::endl;
-			usage(argv);
-		}
-		[[fallthrough]];
-	case 1:
-		break;
-	default:
-		usage(argv);
-		break;
-	}
-
-	check_cache_line_size();
-
-	std::cout << "Running " << nthreads << " threads for " << duration << " seconds with " << writes << "% writes and " << epochs << "% epochs" << std::endl;
-	run(nthreads, duration, writes, epochs + writes);
-
-	return 0;
-}
Index: doc/theses/thierry_delisle_PhD/code/processor_list_good.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/processor_list_good.cpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,269 +1,0 @@
-#include "processor_list.hpp"
-
-#include <iostream>
-#include <string>
-#include <thread>
-
-unsigned num() {
-	return 0x1000000;
-}
-
-// Barrier from
-class barrier_t {
-public:
-	barrier_t(size_t total)
-		: waiting(0)
-		, total(total)
-	{}
-
-	void wait(unsigned) {
-		size_t target = waiting++;
-		target = (target - (target % total)) + total;
-		while(waiting < target)
-			asm volatile("pause");
-
-		assert(waiting < (1ul << 60));
-    	}
-
-private:
-	std::atomic<size_t> waiting;
-	size_t total;
-};
-
-class Random {
-private:
-	unsigned int seed;
-public:
-	Random(int seed) {
-		this->seed = seed;
-	}
-
-	/** returns pseudorandom x satisfying 0 <= x < n. **/
-	unsigned int next() {
-		seed ^= seed << 6;
-		seed ^= seed >> 21;
-		seed ^= seed << 7;
-		return seed;
-    	}
-};
-
-//-------------------
-
-struct processor {
-	unsigned id;
-};
-
-// Stage 1
-// Make sure that the early registration works correctly
-// Registration uses a different process if the act of
-// registering the processor makes it the highest processor count
-// seen yet.
-void stage1(unsigned nthread, unsigned repeats) {
-	const int n = repeats;
-	const int nproc = 10;
-
-	// List being tested
-	processor_list list;
-
-	// Barrier for synchronization
-	barrier_t barrier(nthread + 1);
-
-	// Seen values to detect duplicattion
-	std::atomic<processor *> ids[nthread * nproc];
-	for(auto & i : ids) {
-		i = nullptr;
-	}
-
-	// Can't pass VLA to lambda
-	std::atomic<processor *> * idsp = ids;
-
-	// Threads which will run the code
-	std::thread * threads[nthread];
-	unsigned i = 1;
-	for(auto & t : threads) {
-		// Each thread will try to register a processor then add it to the
-		// list of registerd processor
-		t = new std::thread([&list, &barrier, idsp, n](unsigned tid){
-			processor proc[nproc];
-			for(int i = 0; i < n; i++) {
-				for(auto & p : proc) {
-					// Register the thread
-					p.id = list.doregister(&p);
-				}
-
-				for(auto & p : proc) {
-					// Make sure no one got this id before
-					processor * prev = idsp[p.id].exchange(&p);
-					assert(nullptr == prev);
-
-					// Make sure id is still consistend
-					assert(&p == list.get(p.id));
-				}
-
-				// wait for round to finish
-				barrier.wait(tid);
-
-				// wait for reset
-				barrier.wait(tid);
-			}
-		}, i++);
-	}
-
-	for(int i = 0; i < n; i++) {
-		//Wait for round to finish
-		barrier.wait(0);
-
-		// Reset list
-		list.reset();
-
-		std::cout << i << "\r";
-
-		// Reset seen values
-		for(auto & i : ids) {
-			i = nullptr;
-		}
-
-		// Start next round
-		barrier.wait(0);
-	}
-
-	for(auto t : threads) {
-		t->join();
-		delete t;
-	}
-}
-
-// Stage 2
-// Check that once churning starts, registration is still consistent.
-void stage2(unsigned nthread, unsigned repeats) {
-	// List being tested
-	processor_list list;
-
-	// Threads which will run the code
-	std::thread * threads[nthread];
-	unsigned i = 1;
-	for(auto & t : threads) {
-		// Each thread will try to register a few processors and
-		// unregister them, making sure that the registration is
-		// consistent
-		t = new std::thread([&list, repeats](unsigned tid){
-			processor procs[10];
-			for(unsigned i = 0; i < repeats; i++) {
-				// register the procs and note the id
-				for(auto & p : procs) {
-					p.id = list.doregister(&p);
-				}
-
-				if(1 == tid) std::cout << i << "\r";
-
-				// check the id is still consistent
-				for(const auto & p : procs) {
-					assert(&p == list.get(p.id));
-				}
-
-				// unregister and check the id is consistent
-				for(const auto & p : procs) {
-					assert(&p == list.unregister(p.id));
-				}
-			}
-		}, i++);
-	}
-
-	for(auto t : threads) {
-		t->join();
-		delete t;
-	}
-}
-
-bool is_writer();
-
-// Stage 3
-// Check that the reader writer lock works.
-void stage3(unsigned nthread, unsigned repeats) {
-	// List being tested
-	processor_list list;
-
-	size_t before = 0;
-
-	std::unique_ptr<size_t> after( new size_t(0) );
-
-	std::atomic<bool> done ( false );
-
-	// Threads which will run the code
-	std::thread * threads[nthread];
-	unsigned i = 1;
-	for(auto & t : threads) {
-		// Each thread will try to register a few processors and
-		// unregister them, making sure that the registration is
-		// consistent
-		t = new std::thread([&list, repeats, &before, &after, &done](unsigned tid){
-			Random rng(tid);
-			processor proc;
-			proc.id = list.doregister(&proc);
-			while(!done) {
-
-				if( (rng.next() % 100) == 0 ) {
-					auto r = list.write_lock();
-
-					auto b = before++;
-
-					std::cout << b << "\r";
-
-					(*after)++;
-
-					if(b >= repeats) done = true;
-
-					list.write_unlock(r);
-				}
-				else {
-					list.read_lock(proc.id);
-					assert(before == *after);
-					list.read_unlock(proc.id);
-				}
-
-			}
-
-			list.unregister(proc.id);
-		}, i++);
-	}
-
-	for(auto t : threads) {
-		t->join();
-		delete t;
-	}
-}
-
-int main(int argc, char * argv[]) {
-
-	unsigned nthreads = 1;
-	if( argc >= 3 ) {
-		size_t idx;
-		nthreads = std::stoul(argv[2], &idx);
-		assert('\0' == argv[2][idx]);
-	}
-
-	unsigned repeats = 100;
-	if( argc >= 2 ) {
-		size_t idx;
-		repeats = std::stoul(argv[1], &idx);
-		assert('\0' == argv[1][idx]);
-	}
-
-	processor_list::check_cache_line_size();
-
-	std::cout << "Running " << repeats << " repetitions on " << nthreads << " threads" << std::endl;
-	std::cout << "Checking registration - early" << std::endl;
-	stage1(nthreads, repeats);
-	std::cout << "Done                         " << std::endl;
-
-	std::cout << "Checking registration - churn" << std::endl;
-	stage2(nthreads, repeats);
-	std::cout << "Done                         " << std::endl;
-
-	std::cout << "Checking RW lock             " << std::endl;
-	stage3(nthreads, repeats);
-	std::cout << "Done                         " << std::endl;
-
-
-	return 0;
-}
Index: doc/theses/thierry_delisle_PhD/code/randbit.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/randbit.cpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,236 +1,0 @@
-#include <cstddef>
-#include <cstdint>
-#include <x86intrin.h>
-
-__attribute__((noinline)) unsigned nthSetBit(size_t mask, unsigned bit) {
-	uint64_t v = mask;   // Input value to find position with rank r.
-	unsigned int r = bit;// Input: bit's desired rank [1-64].
-	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
-	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
-	unsigned int t;      // Bit count temporary.
-
-	// Do a normal parallel bit count for a 64-bit integer,
-	// but store all intermediate steps.
-	// a = (v & 0x5555...) + ((v >> 1) & 0x5555...);
-	a =  v - ((v >> 1) & ~0UL/3);
-	// b = (a & 0x3333...) + ((a >> 2) & 0x3333...);
-	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
-	// c = (b & 0x0f0f...) + ((b >> 4) & 0x0f0f...);
-	c = (b + (b >> 4)) & ~0UL/0x11;
-	// d = (c & 0x00ff...) + ((c >> 8) & 0x00ff...);
-	d = (c + (c >> 8)) & ~0UL/0x101;
-
-
-	t = (d >> 32) + (d >> 48);
-	// Now do branchless select!
-	s  = 64;
-	// if (r > t) {s -= 32; r -= t;}
-	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
-	t  = (d >> (s - 16)) & 0xff;
-	// if (r > t) {s -= 16; r -= t;}
-	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
-	t  = (c >> (s - 8)) & 0xf;
-	// if (r > t) {s -= 8; r -= t;}
-	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
-	t  = (b >> (s - 4)) & 0x7;
-	// if (r > t) {s -= 4; r -= t;}
-	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
-	t  = (a >> (s - 2)) & 0x3;
-	// if (r > t) {s -= 2; r -= t;}
-	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
-	t  = (v >> (s - 1)) & 0x1;
-	// if (r > t) s--;
-	s -= ((t - r) & 256) >> 8;
-	// s = 65 - s;
-	return s;
-}
-
-unsigned rand_bit(unsigned rnum, uint64_t mask) {
-	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
-#if defined(BRANCHLESS)
-	uint64_t v = mask;   // Input value to find position with rank r.
-	unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
-	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
-	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
-	unsigned int t;      // Bit count temporary.
-
-	// Do a normal parallel bit count for a 64-bit integer,
-	// but store all intermediate steps.
-	// a = (v & 0x5555...) + ((v >> 1) & 0x5555...);
-	a =  v - ((v >> 1) & ~0UL/3);
-	// b = (a & 0x3333...) + ((a >> 2) & 0x3333...);
-	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
-	// c = (b & 0x0f0f...) + ((b >> 4) & 0x0f0f...);
-	c = (b + (b >> 4)) & ~0UL/0x11;
-	// d = (c & 0x00ff...) + ((c >> 8) & 0x00ff...);
-	d = (c + (c >> 8)) & ~0UL/0x101;
-
-
-	t = (d >> 32) + (d >> 48);
-	// Now do branchless select!
-	s  = 64;
-	// if (r > t) {s -= 32; r -= t;}
-	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
-	t  = (d >> (s - 16)) & 0xff;
-	// if (r > t) {s -= 16; r -= t;}
-	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
-	t  = (c >> (s - 8)) & 0xf;
-	// if (r > t) {s -= 8; r -= t;}
-	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
-	t  = (b >> (s - 4)) & 0x7;
-	// if (r > t) {s -= 4; r -= t;}
-	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
-	t  = (a >> (s - 2)) & 0x3;
-	// if (r > t) {s -= 2; r -= t;}
-	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
-	t  = (v >> (s - 1)) & 0x1;
-	// if (r > t) s--;
-	s -= ((t - r) & 256) >> 8;
-	// s = 65 - s;
-	return s - 1;
-#elif defined(LOOP)
-	for(unsigned i = 0; i < bit; i++) {
-		mask ^= (1ul << (__builtin_ffsl(mask) - 1ul));
-	}
-	return __builtin_ffsl(mask) - 1ul;
-#elif defined(PDEP)
-	uint64_t picked = _pdep_u64(1ul << bit, mask);
-	return __builtin_ffsl(picked) - 1ul;
-#else
-#error must define LOOP, PDEP or BRANCHLESS
-#endif
-}
-
-#include <cassert>
-#include <atomic>
-#include <chrono>
-#include <iomanip>
-#include <iostream>
-#include <locale>
-#include <thread>
-
-#include <unistd.h>
-
-class barrier_t {
-public:
-	barrier_t(size_t total)
-		: waiting(0)
-		, total(total)
-	{}
-
-	void wait(unsigned) {
-		size_t target = waiting++;
-		target = (target - (target % total)) + total;
-		while(waiting < target)
-			asm volatile("pause");
-
-		assert(waiting < (1ul << 60));
-    	}
-
-private:
-	std::atomic<size_t> waiting;
-	size_t total;
-};
-
-class Random {
-private:
-	unsigned int seed;
-public:
-	Random(int seed) {
-		this->seed = seed;
-	}
-
-	/** returns pseudorandom x satisfying 0 <= x < n. **/
-	unsigned int next() {
-		seed ^= seed << 6;
-		seed ^= seed >> 21;
-		seed ^= seed << 7;
-		return seed;
-    	}
-};
-
-using Clock = std::chrono::high_resolution_clock;
-using duration_t = std::chrono::duration<double>;
-using std::chrono::nanoseconds;
-
-template<typename Ratio, typename T>
-T duration_cast(T seconds) {
-	return std::chrono::duration_cast<std::chrono::duration<T, Ratio>>(std::chrono::duration<T>(seconds)).count();
-}
-
-void waitfor(double & duration, barrier_t & barrier, std::atomic_bool & done) {
-
-
-	std::cout << "Starting" << std::endl;
-	auto before = Clock::now();
-	barrier.wait(0);
-
-	while(true) {
-		usleep(100000);
-		auto now = Clock::now();
-		duration_t durr = now - before;
-		if( durr.count() > duration ) {
-			done = true;
-			break;
-		}
-		std::cout << "\r" << std::setprecision(4) << durr.count();
-		std::cout.flush();
-	}
-
-	barrier.wait(0);
-	auto after = Clock::now();
-	duration_t durr = after - before;
-	duration = durr.count();
-	std::cout << "\rClosing down" << std::endl;
-}
-
-__attribute__((noinline)) void body(Random & rand) {
-	uint64_t mask = (uint64_t(rand.next()) << 32ul) | uint64_t(rand.next());
-	unsigned idx = rand.next();
-
-	unsigned bit = rand_bit(idx, mask);
-
-	if(__builtin_expect(((1ul << bit) & mask) == 0, false)) {
-		std::cerr << std::hex <<  "Rand " << idx << " from " << mask;
-		std::cerr << " gave " << (1ul << bit) << "(" << std::dec << bit << ")" << std::endl;
-		std::abort();
-	}
-}
-
-void runRandBit(double duration) {
-
-	std::atomic_bool done  = { false };
-	barrier_t barrier(2);
-
-	size_t count = 0;
-	std::thread thread([&done, &barrier, &count]() {
-
-		Random rand(22);
-
-		barrier.wait(1);
-
-		for(;!done; count++) {
-			body(rand);
-		}
-
-		barrier.wait(1);
-	});
-
-	waitfor(duration, barrier, done);
-	thread.join();
-
-	size_t ops = count;
-	size_t ops_sec = size_t(double(ops) / duration);
-	auto dur_nano = duration_cast<std::nano>(1.0);
-
-	std::cout << "Duration      : " << duration << "s\n";
-	std::cout << "ns/Op         : " << ( dur_nano / ops )<< "\n";
-	std::cout << "Ops/sec       : " << ops_sec << "\n";
-	std::cout << "Total ops     : " << ops << std::endl;
-
-}
-
-int main() {
-	std::cout.imbue(std::locale(""));
-	runRandBit(5);
-}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/Makefile
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/Makefile	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/Makefile	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,22 @@
+
+
+CXXFLAGS = -O3 -g -Wall -Wextra -std=c++17
+LDFLAGS = -pthread -latomic
+
+push:
+	clang++ relaxed_list.cpp -g -Wall -Wextra -std=c++17 -fsyntax-only &&  rsync -av relaxed_list.cpp relaxed_list.hpp utils.hpp assert.hpp scale.sh plg7b:~/workspace/sched/.
+
+relaxed_list: $(firstword $(MAKEFILE_LIST)) | build
+	clang++ relaxed_list.cpp $(CXXFLAGS) $(LDFLAGS) -lpng -MMD -MF build/$(@).d -o $(@)
+
+-include build/relaxed_list.d
+
+layout.ast: $(firstword $(MAKEFILE_LIST)) | build
+	clang++ relaxed_list_layout.cpp $(CXXFLAGS) -MMD -MF build/$(@).d -MT $(@) -E -o build/$(@).ii
+	clang++ -Xclang -fdump-record-layouts -fsyntax-only $(CXXFLAGS) build/$(@).ii > build/layout.ast.raw
+	cat build/$(@).raw > $(@)
+
+-include build/layout.ast.d
+
+build:
+	mkdir -p build
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/assert.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/assert.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/assert.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,22 @@
+#pragma once
+
+#ifndef NDEBUG
+#include <cassert>
+#include <cstdlib>
+
+#define sstr(s) #s
+#define xstr(s) sstr(s)
+
+extern const char * __my_progname;
+
+#define assertf(cond, ...) ({             \
+	if(!(cond)) {                       \
+		fprintf(stderr, "%s: " __FILE__ ":" xstr(__LINE__) ": %s: Assertion '" xstr(cond) "' failed.\n", __my_progname, __PRETTY_FUNCTION__); \
+		fprintf(stderr, __VA_ARGS__); \
+		fprintf(stderr, "\n"); \
+		std::abort();                 \
+	}                                   \
+})
+#else
+#define assertf(cond, ...)
+#endif
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/bitbench/select.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/bitbench/select.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/bitbench/select.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,186 @@
+
+#include "../utils.hpp"
+
+void consume(int i, int j) __attribute__((noinline));
+void consume(int i, int j) {
+	asm volatile("":: "rm" (i), "rm" (j) );
+}
+
+static inline unsigned rand_bit_sw(unsigned rnum, size_t mask) {
+	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
+	uint64_t v = mask;   // Input value to find position with rank r.
+	unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
+	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
+	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
+	unsigned int t;      // Bit count temporary.
+
+	// Do a normal parallel bit count for a 64-bit integer,
+	// but store all intermediate steps.
+	a =  v - ((v >> 1) & ~0UL/3);
+	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
+	c = (b + (b >> 4)) & ~0UL/0x11;
+	d = (c + (c >> 8)) & ~0UL/0x101;
+
+
+	t = (d >> 32) + (d >> 48);
+	// Now do branchless select!
+	s  = 64;
+	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
+	t  = (d >> (s - 16)) & 0xff;
+	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
+	t  = (c >> (s - 8)) & 0xf;
+	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
+	t  = (b >> (s - 4)) & 0x7;
+	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
+	t  = (a >> (s - 2)) & 0x3;
+	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
+	t  = (v >> (s - 1)) & 0x1;
+	s -= ((t - r) & 256) >> 8;
+	return s - 1;
+}
+
+static inline unsigned rand_bit_hw(unsigned rnum, size_t mask) {
+	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
+	uint64_t picked = _pdep_u64(1ul << bit, mask);
+	return picked ? __builtin_ctzl(picked) : 0;
+}
+
+struct TLS {
+	Random rng = { 6 };
+} tls;
+
+const unsigned numLists = 64;
+
+static inline void blind() {
+	int i = tls.rng.next() % numLists;
+	int j = tls.rng.next() % numLists;
+
+	consume(i, j);
+}
+
+std::atomic_size_t list_mask[7];
+static inline void bitmask_sw() {
+	unsigned i, j;
+	{
+		// Pick two lists at random
+		unsigned num = ((numLists - 1) >> 6) + 1;
+
+		unsigned ri = tls.rng.next();
+		unsigned rj = tls.rng.next();
+
+		unsigned wdxi = (ri >> 6u) % num;
+		unsigned wdxj = (rj >> 6u) % num;
+
+		size_t maski = list_mask[wdxi].load(std::memory_order_relaxed);
+		size_t maskj = list_mask[wdxj].load(std::memory_order_relaxed);
+
+		unsigned bi = rand_bit_sw(ri, maski);
+		unsigned bj = rand_bit_sw(rj, maskj);
+
+		i = bi | (wdxi << 6);
+		j = bj | (wdxj << 6);
+	}
+
+	consume(i, j);
+}
+
+static inline void bitmask_hw() {
+	#if !defined(__BMI2__)
+		#warning NO bmi2 for pdep rand_bit
+		return;
+	#endif
+	unsigned i, j;
+	{
+		// Pick two lists at random
+		unsigned num = ((numLists - 1) >> 6) + 1;
+
+		unsigned ri = tls.rng.next();
+		unsigned rj = tls.rng.next();
+
+		unsigned wdxi = (ri >> 6u) % num;
+		unsigned wdxj = (rj >> 6u) % num;
+
+		size_t maski = list_mask[wdxi].load(std::memory_order_relaxed);
+		size_t maskj = list_mask[wdxj].load(std::memory_order_relaxed);
+
+		unsigned bi = rand_bit_hw(ri, maski);
+		unsigned bj = rand_bit_hw(rj, maskj);
+
+		i = bi | (wdxi << 6);
+		j = bj | (wdxj << 6);
+	}
+
+	consume(i, j);
+}
+
+struct {
+	const unsigned mask = 7;
+	const unsigned depth = 3;
+	const uint64_t indexes = 0x0706050403020100;
+	uint64_t masks( unsigned node ) {
+		return 0xff00ffff00ff;
+	}
+} snzm;
+static inline void sparsemask() {
+	#if !defined(__BMI2__)
+		#warning NO bmi2 for sparse mask
+		return;
+	#endif
+	unsigned i, j;
+	{
+		// Pick two random numbers
+		unsigned ri = tls.rng.next();
+		unsigned rj = tls.rng.next();
+
+		// Pick two nodes from it
+		unsigned wdxi = ri & snzm.mask;
+		unsigned wdxj = rj & snzm.mask;
+
+		// Get the masks from the nodes
+		size_t maski = snzm.masks(wdxi);
+		size_t maskj = snzm.masks(wdxj);
+
+		uint64_t idxsi = _pext_u64(snzm.indexes, maski);
+		uint64_t idxsj = _pext_u64(snzm.indexes, maskj);
+
+		auto pi = __builtin_popcountll(maski);
+		auto pj = __builtin_popcountll(maskj);
+
+		ri = pi ? ri & ((pi >> 3) - 1) : 0;
+		rj = pj ? rj & ((pj >> 3) - 1) : 0;
+
+		unsigned bi = (idxsi >> (ri << 3)) & 0xff;
+		unsigned bj = (idxsj >> (rj << 3)) & 0xff;
+
+		i = (bi << snzm.depth) | wdxi;
+		j = (bj << snzm.depth) | wdxj;
+	}
+
+	consume(i, j);
+}
+
+template<typename T>
+void benchmark( T func, const std::string & name ) {
+	std::cout << "Starting " << name << std::endl;
+	auto before = Clock::now();
+	const int N = 250'000'000;
+	for(int i = 0; i < N; i++) {
+		func();
+	}
+	auto after = Clock::now();
+	duration_t durr = after - before;
+	double duration = durr.count();
+	std::cout << "Duration(s) : " << duration << std::endl;
+	std::cout << "Ops/sec     : " << uint64_t(N / duration) << std::endl;
+	std::cout << "ns/Op       : " << double(duration * 1'000'000'000.0 / N) << std::endl;
+	std::cout << std::endl;
+}
+
+int main() {
+	std::cout.imbue(std::locale(""));
+
+	benchmark(blind, "Blind guess");
+	benchmark(bitmask_sw, "Dense bitmask");
+	benchmark(bitmask_hw, "Dense bitmask with Parallel Deposit");
+	benchmark(sparsemask, "Parallel Extract bitmask");
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/bts.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/bts.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/bts.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,279 @@
+#include <array>
+#include <iomanip>
+#include <iostream>
+#include <locale>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include <getopt.h>
+#include <unistd.h>
+#include <sys/sysinfo.h>
+
+#include "utils.hpp"
+
+// ================================================================================================
+//                        UTILS
+// ================================================================================================
+
+struct local_stat_t {
+	size_t cnt = 0;
+};
+
+struct global_stat_t {
+	std::atomic_size_t cnt = { 0 };
+};
+
+void atomic_max(std::atomic_size_t & target, size_t value) {
+	for(;;) {
+		size_t expect = target.load(std::memory_order_relaxed);
+		if(value <= expect) return;
+		bool success = target.compare_exchange_strong(expect, value);
+		if(success) return;
+	}
+}
+
+void atomic_min(std::atomic_size_t & target, size_t value) {
+	for(;;) {
+		size_t expect = target.load(std::memory_order_relaxed);
+		if(value >= expect) return;
+		bool success = target.compare_exchange_strong(expect, value);
+		if(success) return;
+	}
+}
+
+void tally_stats(global_stat_t & global, local_stat_t & local) {
+	global.cnt   += local.cnt;
+}
+
+void waitfor(double & duration, barrier_t & barrier, std::atomic_bool & done) {
+	std::cout << "Starting" << std::endl;
+	auto before = Clock::now();
+	barrier.wait(0);
+
+	while(true) {
+		usleep(100000);
+		auto now = Clock::now();
+		duration_t durr = now - before;
+		if( durr.count() > duration ) {
+			done = true;
+			break;
+		}
+		std::cout << "\r" << std::setprecision(4) << durr.count();
+		std::cout.flush();
+	}
+
+	barrier.wait(0);
+	auto after = Clock::now();
+	duration_t durr = after - before;
+	duration = durr.count();
+	std::cout << "\rClosing down" << std::endl;
+}
+
+void waitfor(double & duration, barrier_t & barrier, const std::atomic_size_t & count) {
+	std::cout << "Starting" << std::endl;
+	auto before = Clock::now();
+	barrier.wait(0);
+
+	while(true) {
+		usleep(100000);
+		size_t c = count.load();
+		if( c == 0 ) {
+			break;
+		}
+		std::cout << "\r" << c;
+		std::cout.flush();
+	}
+
+	barrier.wait(0);
+	auto after = Clock::now();
+	duration_t durr = after - before;
+	duration = durr.count();
+	std::cout << "\rClosing down" << std::endl;
+}
+
+void print_stats(double duration, unsigned nthread, global_stat_t & global) {
+	std::cout << "Done" << std::endl;
+
+	size_t ops = global.cnt;
+	size_t ops_sec = size_t(double(ops) / duration);
+	size_t ops_thread = ops_sec / nthread;
+	auto dur_nano = duration_cast<std::nano>(1.0);
+
+	std::cout << "Duration      : " << duration << "s\n";
+	std::cout << "ns/Op         : " << ( dur_nano / ops_thread )<< "\n";
+	std::cout << "Ops/sec/thread: " << ops_thread << "\n";
+	std::cout << "Ops/sec       : " << ops_sec << "\n";
+	std::cout << "Total ops     : " << ops << "\n";
+}
+
+static inline bool bts(std::atomic_size_t & target, size_t bit ) {
+	/*
+	int result = 0;
+	asm volatile(
+		"LOCK btsq %[bit], %[target]\n\t"
+		:"=@ccc" (result)
+		: [target] "m" (target), [bit] "r" (bit)
+	);
+ 	return result != 0;
+	/*/
+	size_t mask = 1ul << bit;
+	size_t ret = target.fetch_or(mask, std::memory_order_relaxed);
+	return (ret & mask) != 0;
+	//*/
+}
+
+static inline bool btr(std::atomic_size_t & target, size_t bit ) {
+	/*
+	int result = 0;
+	asm volatile(
+		"LOCK btrq %[bit], %[target]\n\t"
+		:"=@ccc" (result)
+		: [target] "m" (target), [bit] "r" (bit)
+	);
+ 	return result != 0;
+	/*/
+	size_t mask = 1ul << bit;
+	size_t ret = target.fetch_and(~mask, std::memory_order_relaxed);
+	return (ret & mask) != 0;
+	//*/
+}
+
+// ================================================================================================
+//                        EXPERIMENTS
+// ================================================================================================
+
+// ================================================================================================
+__attribute__((noinline)) void runPingPong_body(
+	std::atomic<bool>& done,
+	local_stat_t & local,
+	std::atomic_size_t & target,
+	size_t id
+) {
+	while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) {
+
+		bool ret;
+		ret = bts(target, id);
+		assert(!ret);
+
+		// -----
+
+		ret = btr(target, id);
+		assert(ret);
+		local.cnt++;
+	}
+}
+
+void run(unsigned nthread, double duration) {
+	// Barrier for synchronization
+	barrier_t barrier(nthread + 1);
+
+	// Data to check everything is OK
+	global_stat_t global;
+
+	// Flag to signal termination
+	std::atomic_bool done  = { false };
+
+	std::cout << "Initializing ";
+	// List being tested
+	std::atomic_size_t word = { 0 };
+	{
+		std::thread * threads[nthread];
+		unsigned i = 1;
+		for(auto & t : threads) {
+			t = new std::thread([&done, &word, &barrier, &global](unsigned tid) {
+				local_stat_t local;
+
+				// affinity(tid);
+
+				barrier.wait(tid);
+
+				// EXPERIMENT START
+
+				runPingPong_body(done, local, word, tid - 1);
+
+				// EXPERIMENT END
+
+				barrier.wait(tid);
+
+				tally_stats(global, local);
+			}, i++);
+		}
+
+		waitfor(duration, barrier, done);
+
+		for(auto t : threads) {
+			t->join();
+			delete t;
+		}
+	}
+
+	print_stats(duration, nthread, global);
+}
+
+// ================================================================================================
+
+int main(int argc, char * argv[]) {
+
+	double duration   = 5.0;
+	unsigned nthreads = 2;
+
+	std::cout.imbue(std::locale(""));
+
+	for(;;) {
+		static struct option options[] = {
+			{"duration",  required_argument, 0, 'd'},
+			{"nthreads",  required_argument, 0, 't'},
+			{0, 0, 0, 0}
+		};
+
+		int idx = 0;
+		int opt = getopt_long(argc, argv, "d:t:", options, &idx);
+
+		std::string arg = optarg ? optarg : "";
+		size_t len = 0;
+		switch(opt) {
+			case -1:
+				if(optind != argc) {
+					std::cerr << "Too many arguments " << argc << " " << idx << std::endl;
+					goto usage;
+				}
+				goto run;
+			// Numeric Arguments
+			case 'd':
+				try {
+					duration = std::stod(optarg, &len);
+					if(len != arg.size()) { throw std::invalid_argument(""); }
+				} catch(std::invalid_argument &) {
+					std::cerr << "Duration must be a valid double, was " << arg << std::endl;
+					goto usage;
+				}
+				break;
+			case 't':
+				try {
+					nthreads = std::stoul(optarg, &len);
+					if(len != arg.size() || nthreads > (8 * sizeof(size_t))) { throw std::invalid_argument(""); }
+				} catch(std::invalid_argument &) {
+					std::cerr << "Number of threads must be a positive integer less than or equal to " << sizeof(size_t) * 8 << ", was " << arg << std::endl;
+					goto usage;
+				}
+				break;
+			// Other cases
+			default: /* ? */
+				std::cerr << opt << std::endl;
+			usage:
+				std::cerr << "Usage: " << argv[0] << ": [options]" << std::endl;
+				std::cerr << std::endl;
+				std::cerr << "  -d, --duration=DURATION  Duration of the experiment, in seconds" << std::endl;
+				std::cerr << "  -t, --nthreads=NTHREADS  Number of kernel threads" << std::endl;
+				std::exit(1);
+		}
+	}
+	run:
+
+	check_cache_line_size();
+
+	std::cout << "Running " << nthreads << " threads for " << duration << " seconds" << std::endl;
+	run(nthreads, duration);
+	return 0;
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/bts_test.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/bts_test.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/bts_test.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,32 @@
+#include <cassert>
+#include <iostream>
+
// Atomically set bit `bit` of `target` (x86-64 LOCK BTS).
// Returns the PREVIOUS value of that bit (the carry flag after BTS).
// BUG FIX: the original used "=c"(result), which returns whatever happens
// to be in %cl rather than the carry flag, and declared `target` as an
// input-only operand even though BTS writes it.
bool bts(volatile size_t & target, size_t bit ) {
	bool result;
	asm volatile(
		"LOCK btsq %[bit], %[target]\n\t"
		"setc %[result]\n\t"
		: [result] "=r" (result), [target] "+m" (target)
		: [bit] "r" (bit)
		: "cc"
	);
	return result;
}
+
// Atomically clear bit `bit` of `target` (x86-64 LOCK BTR).
// Returns the PREVIOUS value of that bit (the carry flag after BTR).
// BUG FIX: the original used "=c"(result), which returns whatever happens
// to be in %cl rather than the carry flag, and declared `target` as an
// input-only operand even though BTR writes it.
bool btr(volatile size_t & target, size_t bit ) {
	bool result;
	asm volatile(
		"LOCK btrq %[bit], %[target]\n\t"
		"setc %[result]\n\t"
		: [result] "=r" (result), [target] "+m" (target)
		: [bit] "r" (bit)
		: "cc"
	);
	return result;
}
+
+int main() {
+	volatile size_t i = 0;
+	std::cout << std::hex << i << std::endl;
+	assert(bts(i, 31));
+	std::cout << std::hex << i << std::endl;
+	assert(btr(i, 31));
+	std::cout << std::hex << i << std::endl;
+	return 0;
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/links.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/links.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/links.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,122 @@
+#pragma once
+
+#include "assert.hpp"
+#include "utils.hpp"
+
// Intrusive list hooks that a node type embeds as a member named `_links`
// (intrusive_queue_t locates them via offsetof(node_t, _links)).
template<typename node_t>
struct _LinksFields_t {
	node_t * prev = nullptr;                // previous node (towards the head sentinel)
	node_t * next = nullptr;                // next node (towards the tail sentinel)
	volatile unsigned long long ts = 0;     // timestamp; 0 doubles as an "unset/empty" sentinel value
	unsigned hint = (unsigned)-1;           // presumably a queue-index hint for the scheduler — TODO confirm with callers
};
+
// Intrusive doubly-linked FIFO guarded by an external spinlock, padded and
// aligned to exactly 128 bytes (two cache lines: lock + sentinels).
// The two sentinels store only link fields; head()/tail() synthesize fake
// node_t pointers to them so push/pop need no nullptr special cases.
template<typename node_t>
class __attribute__((aligned(128))) intrusive_queue_t {
public:
	typedef spinlock_t lock_t;

	// Per-queue statistics snapshot.
	struct stat {
		ssize_t diff = 0;
		size_t  push = 0;
		size_t  pop  = 0;
	};

private:
	// A sentinel carries only the link fields of a node.
	struct sentinel_t {
		_LinksFields_t<node_t> _links;
	};

public:
	lock_t lock;  // must be held by callers of push()/pop()

private:
	sentinel_t before;  // head sentinel; its ts caches the head element's timestamp (0 == empty)
	sentinel_t after;   // tail sentinel

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Winvalid-offsetof"
	// Offset of the _links member inside node_t, used to address the
	// sentinels as if they were whole nodes. offsetof on non-standard-layout
	// types is conditionally-supported, hence the diagnostic suppression.
	static constexpr auto fields_offset = offsetof( node_t, _links );
#pragma GCC diagnostic pop
public:
	intrusive_queue_t()
		: before{{ nullptr, tail() }}
		, after {{ head(), nullptr }}
	{
		/* paranoid */ assert((reinterpret_cast<uintptr_t>( head() ) + fields_offset) == reinterpret_cast<uintptr_t>(&before));
		/* paranoid */ assert((reinterpret_cast<uintptr_t>( tail() ) + fields_offset) == reinterpret_cast<uintptr_t>(&after ));
		/* paranoid */ assert(head()->_links.prev == nullptr);
		/* paranoid */ assert(head()->_links.next == tail() );
		/* paranoid */ assert(tail()->_links.next == nullptr);
		/* paranoid */ assert(tail()->_links.prev == head() );
		/* paranoid */ assert(sizeof(*this) == 128);
		/* paranoid */ assert((intptr_t(this) % 128) == 0);
	}

	~intrusive_queue_t() = default;

	// Fake node_t* for the head sentinel: only its _links member may be
	// accessed through this pointer.
	inline node_t * head() const {
		node_t * rhead = reinterpret_cast<node_t *>(
			reinterpret_cast<uintptr_t>( &before ) - fields_offset
		);
		assert(rhead);
		return rhead;
	}

	// Fake node_t* for the tail sentinel (same caveat as head()).
	inline node_t * tail() const {
		node_t * rtail = reinterpret_cast<node_t *>(
			reinterpret_cast<uintptr_t>( &after ) - fields_offset
		);
		assert(rtail);
		return rtail;
	}

	// Append `node` just before the tail sentinel. Requires the lock to be
	// held and node->_links.ts to be non-zero. Returns true iff the queue
	// was empty (the node became the new head).
	inline bool push(node_t * node) {
		assert(lock);
		assert(node->_links.ts != 0);
		node_t * tail = this->tail();

		node_t * prev = tail->_links.prev;
		// assertf(node->_links.ts >= prev->_links.ts,
		// 	"New node has smaller timestamp: %llu < %llu", node->_links.ts, prev->_links.ts);
		node->_links.next = tail;
		node->_links.prev = prev;
		prev->_links.next = node;
		tail->_links.prev = node;

		// Transition empty -> non-empty: cache the head timestamp.
		if(before._links.ts == 0l) {
			before._links.ts = node->_links.ts;
			assert(node->_links.prev == this->head());
			return true;
		}
		return false;
	}

	// Remove and return the node right after the head sentinel. Requires
	// the lock to be held. Returns {node, true} if the queue became empty,
	// {node, false} otherwise, and {nullptr, false} if it was already empty.
	inline std::pair<node_t *, bool> pop() {
		assert(lock);
		node_t * head = this->head();
		node_t * tail = this->tail();

		node_t * node = head->_links.next;
		node_t * next = node->_links.next;
		if(node == tail) return {nullptr, false};

		head->_links.next = next;
		next->_links.prev = head;

		if(next == tail) {
			// Queue is now empty; clear the cached head timestamp.
			before._links.ts = 0l;
			return {node, true};
		}
		else {
			// Cache the new head element's timestamp.
			assert(next->_links.ts != 0);
			before._links.ts = next->_links.ts;
			assert(before._links.ts != 0);
			return {node, false};
		}
	}

	// Timestamp of the head element, or 0 if the queue is empty.
	// NOTE(review): appears intended to be readable without the lock (ts is
	// volatile) — confirm the intended memory-ordering guarantees.
	long long ts() const {
		return before._links.ts;
	}
};
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/prefetch.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/prefetch.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/prefetch.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,106 @@
#include <algorithm>
#include <array>
#include <chrono>
#include <iostream>
#include <locale>
#include <memory>
#include <random>
#include <string>
#include <vector>

#include <cassert>
+
// One element, aligned to 64 bytes so each element occupies its own cache
// line; scanning a block therefore touches a new line at every step.
struct __attribute__((aligned(64))) element {
	size_t value;
};

// A block is a fixed array of 100 such cache-line-sized elements.
using block = std::array<element, 100>;
+
+block * create() {
+	block * b = new block();
+	for(auto & e : *b) {
+		e.value = rand();
+	}
+	b->back().value = b->size();
+
+	return b;
+}
+
+static inline size_t find(const block & b) {
+	size_t r = 0;
+	for(; r < b.size(); r++) {
+		if(__builtin_expect(b[r].value == b.size(), false)) break;
+	}
+
+	return r;
+}
+
// Print the command-line synopsis to stderr and abort with status 1.
void usage(char * argv[]) {
	std::cerr << argv[0] << ": [DURATION (FLOAT:SEC)] [NBLOCKS]" << std::endl;
	std::exit(1);
}
+
+int main(int argc, char * argv[]) {
+	size_t nblocks = 1000;
+	double duration = 5;
+
+	std::cout.imbue(std::locale(""));
+
+	switch (argc)
+	{
+	case 3:
+		nblocks = std::stoul(argv[2]);
+		[[fallthrough]];
+	case 2:
+		duration = std::stod(argv[1]);
+		if( duration <= 0.0 ) {
+			std::cerr << "Duration must be positive, was " << argv[1] << "(" << duration << ")" << std::endl;
+			usage(argv);
+		}
+		[[fallthrough]];
+	case 1:
+		break;
+	default:
+		usage(argv);
+		break;
+	}
+
+	std::vector<std::unique_ptr<block>> blocks;
+	for(size_t i = 0; i < nblocks; i++) {
+		blocks.emplace_back( create() );
+	}
+	std::random_shuffle(blocks.begin(), blocks.end());
+
+	size_t CRC = 0;
+	size_t count = 0;
+
+	using clock = std::chrono::high_resolution_clock;
+	auto before = clock::now();
+
+	while(true) {
+		for(const auto & b : blocks) {
+			CRC += find(*b);
+			count++;
+		}
+		auto now = clock::now();
+		std::chrono::duration<double> durr = now - before;
+		if( durr.count() > duration ) {
+			break;
+		}
+	}
+
+	auto after = clock::now();
+	std::chrono::duration<double> durr = after - before;
+	duration = durr.count();
+
+	using std::chrono::duration_cast;
+	using std::chrono::nanoseconds;
+
+	size_t ops_sec = size_t(double(count) / duration);
+	auto dur_nano = duration_cast<nanoseconds>(std::chrono::duration<double>(1.0)).count();
+
+	std::cout << "CRC           : " << CRC << "\n";
+	std::cout << "Duration      : " << duration << "s\n";
+	std::cout << "Total ops     : " << count << "\n";
+	std::cout << "Ops/sec       : " << ops_sec << "\n";
+	std::cout << "ns/Op         : " << ( dur_nano / ops_sec )<< "\n";
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/process.sh
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/process.sh	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/process.sh	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,42 @@
#!/bin/bash
# Post-process one benchmark run: extract push/pop operation counts from the
# captured stdout (raw/NAME.out) and per-function sample counts from the
# perf record (raw/NAME.data), then print samples-per-operation ratios.

NAME=$1

if [ ! -f "raw/${NAME}.out" ]; then
    echo "No output for ${NAME}"
    exit 1
fi

if [ ! -f "raw/${NAME}.data" ]; then
    echo "No perf record for ${NAME}"
    exit 1
fi

echo "Processing perf data for ${NAME}"

# "Total ops : N(Xr, Yw, ...)"-style line from the benchmark output.
OPS=$(grep -e 'Total ops' "raw/${NAME}.out")
CPOP=$( echo "Hello $OPS" | \grep -oP ", \K[0-9,]+(?=o)" --color | tr -d ',')
CPUSH=$(echo "Hello $OPS" | \grep -oP "\(\K[0-9,]+(?=i)" --color | tr -d ',')

perf report -n --percent-limit 5 --stdio --no-children -i "raw/${NAME}.data" > raw/.temp
EVENT=$(grep -e '^# Samples' raw/.temp | cut -d ' ' -f 6)
SPOP=$( grep -e '] relaxed_list<Node>::pop'  raw/.temp | tr -s ' ' | cut -d ' ' -f 3)
SPUSH=$(grep -e '] relaxed_list<Node>::push' raw/.temp | tr -s ' ' | cut -d ' ' -f 3)
# BUG FIX: the original bracket expression [i|m] also matched a literal
# '|'; [im] is what was intended (snzi_t or snzm_t).
SARR=$( grep -e '] snz[im]_t::node::arrive_h' raw/.temp | tr -s ' ' | cut -d ' ' -f 3)

echo "$OPS"
echo "Push count: $CPUSH"
echo "Pop  count: $CPOP"

echo "Pop    samples: $SPOP"
echo "Push   samples: $SPUSH"
echo "Arrive samples: $SARR"

SpPUSH=$(bc -l <<< "scale=9; $SPUSH / $CPUSH")
SpPOP=$( bc -l <<< "scale=9; $SPOP  / $CPOP" )
SpARR=$( bc -l <<< "scale=9; $SARR  / $CPUSH")

# The trailing sed re-inserts thousands separators into the numbers.
printf "%s per push()  : %.9f\n" "$EVENT" "$SpPUSH" | sed ':a;s/\B[0-9]\{3\}\>/,&/;ta'
printf "%s per pop()   : %.9f\n" "$EVENT" "$SpPOP"  | sed ':a;s/\B[0-9]\{3\}\>/,&/;ta'
printf "%s per arrive(): %.9f\n" "$EVENT" "$SpARR"  | sed ':a;s/\B[0-9]\{3\}\>/,&/;ta'
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,53 @@
+#include <atomic>
+
// Placeholder thread type for this scheduler sketch.
struct thread {};

// Minimal cluster interface: processors add/remove themselves and pull the
// next ready thread from it.
struct cluster {
	void add();
	void remove();
	thread * next();
};

// A kernel-thread worker. NOTE(review): the member `cluster` shadows the
// type name inside this struct.
struct processor {

	cluster cluster;
	std::atomic<bool> stop;   // set externally to ask the processor to stop
	volatile bool idle;       // published before halting; see main() below
};
+
+
// Execute one thread on this processor. The body is an outline only — the
// intended steps are sketched in the comments.
void run(thread * ) {
	// verify preemption

	// run Thread

	// verify preemption

	// finish Running
}
+
+void main(processor & self) {
+
+	self.cluster.add();
+
+	while(!self.stop) {
+		if(thread * t = self.cluster.next()) {
+			run(t);
+			continue;
+		}
+
+		self.set_idle();
+		std::atomic_thread_fence();
+
+		if(thread * t = self.cluster.next()) {
+			self.idle = false;
+			run(t);
+			continue;
+		}
+
+		halt();
+	}
+
+	self.cluster.remove();
+
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor_list.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor_list.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor_list.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,215 @@
+#include <cassert>
+
+#include <atomic>
+#include <new>
+#include <type_traits>
+
+struct processor;
+
// One registration slot, aligned to a full cache line so neighbouring slots
// never share a line (avoids false sharing between processors).
struct __attribute__((aligned(64))) processor_id {
	std::atomic<processor *> handle;  // owning processor; nullptr when the slot is free
	std::atomic<bool> lock;           // per-slot reader lock used by processor_list's RW lock

	processor_id() = default;
	processor_id(processor * proc) : handle(proc), lock() {
		/*paranoid*/ assert(std::atomic_is_lock_free(&lock));
	}
};
+
+extern unsigned num();
+
+#define ERROR throw 1
+
// Registry of active processors plus a reader-writer lock over them.
// - doregister/unregister are lock-free with respect to readers/writers.
// - Readers take only their own cache-line-local slot lock (uncontended in
//   the common case); writers take the global lock plus every slot lock.
class processor_list {
private:

	static const constexpr std::size_t cache_line_size = 64;

	static_assert(sizeof (processor_id) <= cache_line_size, "ERROR: Instances must fit in one cache line" );
	static_assert(alignof(processor_id) == cache_line_size, "ERROR: Instances must aligned to one cache line" );

	const unsigned max;     // total cachelines allocated
	std::atomic_uint alloc; // cachelines currently in use
	std::atomic_uint ready; // cachelines ready to iterate over (!= to alloc when thread is in second half of doregister)
	std::atomic<bool> lock; // writerlock
	processor_id * data;    // data pointer

private:
	// Test-and-test-and-set spin acquire of a single atomic flag.
	inline void acquire(std::atomic<bool> & ll) {
		while( __builtin_expect(ll.exchange(true),false) ) {
			while(ll.load(std::memory_order_relaxed))
				asm volatile("pause");
		}
		/* paranoid */ assert(ll);
	}

public:
	// Allocates num() slots up front; the array is never reallocated, so
	// slot indices handed out by doregister stay valid for the list's life.
	processor_list()
		: max(num())
		, alloc(0)
		, ready(0)
		, lock{false}
		, data( new processor_id[max] )
	{
		/*paranoid*/ assert(num() == max);
		/*paranoid*/ assert(std::atomic_is_lock_free(&alloc));
		/*paranoid*/ assert(std::atomic_is_lock_free(&ready));
	}

	~processor_list() {
		delete[] data;
	}

	//=======================================================================
	// Lock-Free registering/unregistering of threads
	// Returns the slot index assigned to `proc`.
	unsigned doregister(processor * proc) {
		// Step - 1 : check if there is already space in the data
		uint_fast32_t s = ready;

		// Check among all the ready
		for(uint_fast32_t i = 0; i < s; i++) {
			processor * null = nullptr; // Re-write every loop since compare thrashes it
			if( data[i].handle.load(std::memory_order_relaxed) == null
			 && data[i].handle.compare_exchange_strong(null, proc)) {
				/*paranoid*/ assert(i < ready);
				/*paranoid*/ assert(alignof(decltype(data[i])) == cache_line_size);
				/*paranoid*/ assert((uintptr_t(&data[i]) % cache_line_size) == 0);
				return i;
			}
		}

		if(max <= alloc) ERROR;

		// Step - 2 : F&A to get a new spot in the array.
		uint_fast32_t n = alloc++;
		if(max <= n) ERROR;

		// Step - 3 : Mark space as used and then publish it.
		void * storage = &data[n];
		new (storage) processor_id( proc );
		// Spin until every earlier allocation has been published, then
		// publish ours: keeps `ready` a contiguous prefix of valid slots.
		while(true) {
			unsigned copy = n;
			if( ready.load(std::memory_order_relaxed) == n
			 && ready.compare_exchange_weak(copy, n + 1) )
			 	break;
			asm volatile("pause");
		}

		// Return new spot.
		/*paranoid*/ assert(n < ready);
		/*paranoid*/ assert(alignof(decltype(data[n])) == cache_line_size);
		/*paranoid*/ assert((uintptr_t(&data[n]) % cache_line_size) == 0);
		return n;
	}

	// Free slot `iproc` for reuse and return the processor it held.
	processor * unregister(unsigned iproc) {
		/*paranoid*/ assert(iproc < ready);
		auto ret = data[iproc].handle.load(std::memory_order_relaxed);
		data[iproc].handle = nullptr;
		return ret;
	}

	// Reset all registration
	// Unsafe in most cases, use for testing only.
	void reset() {
		alloc = 0;
		ready = 0;
	}

	// Processor currently registered in slot `iproc` (may be nullptr).
	processor * get(unsigned iproc) {
		return data[iproc].handle.load(std::memory_order_relaxed);
	}

	//=======================================================================
	// Reader-writer lock implementation
	// Concurrent with doregister/unregister,
	//    i.e., threads can be added at any point during or between the entry/exit

	//-----------------------------------------------------------------------
	// Reader side
	// Acquire the reader lock for slot `iproc` (the caller's own slot).
	void read_lock(unsigned iproc) {
		/*paranoid*/ assert(iproc < ready);

		// Step 1 : make sure no writer are in the middle of the critical section
		while(lock.load(std::memory_order_relaxed))
			asm volatile("pause");

		// Fence needed because we don't want to start trying to acquire the lock
		// before we read a false.
		// Not needed on x86
		// std::atomic_thread_fence(std::memory_order_seq_cst);

		// Step 2 : acquire our local lock
		acquire( data[iproc].lock );
		/*paranoid*/ assert(data[iproc].lock);
	}

	void read_unlock(unsigned iproc) {
		/*paranoid*/ assert(iproc < ready);
		/*paranoid*/ assert(data[iproc].lock);
		data[iproc].lock.store(false, std::memory_order_release);
	}

	//-----------------------------------------------------------------------
	// Writer side
	// Returns the number of slot locks taken; pass it to write_unlock.
	uint_fast32_t write_lock() {
		// Step 1 : lock global lock
		// It is needed to avoid processors that register mid Critical-Section
		//   to simply lock their own lock and enter.
		acquire(lock);

		// Step 2 : lock per-proc lock
		// Processors that are currently being registered aren't counted
		//   but can't be in read_lock or in the critical section.
		// All other processors are counted
		uint_fast32_t s = ready;
		for(uint_fast32_t i = 0; i < s; i++) {
			acquire( data[i].lock );
		}

		return s;
	}

	void write_unlock(uint_fast32_t last_s) {
		// Step 1 : release local locks
		// This must be done while the global lock is held to avoid
		//   threads that where created mid critical section
		//   to race to lock their local locks and have the writer
		//   immidiately unlock them
		// Alternative solution : return s in write_lock and pass it to write_unlock
		for(uint_fast32_t i = 0; i < last_s; i++) {
			assert(data[i].lock);
			data[i].lock.store(false, std::memory_order_release);
		}

		// Step 2 : release global lock
		/*paranoid*/ assert(true == lock);
		lock.store(false, std::memory_order_release);
	}

	//-----------------------------------------------------------------------
	// Checking support
	// Wait (without acquiring anything) until no writer and no reader lock
	// appears held; returns the number of slots observed.
	uint_fast32_t epoch_check() {
		// Step 1 : lock global lock
		// It is needed to avoid processors that register mid Critical-Section
		//   to simply lock their own lock and enter.
		while(lock.load(std::memory_order_relaxed))
			asm volatile("pause");

		// Step 2 : lock per-proc lock
		// Processors that are currently being registered aren't counted
		//   but can't be in read_lock or in the critical section.
		// All other processors are counted
		uint_fast32_t s = ready;
		for(uint_fast32_t i = 0; i < s; i++) {
			while(data[i].lock.load(std::memory_order_relaxed))
				asm volatile("pause");
		}

		return s;
	}

public:
};
+
+#undef ERROR
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor_list_fast.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor_list_fast.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor_list_fast.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,173 @@
+#include "processor_list.hpp"
+
#include <array>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <locale>
#include <string>
#include <thread>
#include <vector>
+
+#include "utils.hpp"
+
// Capacity hint for processor_list: 16M (0x1000000) slots.
unsigned num() {
	return 16777216u;
}
+
+//-------------------
+
// Test stand-in for a real processor: only remembers its slot index.
struct processor {
	unsigned id;
};
+void run(unsigned nthread, double duration, unsigned writes, unsigned epochs) {
+	assert(writes < 100);
+
+	// List being tested
+	processor_list list = {};
+
+	// Barrier for synchronization
+	barrier_t barrier(nthread + 1);
+
+	// Data to check everything is OK
+	size_t write_committed = 0ul;
+	struct {
+		std::atomic_size_t write = { 0ul };
+		std::atomic_size_t read  = { 0ul };
+		std::atomic_size_t epoch = { 0ul };
+	} lock_cnt;
+
+	// Flag to signal termination
+	std::atomic_bool done = { false };
+
+	std::thread * threads[nthread];
+	unsigned i = 1;
+	for(auto & t : threads) {
+		t = new std::thread([&done, &list, &barrier, &write_committed, &lock_cnt, writes, epochs](unsigned tid) {
+			Random rand(tid + rdtscl());
+			processor proc;
+			proc.id = list.doregister(&proc);
+			size_t writes_cnt = 0;
+			size_t reads_cnt = 0;
+			size_t epoch_cnt = 0;
+
+			affinity(tid);
+
+			barrier.wait(tid);
+
+			while(__builtin_expect(!done, true)) {
+				auto r = rand.next() % 100;
+				if (r < writes) {
+					auto n = list.write_lock();
+					write_committed++;
+					writes_cnt++;
+					assert(writes_cnt < -2ul);
+					list.write_unlock(n);
+				}
+				else if(r < epochs) {
+					list.epoch_check();
+					epoch_cnt++;
+				}
+				else {
+					list.read_lock(proc.id);
+					reads_cnt++;
+					assert(reads_cnt < -2ul);
+					list.read_unlock(proc.id);
+				}
+			}
+
+			barrier.wait(tid);
+
+			auto p = list.unregister(proc.id);
+			assert(&proc == p);
+			lock_cnt.write += writes_cnt;
+			lock_cnt.read  += reads_cnt;
+			lock_cnt.epoch += epoch_cnt;
+		}, i++);
+	}
+
+	auto before = Clock::now();
+	barrier.wait(0);
+
+	while(true) {
+		usleep(1000);
+		auto now = Clock::now();
+		duration_t durr = now - before;
+		if( durr.count() > duration ) {
+			done = true;
+			break;
+		}
+	}
+
+	barrier.wait(0);
+	auto after = Clock::now();
+	duration_t durr = after - before;
+	duration = durr.count();
+
+	for(auto t : threads) {
+		t->join();
+		delete t;
+	}
+
+	assert(write_committed == lock_cnt.write);
+
+	size_t totalop = lock_cnt.read + lock_cnt.write + lock_cnt.epoch;
+	size_t ops_sec = size_t(double(totalop) / duration);
+	size_t ops_thread = ops_sec / nthread;
+	double dur_nano = duration_cast<std::nano>(1.0);
+
+	std::cout << "Duration      : " << duration << "s\n";
+	std::cout << "Total ops     : " << totalop << "(" << lock_cnt.read << "r, " << lock_cnt.write << "w, " << lock_cnt.epoch << "e)\n";
+	std::cout << "Ops/sec       : " << ops_sec << "\n";
+	std::cout << "Ops/sec/thread: " << ops_thread << "\n";
+	std::cout << "ns/Op         : " << ( dur_nano / ops_thread )<< "\n";
+}
+
// Print the command-line synopsis to stderr and abort with status 1.
void usage(char * argv[]) {
	std::cerr << argv[0] << ": [DURATION (FLOAT:SEC)] [NTHREADS] [%WRITES]" << std::endl;
	std::exit(1);
}
+
+int main(int argc, char * argv[]) {
+
+	double duration   = 5.0;
+	unsigned nthreads = 2;
+	unsigned writes   = 0;
+	unsigned epochs   = 0;
+
+	std::cout.imbue(std::locale(""));
+
+	switch (argc)
+	{
+	case 5:
+		epochs = std::stoul(argv[4]);
+		[[fallthrough]];
+	case 4:
+		writes = std::stoul(argv[3]);
+		if( (writes + epochs) > 100 ) {
+			std::cerr << "Writes + Epochs must be valid percentage, was " << argv[3] << " + " << argv[4] << "(" << writes << " + " << epochs << ")" << std::endl;
+			usage(argv);
+		}
+		[[fallthrough]];
+	case 3:
+		nthreads = std::stoul(argv[2]);
+		[[fallthrough]];
+	case 2:
+		duration = std::stod(argv[1]);
+		if( duration <= 0.0 ) {
+			std::cerr << "Duration must be positive, was " << argv[1] << "(" << duration << ")" << std::endl;
+			usage(argv);
+		}
+		[[fallthrough]];
+	case 1:
+		break;
+	default:
+		usage(argv);
+		break;
+	}
+
+	check_cache_line_size();
+
+	std::cout << "Running " << nthreads << " threads for " << duration << " seconds with " << writes << "% writes and " << epochs << "% epochs" << std::endl;
+	run(nthreads, duration, writes, epochs + writes);
+
+	return 0;
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor_list_good.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor_list_good.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor_list_good.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,269 @@
+#include "processor_list.hpp"
+
#include <atomic>
#include <cassert>
#include <iostream>
#include <memory>
#include <string>
#include <thread>
#include <vector>
+
// Capacity hint for processor_list: 2^24 (0x1000000) slots.
unsigned num() {
	return 1u << 24;
}
+
// Simple spinning barrier (source attribution left incomplete in the
// original: "Barrier from"). The Nth arrival of a round releases everyone.
class barrier_t {
public:
	// total: number of participants that must arrive before any proceeds.
	barrier_t(size_t total)
		: waiting(0)
		, total(total)
	{}

	// Spin until all `total` participants have arrived at this round.
	// `waiting` only ever grows; each waiter computes the arrival count at
	// which its round completes (the next multiple of `total`) and spins
	// until the counter reaches it.
	void wait(unsigned) {
		size_t target = waiting++;
		target = (target - (target % total)) + total;
		while(waiting < target)
			asm volatile("pause");

		// Canary: the monotonically growing counter must stay far from wrap.
		assert(waiting < (1ul << 60));
    	}

private:
	std::atomic<size_t> waiting;  // total arrivals across all rounds
	size_t total;                 // participants per round
};
+
// Xorshift pseudo-random generator (shift triple 6/21/7); output is fully
// determined by the seed.
class Random {
private:
	unsigned int state;
public:
	Random(int seed) : state(seed) {}

	/** returns pseudorandom x satisfying 0 <= x < n. **/
	unsigned int next() {
		state ^= state << 6;
		state ^= state >> 21;
		state ^= state << 7;
		return state;
	}
};
+
+//-------------------
+
// Test stand-in for a processor: only records its registration slot.
struct processor {
	unsigned id;
};
+
+// Stage 1
+// Make sure that the early registration works correctly
+// Registration uses a different process if the act of
+// registering the processor makes it the highest processor count
+// seen yet.
+void stage1(unsigned nthread, unsigned repeats) {
+	const int n = repeats;
+	const int nproc = 10;
+
+	// List being tested
+	processor_list list;
+
+	// Barrier for synchronization
+	barrier_t barrier(nthread + 1);
+
+	// Seen values to detect duplicattion
+	std::atomic<processor *> ids[nthread * nproc];
+	for(auto & i : ids) {
+		i = nullptr;
+	}
+
+	// Can't pass VLA to lambda
+	std::atomic<processor *> * idsp = ids;
+
+	// Threads which will run the code
+	std::thread * threads[nthread];
+	unsigned i = 1;
+	for(auto & t : threads) {
+		// Each thread will try to register a processor then add it to the
+		// list of registerd processor
+		t = new std::thread([&list, &barrier, idsp, n](unsigned tid){
+			processor proc[nproc];
+			for(int i = 0; i < n; i++) {
+				for(auto & p : proc) {
+					// Register the thread
+					p.id = list.doregister(&p);
+				}
+
+				for(auto & p : proc) {
+					// Make sure no one got this id before
+					processor * prev = idsp[p.id].exchange(&p);
+					assert(nullptr == prev);
+
+					// Make sure id is still consistend
+					assert(&p == list.get(p.id));
+				}
+
+				// wait for round to finish
+				barrier.wait(tid);
+
+				// wait for reset
+				barrier.wait(tid);
+			}
+		}, i++);
+	}
+
+	for(int i = 0; i < n; i++) {
+		//Wait for round to finish
+		barrier.wait(0);
+
+		// Reset list
+		list.reset();
+
+		std::cout << i << "\r";
+
+		// Reset seen values
+		for(auto & i : ids) {
+			i = nullptr;
+		}
+
+		// Start next round
+		barrier.wait(0);
+	}
+
+	for(auto t : threads) {
+		t->join();
+		delete t;
+	}
+}
+
+// Stage 2
+// Check that once churning starts, registration is still consistent.
+void stage2(unsigned nthread, unsigned repeats) {
+	// List being tested
+	processor_list list;
+
+	// Threads which will run the code
+	std::thread * threads[nthread];
+	unsigned i = 1;
+	for(auto & t : threads) {
+		// Each thread will try to register a few processors and
+		// unregister them, making sure that the registration is
+		// consistent
+		t = new std::thread([&list, repeats](unsigned tid){
+			processor procs[10];
+			for(unsigned i = 0; i < repeats; i++) {
+				// register the procs and note the id
+				for(auto & p : procs) {
+					p.id = list.doregister(&p);
+				}
+
+				if(1 == tid) std::cout << i << "\r";
+
+				// check the id is still consistent
+				for(const auto & p : procs) {
+					assert(&p == list.get(p.id));
+				}
+
+				// unregister and check the id is consistent
+				for(const auto & p : procs) {
+					assert(&p == list.unregister(p.id));
+				}
+			}
+		}, i++);
+	}
+
+	for(auto t : threads) {
+		t->join();
+		delete t;
+	}
+}
+
+bool is_writer();
+
+// Stage 3
+// Check that the reader writer lock works.
+void stage3(unsigned nthread, unsigned repeats) {
+	// List being tested
+	processor_list list;
+
+	size_t before = 0;
+
+	std::unique_ptr<size_t> after( new size_t(0) );
+
+	std::atomic<bool> done ( false );
+
+	// Threads which will run the code
+	std::thread * threads[nthread];
+	unsigned i = 1;
+	for(auto & t : threads) {
+		// Each thread will try to register a few processors and
+		// unregister them, making sure that the registration is
+		// consistent
+		t = new std::thread([&list, repeats, &before, &after, &done](unsigned tid){
+			Random rng(tid);
+			processor proc;
+			proc.id = list.doregister(&proc);
+			while(!done) {
+
+				if( (rng.next() % 100) == 0 ) {
+					auto r = list.write_lock();
+
+					auto b = before++;
+
+					std::cout << b << "\r";
+
+					(*after)++;
+
+					if(b >= repeats) done = true;
+
+					list.write_unlock(r);
+				}
+				else {
+					list.read_lock(proc.id);
+					assert(before == *after);
+					list.read_unlock(proc.id);
+				}
+
+			}
+
+			list.unregister(proc.id);
+		}, i++);
+	}
+
+	for(auto t : threads) {
+		t->join();
+		delete t;
+	}
+}
+
+int main(int argc, char * argv[]) {
+
+	unsigned nthreads = 1;
+	if( argc >= 3 ) {
+		size_t idx;
+		nthreads = std::stoul(argv[2], &idx);
+		assert('\0' == argv[2][idx]);
+	}
+
+	unsigned repeats = 100;
+	if( argc >= 2 ) {
+		size_t idx;
+		repeats = std::stoul(argv[1], &idx);
+		assert('\0' == argv[1][idx]);
+	}
+
+	processor_list::check_cache_line_size();
+
+	std::cout << "Running " << repeats << " repetitions on " << nthreads << " threads" << std::endl;
+	std::cout << "Checking registration - early" << std::endl;
+	stage1(nthreads, repeats);
+	std::cout << "Done                         " << std::endl;
+
+	std::cout << "Checking registration - churn" << std::endl;
+	stage2(nthreads, repeats);
+	std::cout << "Done                         " << std::endl;
+
+	std::cout << "Checking RW lock             " << std::endl;
+	stage3(nthreads, repeats);
+	std::cout << "Done                         " << std::endl;
+
+
+	return 0;
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/randbit.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/randbit.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/randbit.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,236 @@
+#include <cstddef>
+#include <cstdint>
+#include <x86intrin.h>
+
+__attribute__((noinline)) unsigned nthSetBit(size_t mask, unsigned bit) {
+	uint64_t v = mask;   // Input value to find position with rank r.
+	unsigned int r = bit;// Input: bit's desired rank [1-64].
+	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
+	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
+	unsigned int t;      // Bit count temporary.
+
+	// Do a normal parallel bit count for a 64-bit integer,
+	// but store all intermediate steps.
+	// a = (v & 0x5555...) + ((v >> 1) & 0x5555...);
+	a =  v - ((v >> 1) & ~0UL/3);
+	// b = (a & 0x3333...) + ((a >> 2) & 0x3333...);
+	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
+	// c = (b & 0x0f0f...) + ((b >> 4) & 0x0f0f...);
+	c = (b + (b >> 4)) & ~0UL/0x11;
+	// d = (c & 0x00ff...) + ((c >> 8) & 0x00ff...);
+	d = (c + (c >> 8)) & ~0UL/0x101;
+
+
+	t = (d >> 32) + (d >> 48);
+	// Now do branchless select!
+	s  = 64;
+	// if (r > t) {s -= 32; r -= t;}
+	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
+	t  = (d >> (s - 16)) & 0xff;
+	// if (r > t) {s -= 16; r -= t;}
+	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
+	t  = (c >> (s - 8)) & 0xf;
+	// if (r > t) {s -= 8; r -= t;}
+	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
+	t  = (b >> (s - 4)) & 0x7;
+	// if (r > t) {s -= 4; r -= t;}
+	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
+	t  = (a >> (s - 2)) & 0x3;
+	// if (r > t) {s -= 2; r -= t;}
+	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
+	t  = (v >> (s - 1)) & 0x1;
+	// if (r > t) s--;
+	s -= ((t - r) & 256) >> 8;
+	// s = 65 - s;
+	return s;
+}
+
+unsigned rand_bit(unsigned rnum, uint64_t mask) {
+	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
+#if defined(BRANCHLESS)
+	uint64_t v = mask;   // Input value to find position with rank r.
+	unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
+	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
+	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
+	unsigned int t;      // Bit count temporary.
+
+	// Do a normal parallel bit count for a 64-bit integer,
+	// but store all intermediate steps.
+	// a = (v & 0x5555...) + ((v >> 1) & 0x5555...);
+	a =  v - ((v >> 1) & ~0UL/3);
+	// b = (a & 0x3333...) + ((a >> 2) & 0x3333...);
+	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
+	// c = (b & 0x0f0f...) + ((b >> 4) & 0x0f0f...);
+	c = (b + (b >> 4)) & ~0UL/0x11;
+	// d = (c & 0x00ff...) + ((c >> 8) & 0x00ff...);
+	d = (c + (c >> 8)) & ~0UL/0x101;
+
+
+	t = (d >> 32) + (d >> 48);
+	// Now do branchless select!
+	s  = 64;
+	// if (r > t) {s -= 32; r -= t;}
+	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
+	t  = (d >> (s - 16)) & 0xff;
+	// if (r > t) {s -= 16; r -= t;}
+	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
+	t  = (c >> (s - 8)) & 0xf;
+	// if (r > t) {s -= 8; r -= t;}
+	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
+	t  = (b >> (s - 4)) & 0x7;
+	// if (r > t) {s -= 4; r -= t;}
+	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
+	t  = (a >> (s - 2)) & 0x3;
+	// if (r > t) {s -= 2; r -= t;}
+	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
+	t  = (v >> (s - 1)) & 0x1;
+	// if (r > t) s--;
+	s -= ((t - r) & 256) >> 8;
+	// s = 65 - s;
+	return s - 1;
+#elif defined(LOOP)
+	for(unsigned i = 0; i < bit; i++) {
+		mask ^= (1ul << (__builtin_ffsl(mask) - 1ul));
+	}
+	return __builtin_ffsl(mask) - 1ul;
+#elif defined(PDEP)
+	uint64_t picked = _pdep_u64(1ul << bit, mask);
+	return __builtin_ffsl(picked) - 1ul;
+#else
+#error must define LOOP, PDEP or BRANCHLESS
+#endif
+}
+
+#include <cassert>
+#include <atomic>
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <locale>
+#include <thread>
+
+#include <unistd.h>
+
+class barrier_t {
+public:
+	barrier_t(size_t total)
+		: waiting(0)
+		, total(total)
+	{}
+
+	void wait(unsigned) {
+		size_t target = waiting++;
+		target = (target - (target % total)) + total;
+		while(waiting < target)
+			asm volatile("pause");
+
+		assert(waiting < (1ul << 60));
+	}
+
+private:
+	std::atomic<size_t> waiting;
+	size_t total;
+};
+
+class Random {
+private:
+	unsigned int seed;
+public:
+	Random(int seed) {
+		this->seed = seed;
+	}
+
+	/** returns a pseudorandom unsigned integer (xorshift generator; full unsigned range). **/
+	unsigned int next() {
+		seed ^= seed << 6;
+		seed ^= seed >> 21;
+		seed ^= seed << 7;
+		return seed;
+	}
+};
+
+using Clock = std::chrono::high_resolution_clock;
+using duration_t = std::chrono::duration<double>;
+using std::chrono::nanoseconds;
+
+template<typename Ratio, typename T>
+T duration_cast(T seconds) {
+	return std::chrono::duration_cast<std::chrono::duration<T, Ratio>>(std::chrono::duration<T>(seconds)).count();
+}
+
+void waitfor(double & duration, barrier_t & barrier, std::atomic_bool & done) {
+
+
+	std::cout << "Starting" << std::endl;
+	auto before = Clock::now();
+	barrier.wait(0);
+
+	while(true) {
+		usleep(100000);
+		auto now = Clock::now();
+		duration_t durr = now - before;
+		if( durr.count() > duration ) {
+			done = true;
+			break;
+		}
+		std::cout << "\r" << std::setprecision(4) << durr.count();
+		std::cout.flush();
+	}
+
+	barrier.wait(0);
+	auto after = Clock::now();
+	duration_t durr = after - before;
+	duration = durr.count();
+	std::cout << "\rClosing down" << std::endl;
+}
+
+__attribute__((noinline)) void body(Random & rand) {
+	uint64_t mask = (uint64_t(rand.next()) << 32ul) | uint64_t(rand.next());
+	unsigned idx = rand.next();
+
+	unsigned bit = rand_bit(idx, mask);
+
+	if(__builtin_expect(((1ul << bit) & mask) == 0, false)) {
+		std::cerr << std::hex <<  "Rand " << idx << " from " << mask;
+		std::cerr << " gave " << (1ul << bit) << "(" << std::dec << bit << ")" << std::endl;
+		std::abort();
+	}
+}
+
+void runRandBit(double duration) {
+
+	std::atomic_bool done  = { false };
+	barrier_t barrier(2);
+
+	size_t count = 0;
+	std::thread thread([&done, &barrier, &count]() {
+
+		Random rand(22);
+
+		barrier.wait(1);
+
+		for(;!done; count++) {
+			body(rand);
+		}
+
+		barrier.wait(1);
+	});
+
+	waitfor(duration, barrier, done);
+	thread.join();
+
+	size_t ops = count;
+	size_t ops_sec = size_t(double(ops) / duration);
+	auto dur_nano = duration_cast<std::nano>(1.0);
+
+	std::cout << "Duration      : " << duration << "s\n";
+	std::cout << "ns/Op         : " << ( dur_nano / ops )<< "\n";
+	std::cout << "Ops/sec       : " << ops_sec << "\n";
+	std::cout << "Total ops     : " << ops << std::endl;
+
+}
+
+int main() {
+	std::cout.imbue(std::locale(""));
+	runRandBit(5);
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/relaxed_list.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/relaxed_list.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/relaxed_list.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,1141 @@
+#if !defined(LIST_VARIANT_HPP)
+#define LIST_VARIANT_HPP "relaxed_list.hpp"
+#endif
+
+#include LIST_VARIANT_HPP
+#if !defined(LIST_VARIANT)
+#error no variant selected
+#endif
+
+#include <array>
+#include <iomanip>
+#include <iostream>
+#include <locale>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include <getopt.h>
+#include <unistd.h>
+#include <sys/sysinfo.h>
+
+#include "utils.hpp"
+
+struct __attribute__((aligned(64))) Node {
+	static std::atomic_size_t creates;
+	static std::atomic_size_t destroys;
+
+	_LinksFields_t<Node> _links;
+
+	int value;
+	int id;
+
+	Node() { creates++; }
+	Node(int value): value(value) { creates++; }
+	~Node() { destroys++; }
+};
+
+std::atomic_size_t Node::creates  = { 0 };
+std::atomic_size_t Node::destroys = { 0 };
+
+bool enable_stats = false;
+
+template<>
+thread_local LIST_VARIANT<Node>::TLS LIST_VARIANT<Node>::tls = {};
+
+template<>
+std::atomic_uint32_t LIST_VARIANT<Node>::ticket = { 0 };
+
+#ifndef NO_STATS
+template<>
+LIST_VARIANT<Node>::GlobalStats LIST_VARIANT<Node>::global_stats = {};
+#endif
+
+// ================================================================================================
+//                        UTILS
+// ================================================================================================
+
+struct local_stat_t {
+	size_t in  = 0;
+	size_t out = 0;
+	size_t empty = 0;
+	size_t crc_in  = 0;
+	size_t crc_out = 0;
+	size_t valmax = 0;
+	size_t valmin = 100000000ul;
+	struct {
+		size_t val = 0;
+		size_t cnt = 0;
+	} comp;
+	struct {
+		size_t val = 0;
+		size_t cnt = 0;
+	} subm;
+};
+
+struct global_stat_t {
+	std::atomic_size_t in  = { 0 };
+	std::atomic_size_t out = { 0 };
+	std::atomic_size_t empty = { 0 };
+	std::atomic_size_t crc_in  = { 0 };
+	std::atomic_size_t crc_out = { 0 };
+	std::atomic_size_t valmax = { 0 };
+	std::atomic_size_t valmin = { 100000000ul };
+	struct {
+		std::atomic_size_t val = { 0 };
+		std::atomic_size_t cnt = { 0 };
+	} comp;
+	struct {
+		std::atomic_size_t val = { 0 };
+		std::atomic_size_t cnt = { 0 };
+	} subm;
+};
+
+void atomic_max(std::atomic_size_t & target, size_t value) {
+	for(;;) {
+		size_t expect = target.load(std::memory_order_relaxed);
+		if(value <= expect) return;
+		bool success = target.compare_exchange_strong(expect, value);
+		if(success) return;
+	}
+}
+
+void atomic_min(std::atomic_size_t & target, size_t value) {
+	for(;;) {
+		size_t expect = target.load(std::memory_order_relaxed);
+		if(value >= expect) return;
+		bool success = target.compare_exchange_strong(expect, value);
+		if(success) return;
+	}
+}
+
+void tally_stats(global_stat_t & global, local_stat_t & local) {
+
+	global.in    += local.in;
+	global.out   += local.out;
+	global.empty += local.empty;
+
+	global.crc_in  += local.crc_in;
+	global.crc_out += local.crc_out;
+
+	global.comp.val += local.comp.val;
+	global.comp.cnt += local.comp.cnt;
+	global.subm.val += local.subm.val;
+	global.subm.cnt += local.subm.cnt;
+
+	atomic_max(global.valmax, local.valmax);
+	atomic_min(global.valmin, local.valmin);
+
+	LIST_VARIANT<Node>::stats_tls_tally();
+}
+
+void waitfor(double & duration, barrier_t & barrier, std::atomic_bool & done) {
+	std::cout << "Starting" << std::endl;
+	auto before = Clock::now();
+	barrier.wait(0);
+	bool is_tty = isatty(STDOUT_FILENO);
+
+	while(true) {
+		usleep(100000);
+		auto now = Clock::now();
+		duration_t durr = now - before;
+		if( durr.count() > duration ) {
+			done = true;
+			break;
+		}
+		if(is_tty) {
+			std::cout << "\r" << std::setprecision(4) << durr.count();
+			std::cout.flush();
+		}
+	}
+
+	barrier.wait(0);
+	auto after = Clock::now();
+	duration_t durr = after - before;
+	duration = durr.count();
+	std::cout << "\rClosing down" << std::endl;
+}
+
+void waitfor(double & duration, barrier_t & barrier, const std::atomic_size_t & count) {
+	std::cout << "Starting" << std::endl;
+	auto before = Clock::now();
+	barrier.wait(0);
+
+	while(true) {
+		usleep(100000);
+		size_t c = count.load();
+		if( c == 0 ) {
+			break;
+		}
+		std::cout << "\r" << c;
+		std::cout.flush();
+	}
+
+	barrier.wait(0);
+	auto after = Clock::now();
+	duration_t durr = after - before;
+	duration = durr.count();
+	std::cout << "\rClosing down" << std::endl;
+}
+
+void print_stats(double duration, unsigned nthread, global_stat_t & global) {
+	assert(Node::creates == Node::destroys);
+	assert(global.crc_in == global.crc_out);
+
+	std::cout << "Done" << std::endl;
+
+	size_t ops = global.in + global.out;
+	size_t ops_sec = size_t(double(ops) / duration);
+	size_t ops_thread = ops_sec / nthread;
+	auto dur_nano = duration_cast<std::nano>(1.0);
+
+	if(global.valmax != 0) {
+		std::cout << "Max runs      : " << global.valmax << "\n";
+		std::cout << "Min runs      : " << global.valmin << "\n";
+	}
+	if(global.comp.cnt != 0) {
+		std::cout << "Submit count  : " << global.subm.cnt << "\n";
+		std::cout << "Submit average: " << ((double(global.subm.val)) / global.subm.cnt) << "\n";
+		std::cout << "Complete count: " << global.comp.cnt << "\n";
+		std::cout << "Complete avg  : " << ((double(global.comp.val)) / global.comp.cnt) << "\n";
+	}
+	std::cout << "Duration      : " << duration << "s\n";
+	std::cout << "ns/Op         : " << ( dur_nano / ops_thread )<< "\n";
+	std::cout << "Ops/sec/thread: " << ops_thread << "\n";
+	std::cout << "Ops/sec       : " << ops_sec << "\n";
+	std::cout << "Total ops     : " << ops << "(" << global.in << "i, " << global.out << "o, " << global.empty << "e)\n";
+	#ifndef NO_STATS
+		LIST_VARIANT<Node>::stats_print(std::cout);
+	#endif
+}
+
+void save_fairness(const int data[], int factor, unsigned nthreads, size_t columns, size_t rows, const std::string & output);
+
+// ================================================================================================
+//                        EXPERIMENTS
+// ================================================================================================
+
+// ================================================================================================
+__attribute__((noinline)) void runChurn_body(
+	std::atomic<bool>& done,
+	Random & rand,
+	Node * my_nodes[],
+	unsigned nslots,
+	local_stat_t & local,
+	LIST_VARIANT<Node> & list
+) {
+	while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) {
+		int idx = rand.next() % nslots;
+		if (auto node = my_nodes[idx]) {
+			local.crc_in += node->value;
+			list.push(node);
+			my_nodes[idx] = nullptr;
+			local.in++;
+		}
+		else if(auto node = list.pop()) {
+			local.crc_out += node->value;
+			my_nodes[idx] = node;
+			local.out++;
+		}
+		else {
+			local.empty++;
+		}
+	}
+}
+
+void runChurn(unsigned nthread, unsigned nqueues, double duration, unsigned nnodes, const unsigned nslots) {
+	std::cout << "Churn Benchmark" << std::endl;
+	assert(nnodes <= nslots);
+	// List being tested
+
+	// Barrier for synchronization
+	barrier_t barrier(nthread + 1);
+
+	// Data to check everything is OK
+	global_stat_t global;
+
+	// Flag to signal termination
+	std::atomic_bool done  = { false };
+
+	// Prep nodes
+	std::cout << "Initializing ";
+	size_t npushed = 0;
+	LIST_VARIANT<Node> list = { nthread, nqueues };
+	{
+		Node** all_nodes[nthread];
+		for(auto & nodes : all_nodes) {
+			nodes = new __attribute__((aligned(64))) Node*[nslots + 8];
+			Random rand(rdtscl());
+			for(unsigned i = 0; i < nnodes; i++) {
+				nodes[i] = new Node(rand.next() % 100);
+			}
+
+			for(unsigned i = nnodes; i < nslots; i++) {
+				nodes[i] = nullptr;
+			}
+
+			for(int i = 0; i < 10 && i < (int)nslots; i++) {
+				int idx = rand.next() % nslots;
+				if (auto node = nodes[idx]) {
+					global.crc_in += node->value;
+					list.push(node);
+					npushed++;
+					nodes[idx] = nullptr;
+				}
+			}
+		}
+
+		std::cout << nnodes << " nodes (" << nslots << " slots)" << std::endl;
+
+		enable_stats = true;
+
+		std::thread * threads[nthread];
+		unsigned i = 1;
+		for(auto & t : threads) {
+			auto & my_nodes = all_nodes[i - 1];
+			t = new std::thread([&done, &list, &barrier, &global, &my_nodes, nslots](unsigned tid) {
+				Random rand(tid + rdtscl());
+
+				local_stat_t local;
+
+				// affinity(tid);
+
+				barrier.wait(tid);
+
+				// EXPERIMENT START
+
+				runChurn_body(done, rand, my_nodes, nslots, local, list);
+
+				// EXPERIMENT END
+
+				barrier.wait(tid);
+
+				tally_stats(global, local);
+
+				for(unsigned i = 0; i < nslots; i++) {
+					delete my_nodes[i];
+				}
+			}, i++);
+		}
+
+		waitfor(duration, barrier, done);
+
+		for(auto t : threads) {
+			t->join();
+			delete t;
+		}
+
+		enable_stats = false;
+
+		while(auto node = list.pop()) {
+			global.crc_out += node->value;
+			delete node;
+		}
+
+		for(auto nodes : all_nodes) {
+			delete[] nodes;
+		}
+	}
+
+	print_stats(duration, nthread, global);
+}
+
+// ================================================================================================
+__attribute__((noinline)) void runPingPong_body(
+	std::atomic<bool>& done,
+	Node initial_nodes[],
+	unsigned nnodes,
+	local_stat_t & local,
+	LIST_VARIANT<Node> & list
+) {
+	Node * nodes[nnodes];
+	{
+		unsigned i = 0;
+		for(auto & n : nodes) {
+			n = &initial_nodes[i++];
+		}
+	}
+
+	while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) {
+
+		for(Node * & node : nodes) {
+			local.crc_in += node->value;
+			list.push(node);
+			local.in++;
+		}
+
+		// -----
+
+		for(Node * & node : nodes) {
+			node = list.pop();
+			assert(node);
+			local.crc_out += node->value;
+			local.out++;
+		}
+	}
+}
+
+void runPingPong(unsigned nthread, unsigned nqueues, double duration, unsigned nnodes) {
+	std::cout << "PingPong Benchmark" << std::endl;
+
+
+	// Barrier for synchronization
+	barrier_t barrier(nthread + 1);
+
+	// Data to check everything is OK
+	global_stat_t global;
+
+	// Flag to signal termination
+	std::atomic_bool done  = { false };
+
+	std::cout << "Initializing ";
+	// List being tested
+	LIST_VARIANT<Node> list = { nthread, nqueues };
+	{
+		enable_stats = true;
+
+		std::thread * threads[nthread];
+		unsigned i = 1;
+		for(auto & t : threads) {
+			t = new std::thread([&done, &list, &barrier, &global, nnodes](unsigned tid) {
+				Random rand(tid + rdtscl());
+
+				Node nodes[nnodes];
+				for(auto & n : nodes) {
+					n.value = (int)rand.next() % 100;
+				}
+
+				local_stat_t local;
+
+				// affinity(tid);
+
+				barrier.wait(tid);
+
+				// EXPERIMENT START
+
+				runPingPong_body(done, nodes, nnodes, local, list);
+
+				// EXPERIMENT END
+
+				barrier.wait(tid);
+
+				tally_stats(global, local);
+			}, i++);
+		}
+
+		waitfor(duration, barrier, done);
+
+		for(auto t : threads) {
+			t->join();
+			delete t;
+		}
+
+		enable_stats = false;
+	}
+
+	print_stats(duration, nthread, global);
+}
+
+// ================================================================================================
+struct __attribute__((aligned(64))) Slot {
+	Node * volatile node;
+};
+
+__attribute__((noinline)) void runProducer_body(
+	std::atomic<bool>& done,
+	Random & rand,
+	Slot * slots,
+	int nslots,
+	local_stat_t & local,
+	LIST_VARIANT<Node> & list
+) {
+	while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) {
+
+		Node * node = list.pop();
+		if(!node) {
+			local.empty ++;
+			continue;
+		}
+
+		local.crc_out += node->value;
+		local.out++;
+
+		if(node->id == 0) {
+			unsigned cnt = 0;
+			for(int i = 0; i < nslots; i++) {
+				Node * found = __atomic_exchange_n( &slots[i].node, nullptr, __ATOMIC_SEQ_CST );
+				if( found ) {
+					local.crc_in += found->value;
+					local.in++;
+					cnt++;
+					list.push( found );
+				}
+			}
+
+			local.crc_in += node->value;
+			local.in++;
+			list.push( node );
+
+			local.comp.cnt++;
+			local.comp.val += cnt;
+		}
+		else {
+			unsigned len = 0;
+			while(true) {
+				auto off = rand.next();
+				for(int i = 0; i < nslots; i++) {
+					Node * expected = nullptr;
+					int idx = (i + off) % nslots;
+					Slot & slot = slots[ idx ];
+					if(
+						slot.node == nullptr &&
+						__atomic_compare_exchange_n( &slot.node, &expected, node, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST )
+					) {
+						local.subm.cnt++;
+						local.subm.val += len;
+						goto LOOP;
+					}
+					assert( expected != node );
+					len++;
+				}
+			}
+		}
+
+		LOOP:;
+	}
+}
+
+void runProducer(unsigned nthread, unsigned nqueues, double duration, unsigned nnodes) {
+	std::cout << "Producer Benchmark" << std::endl;
+
+	// Barrier for synchronization
+	barrier_t barrier(nthread + 1);
+
+	// Data to check everything is OK
+	global_stat_t global;
+
+	// Flag to signal termination
+	std::atomic_bool done  = { false };
+
+	std::cout << "Initializing ";
+
+	int nslots = nnodes * 4;
+	Slot * slots = new Slot[nslots];
+	std::cout << nnodes << " nodes (" << nslots << " slots)" << std::endl;
+
+	// List being tested
+	LIST_VARIANT<Node> list = { nthread, nqueues };
+	{
+		Random rand(rdtscl());
+		for(unsigned i = 0; i < nnodes; i++) {
+			Node * node = new Node(rand.next() % 100);
+			node->id = i;
+			global.crc_in += node->value;
+			list.push(node);
+		}
+
+		for(int i = 0; i < nslots; i++) {
+			slots[i].node = nullptr;
+		}
+	}
+
+	{
+		enable_stats = true;
+
+		std::thread * threads[nthread];
+		unsigned i = 1;
+		for(auto & t : threads) {
+			t = new std::thread([&done, &list, &barrier, &global, slots, nslots](unsigned tid) {
+				Random rand(tid + rdtscl());
+
+				local_stat_t local;
+				barrier.wait(tid);
+
+				// EXPERIMENT START
+
+				runProducer_body(done, rand, slots, nslots, local, list);
+
+				// EXPERIMENT END
+
+				barrier.wait(tid);
+
+				tally_stats(global, local);
+			}, i++);
+		}
+
+		waitfor(duration, barrier, done);
+
+		for(auto t : threads) {
+			t->join();
+			delete t;
+		}
+
+		enable_stats = false;
+	}
+
+	{
+		while(Node * node = list.pop()) {
+			global.crc_out += node->value;
+			delete node;
+		}
+
+		for(int i = 0; i < nslots; i++) {
+			delete slots[i].node;
+		}
+
+		delete [] slots;
+	}
+
+	print_stats(duration, nthread, global);
+}
+
+// ================================================================================================
+__attribute__((noinline)) void runFairness_body(
+	unsigned tid,
+	size_t width,
+	size_t length,
+	int output[],
+	std::atomic_size_t & count,
+	Node initial_nodes[],
+	unsigned nnodes,
+	local_stat_t & local,
+	LIST_VARIANT<Node> & list
+) {
+	Node * nodes[nnodes];
+	{
+		unsigned i = 0;
+		for(auto & n : nodes) {
+			n = &initial_nodes[i++];
+		}
+	}
+
+	while(__builtin_expect(0 != count.load(std::memory_order_relaxed), true)) {
+
+		for(Node * & node : nodes) {
+			local.crc_in += node->id;
+			list.push(node);
+			local.in++;
+		}
+
+		// -----
+
+		for(Node * & node : nodes) {
+			node = list.pop();
+			assert(node);
+
+			if (unsigned(node->value) < length) {
+				size_t idx = (node->value * width) + node->id;
+				assert(idx < (width * length));
+				output[idx] = tid;
+			}
+
+			node->value++;
+			if(unsigned(node->value) == length) count--;
+
+			local.crc_out += node->id;
+			local.out++;
+		}
+	}
+}
+
+void runFairness(unsigned nthread, unsigned nqueues, double duration, unsigned nnodes, const std::string & output) {
+	std::cout << "Fairness Benchmark, outputting to : " << output << std::endl;
+
+	// Barrier for synchronization
+	barrier_t barrier(nthread + 1);
+
+	// Data to check everything is OK
+	global_stat_t global;
+
+	std::cout << "Initializing ";
+
+	// Check fairness by creating a png of where the threads ran
+	size_t width = nthread * nnodes;
+	size_t length = 100000;
+
+	std::unique_ptr<int[]> data_out { new int[width * length] };
+
+	// Flag to signal termination
+	std::atomic_size_t count = width;
+
+	// List being tested
+	LIST_VARIANT<Node> list = { nthread, nqueues };
+	{
+		enable_stats = true;
+
+		std::thread * threads[nthread];
+		unsigned i = 1;
+		for(auto & t : threads) {
+			t = new std::thread([&count, &list, &barrier, &global, nnodes, width, length, data_out = data_out.get()](unsigned tid) {
+				unsigned int start = (tid - 1) * nnodes;
+				Node nodes[nnodes];
+				for(auto & n : nodes) {
+					n.id = start;
+					n.value = 0;
+					start++;
+				}
+
+				local_stat_t local;
+
+				// affinity(tid);
+
+				barrier.wait(tid);
+
+				// EXPERIMENT START
+
+				runFairness_body(tid, width, length, data_out, count, nodes, nnodes, local, list);
+
+				// EXPERIMENT END
+
+				barrier.wait(tid);
+
+				for(const auto & n : nodes) {
+					local.valmax = max(local.valmax, size_t(n.value));
+					local.valmin = min(local.valmin, size_t(n.value));
+				}
+
+				tally_stats(global, local);
+			}, i++);
+		}
+
+		waitfor(duration, barrier, count);
+
+		for(auto t : threads) {
+			t->join();
+			delete t;
+		}
+
+		enable_stats = false;
+	}
+
+	print_stats(duration, nthread, global);
+
+	// save_fairness(data_out.get(), 100, nthread, width, length, output);
+}
+
+// ================================================================================================
+
+bool iequals(const std::string& a, const std::string& b)
+{
+    return std::equal(a.begin(), a.end(),
+                      b.begin(), b.end(),
+                      [](char a, char b) {
+                          return std::tolower(a) == std::tolower(b);
+                      });
+}
+
+int main(int argc, char * argv[]) {
+
+	double duration   = 5.0;
+	unsigned nthreads = 2;
+	unsigned nqueues  = 4;
+	unsigned nnodes   = 100;
+	unsigned nslots   = 100;
+	std::string out   = "fairness.png";
+
+	enum {
+		Churn,
+		PingPong,
+		Producer,
+		Fairness,
+		NONE
+	} benchmark = NONE;
+
+	std::cout.imbue(std::locale(""));
+
+	for(;;) {
+		static struct option options[] = {
+			{"duration",  required_argument, 0, 'd'},
+			{"nthreads",  required_argument, 0, 't'},
+			{"nqueues",   required_argument, 0, 'q'},
+			{"benchmark", required_argument, 0, 'b'},
+			{0, 0, 0, 0}
+		};
+
+		int idx = 0;
+		int opt = getopt_long(argc, argv, "d:t:q:b:", options, &idx);
+
+		std::string arg = optarg ? optarg : "";
+		size_t len = 0;
+		switch(opt) {
+			// Exit Case
+			case -1:
+				/* paranoid */ assert(optind <= argc);
+				switch(benchmark) {
+				case NONE:
+					std::cerr << "Must specify a benchmark" << std::endl;
+					goto usage;
+				case PingPong:
+					nnodes = 1;
+					switch(argc - optind) {
+					case 0: break;
+					case 1:
+						try {
+							arg = optarg = argv[optind];
+							nnodes = stoul(optarg, &len);
+							if(len != arg.size()) { throw std::invalid_argument(""); }
+						} catch(std::invalid_argument &) {
+							std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl;
+							goto usage;
+						}
+						break;
+					default:
+						std::cerr << "'PingPong' benchmark doesn't accept more than 1 extra argument" << std::endl;
+						goto usage;
+					}
+					break;
+				case Producer:
+					nnodes = 32;
+					switch(argc - optind) {
+					case 0: break;
+					case 1:
+						try {
+							arg = optarg = argv[optind];
+							nnodes = stoul(optarg, &len);
+							if(len != arg.size()) { throw std::invalid_argument(""); }
+						} catch(std::invalid_argument &) {
+							std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl;
+							goto usage;
+						}
+						break;
+					default:
+						std::cerr << "'Producer' benchmark doesn't accept more than 1 extra argument" << std::endl;
+						goto usage;
+					}
+					break;
+				case Churn:
+					nnodes = 100;
+					nslots = 100;
+					switch(argc - optind) {
+					case 0: break;
+					case 1:
+						try {
+							arg = optarg = argv[optind];
+							nnodes = stoul(optarg, &len);
+							if(len != arg.size()) { throw std::invalid_argument(""); }
+							nslots = nnodes;
+						} catch(std::invalid_argument &) {
+							std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl;
+							goto usage;
+						}
+						break;
+					case 2:
+						try {
+							arg = optarg = argv[optind];
+							nnodes = stoul(optarg, &len);
+							if(len != arg.size()) { throw std::invalid_argument(""); }
+						} catch(std::invalid_argument &) {
+							std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl;
+							goto usage;
+						}
+						try {
+							arg = optarg = argv[optind + 1];
+							nslots = stoul(optarg, &len);
+							if(len != arg.size()) { throw std::invalid_argument(""); }
+						} catch(std::invalid_argument &) {
+							std::cerr << "Number of slots must be a positive integer, was " << arg << std::endl;
+							goto usage;
+						}
+						break;
+					default:
+						std::cerr << "'Churn' benchmark doesn't accept more than 2 extra arguments" << std::endl;
+						goto usage;
+					}
+					break;
+				case Fairness:
+					nnodes = 1;
+					switch(argc - optind) {
+					case 0: break;
+					case 1:
+						arg = optarg = argv[optind];
+						out = arg;
+						break;
+					default:
+						std::cerr << "'Fairness' benchmark doesn't accept more than 1 extra argument" << std::endl;
+						goto usage;
+					}
+				}
+				goto run;
+			// Benchmarks
+			case 'b':
+				if(benchmark != NONE) {
+					std::cerr << "Only one benchmark can be run" << std::endl;
+					goto usage;
+				}
+				if(iequals(arg, "churn")) {
+					benchmark = Churn;
+					break;
+				}
+				if(iequals(arg, "pingpong")) {
+					benchmark = PingPong;
+					break;
+				}
+				if(iequals(arg, "producer")) {
+					benchmark = Producer;
+					break;
+				}
+				if(iequals(arg, "fairness")) {
+					benchmark = Fairness;
+					break;
+				}
+				std::cerr << "Unknown benchmark " << arg << std::endl;
+				goto usage;
+			// Numeric Arguments
+			case 'd':
+				try {
+					duration = stod(optarg, &len);
+					if(len != arg.size()) { throw std::invalid_argument(""); }
+				} catch(std::invalid_argument &) {
+					std::cerr << "Duration must be a valid double, was " << arg << std::endl;
+					goto usage;
+				}
+				break;
+			case 't':
+				try {
+					nthreads = stoul(optarg, &len);
+					if(len != arg.size()) { throw std::invalid_argument(""); }
+				} catch(std::invalid_argument &) {
+					std::cerr << "Number of threads must be a positive integer, was " << arg << std::endl;
+					goto usage;
+				}
+				break;
+			case 'q':
+				try {
+					nqueues = stoul(optarg, &len);
+					if(len != arg.size()) { throw std::invalid_argument(""); }
+				} catch(std::invalid_argument &) {
+					std::cerr << "Number of queues must be a positive integer, was " << arg << std::endl;
+					goto usage;
+				}
+				break;
+			// Other cases
+			default: /* ? */
+				std::cerr << opt << std::endl;
+			usage:
+				std::cerr << "Usage: " << argv[0] << ": [options] -b churn [NNODES] [NSLOTS = NNODES]" << std::endl;
+				std::cerr << "  or:  " << argv[0] << ": [options] -b pingpong [NNODES]" << std::endl;
+				std::cerr << "  or:  " << argv[0] << ": [options] -b producer [NNODES]" << std::endl;
+				std::cerr << std::endl;
+				std::cerr << "  -d, --duration=DURATION  Duration of the experiment, in seconds" << std::endl;
+				std::cerr << "  -t, --nthreads=NTHREADS  Number of kernel threads" << std::endl;
+				std::cerr << "  -q, --nqueues=NQUEUES    Number of queues per threads" << std::endl;
+				std::exit(1);
+		}
+	}
+	run:
+
+	check_cache_line_size();
+
+	std::cout << "Running " << nthreads << " threads (" << (nthreads * nqueues) << " queues) for " << duration << " seconds" << std::endl;
+	std::cout << "Relaxed list variant: " << LIST_VARIANT<Node>::name() << std::endl;
+	switch(benchmark) {
+		case Churn:
+			runChurn(nthreads, nqueues, duration, nnodes, nslots);
+			break;
+		case PingPong:
+			runPingPong(nthreads, nqueues, duration, nnodes);
+			break;
+		case Producer:
+			runProducer(nthreads, nqueues, duration, nnodes);
+			break;
+		case Fairness:
+			runFairness(nthreads, nqueues, duration, nnodes, out);
+			break;
+		default:
+			abort();
+	}
+	return 0;
+}
+
+const char * __my_progname = "Relaxed List";
+
+struct rgb_t {
+    double r;       // a fraction between 0 and 1
+    double g;       // a fraction between 0 and 1
+    double b;       // a fraction between 0 and 1
+};
+
+struct hsv_t {
+    double h;       // angle in degrees
+    double s;       // a fraction between 0 and 1
+    double v;       // a fraction between 0 and 1
+};
+
+rgb_t hsv2rgb(hsv_t in) {
+	double hh, p, q, t, ff;
+	long   i;
+	rgb_t  out;
+
+	if(in.s <= 0.0) {       // < is bogus, just shuts up warnings
+		out.r = in.v;
+		out.g = in.v;
+		out.b = in.v;
+		return out;
+	}
+	hh = in.h;
+	if(hh >= 360.0) hh = 0.0;
+	hh /= 60.0;
+	i = (long)hh;
+	ff = hh - i;
+	p = in.v * (1.0 - in.s);
+	q = in.v * (1.0 - (in.s * ff));
+	t = in.v * (1.0 - (in.s * (1.0 - ff)));
+
+	switch(i) {
+	case 0:
+		out.r = in.v;
+		out.g = t;
+		out.b = p;
+		break;
+	case 1:
+		out.r = q;
+		out.g = in.v;
+		out.b = p;
+		break;
+	case 2:
+		out.r = p;
+		out.g = in.v;
+		out.b = t;
+		break;
+
+	case 3:
+		out.r = p;
+		out.g = q;
+		out.b = in.v;
+		break;
+	case 4:
+		out.r = t;
+		out.g = p;
+		out.b = in.v;
+		break;
+	case 5:
+	default:
+		out.r = in.v;
+		out.g = p;
+		out.b = q;
+		break;
+	}
+	return out;
+}
+
+// void save_fairness(const int data[], int factor, unsigned nthreads, size_t columns, size_t rows, const std::string & output) {
+// 	std::ofstream os(output);
+// 	os << "<html>\n";
+// 	os << "<head>\n";
+// 	os << "<style>\n";
+// 	os << "</style>\n";
+// 	os << "</head>\n";
+// 	os << "<body>\n";
+// 	os << "<table style=\"width=100%\">\n";
+
+// 	size_t idx = 0;
+// 	for(size_t r = 0ul; r < rows; r++) {
+// 		os << "<tr>\n";
+// 		for(size_t c = 0ul; c < columns; c++) {
+// 			os << "<td class=\"custom custom" << data[idx] << "\"></td>\n";
+// 			idx++;
+// 		}
+// 		os << "</tr>\n";
+// 	}
+
+// 	os << "</table>\n";
+// 	os << "</body>\n";
+// 	os << "</html>\n";
+// 	os << std::endl;
+// }
+
+// #include <png.h>
+// #include <setjmp.h>
+
+/*
+void save_fairness(const int data[], int factor, unsigned nthreads, size_t columns, size_t rows, const std::string & output) {
+	int width  = columns * factor;
+	int height = rows / factor;
+
+	int code = 0;
+	int idx = 0;
+	FILE *fp = NULL;
+	png_structp png_ptr = NULL;
+	png_infop info_ptr = NULL;
+	png_bytep row = NULL;
+
+	// Open file for writing (binary mode)
+	fp = fopen(output.c_str(), "wb");
+	if (fp == NULL) {
+		fprintf(stderr, "Could not open file %s for writing\n", output.c_str());
+		code = 1;
+		goto finalise;
+	}
+
+	   // Initialize write structure
+	png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
+	if (png_ptr == NULL) {
+		fprintf(stderr, "Could not allocate write struct\n");
+		code = 1;
+		goto finalise;
+	}
+
+	// Initialize info structure
+	info_ptr = png_create_info_struct(png_ptr);
+	if (info_ptr == NULL) {
+		fprintf(stderr, "Could not allocate info struct\n");
+		code = 1;
+		goto finalise;
+	}
+
+	// Setup Exception handling
+	if (setjmp(png_jmpbuf(png_ptr))) {
+		fprintf(stderr, "Error during png creation\n");
+		code = 1;
+		goto finalise;
+	}
+
+	png_init_io(png_ptr, fp);
+
+	// Write header (8 bit colour depth)
+	png_set_IHDR(png_ptr, info_ptr, width, height,
+		8, PNG_COLOR_TYPE_RGB, PNG_INTERLACE_NONE,
+		PNG_COMPRESSION_TYPE_BASE, PNG_FILTER_TYPE_BASE);
+
+	png_write_info(png_ptr, info_ptr);
+
+	// Allocate memory for one row (3 bytes per pixel - RGB)
+	row = (png_bytep) malloc(3 * width * sizeof(png_byte));
+
+	// Write image data
+	int x, y;
+	for (y=0 ; y<height ; y++) {
+		for (x=0 ; x<width ; x++) {
+			auto & r = row[(x * 3) + 0];
+			auto & g = row[(x * 3) + 1];
+			auto & b = row[(x * 3) + 2];
+			assert(idx < (rows * columns));
+			int color = data[idx] - 1;
+			assert(color < nthreads);
+			assert(color >= 0);
+			idx++;
+
+			double angle = double(color) / double(nthreads);
+
+			auto c = hsv2rgb({ 360.0 * angle, 0.8, 0.8 });
+
+			r = char(c.r * 255.0);
+			g = char(c.g * 255.0);
+			b = char(c.b * 255.0);
+
+		}
+		png_write_row(png_ptr, row);
+	}
+
+	assert(idx == (rows * columns));
+
+	// End write
+	png_write_end(png_ptr, NULL);
+
+	finalise:
+	if (fp != NULL) fclose(fp);
+	if (info_ptr != NULL) png_free_data(png_ptr, info_ptr, PNG_FREE_ALL, -1);
+	if (png_ptr != NULL) png_destroy_write_struct(&png_ptr, (png_infopp)NULL);
+	if (row != NULL) free(row);
+}
+*/
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/relaxed_list.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/relaxed_list.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/relaxed_list.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,555 @@
+#pragma once
+#define LIST_VARIANT relaxed_list
+
+#define VANILLA 0
+#define SNZI 1
+#define BITMASK 2
+#define DISCOVER 3
+#define SNZM 4
+#define BIAS 5
+#define BACK 6
+#define BACKBIAS 7
+
+#ifndef VARIANT
+#define VARIANT VANILLA
+#endif
+
+#ifndef NO_STATS
+#include <iostream>
+#endif
+
+#include <cmath>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <thread>
+#include <type_traits>
+
+#include "assert.hpp"
+#include "utils.hpp"
+#include "links.hpp"
+#include "snzi.hpp"
+#include "snzi-packed.hpp"
+#include "snzm.hpp"
+
+using namespace std;
+
+struct pick_stat {
+	struct {
+		size_t attempt = 0;
+		size_t success = 0;
+		size_t local = 0;
+	} push;
+	struct {
+		size_t attempt = 0;
+		size_t success = 0;
+		size_t mask_attempt = 0;
+		size_t mask_reset = 0;
+		size_t local = 0;
+	} pop;
+};
+
+struct empty_stat {
+	struct {
+		size_t value = 0;
+		size_t count = 0;
+	} push;
+	struct {
+		size_t value = 0;
+		size_t count = 0;
+	} pop;
+};
+
+template<typename node_t>
+class __attribute__((aligned(128))) relaxed_list {
+	static_assert(std::is_same<decltype(node_t::_links), _LinksFields_t<node_t>>::value, "Node must have a links field");
+
+public:
+	static const char * name() {
+		const char * names[] = {
+			"RELAXED: VANILLA",
+			"RELAXED: SNZI",
+			"RELAXED: BITMASK",
+			"RELAXED: SNZI + DISCOVERED MASK",
+			"RELAXED: SNZI + MASK",
+			"RELAXED: SNZI + LOCAL BIAS",
+			"RELAXED: SNZI + REVERSE RNG",
+			"RELAXED: SNZI + LOCAL BIAS + REVERSE RNG"
+		};
+		return names[VARIANT];
+	}
+
+	relaxed_list(unsigned numThreads, unsigned numQueues)
+		: numLists(numThreads * numQueues)
+	  	, lists(new intrusive_queue_t<node_t>[numLists])
+		#if VARIANT == SNZI || VARIANT == BACK
+			, snzi( std::log2( numLists / (2 * numQueues) ), 2 )
+		#elif VARIANT == BIAS || VARIANT == BACKBIAS
+			#ifdef SNZI_PACKED
+				, snzi( std::ceil( std::log2(numLists) ) )
+			#else
+				, snzi( std::log2( numLists / (2 * numQueues) ), 2 )
+			#endif
+		#elif VARIANT == SNZM || VARIANT == DISCOVER
+			, snzm( numLists )
+		#endif
+	{
+		assertf(7 * 8 * 8 >= numLists, "List currently only supports 448 sublists");
+		std::cout << "Constructing Relaxed List with " << numLists << std::endl;
+	}
+
+	~relaxed_list() {
+		std::cout << "Destroying Relaxed List" << std::endl;
+		lists.reset();
+	}
+
+    	__attribute__((noinline, hot)) void push(node_t * node) {
+		node->_links.ts = rdtscl();
+
+		while(true) {
+			// Pick a random list
+			unsigned i = idx_from_r(tls.rng1.next(), VARIANT == BIAS || VARIANT == BACKBIAS);
+
+			#ifndef NO_STATS
+				tls.pick.push.attempt++;
+			#endif
+
+			// If we can't lock it retry
+			if( !lists[i].lock.try_lock() ) continue;
+
+			#if VARIANT == VANILLA || VARIANT == BITMASK
+				__attribute__((unused)) int num = numNonEmpty;
+			#endif
+
+			// Actually push it
+			if(lists[i].push(node)) {
+				#if VARIANT == DISCOVER
+					size_t qword = i >> 6ull;
+					size_t bit   = i & 63ull;
+					assert(qword == 0);
+					bts(tls.mask, bit);
+					snzm.arrive(i);
+				#elif VARIANT == SNZI || VARIANT == BIAS
+					snzi.arrive(i);
+				#elif VARIANT == BACK || VARIANT == BACKBIAS
+					snzi.arrive(i);
+					tls.rng2.set_raw_state( tls.rng1.get_raw_state());
+				#elif VARIANT == SNZM
+					snzm.arrive(i);
+				#elif VARIANT == BITMASK
+					numNonEmpty++;
+					size_t qword = i >> 6ull;
+					size_t bit   = i & 63ull;
+					assertf((list_mask[qword] & (1ul << bit)) == 0, "Before set %zu:%zu (%u), %zx & %zx", qword, bit, i, list_mask[qword].load(), (1ul << bit));
+					__attribute__((unused)) bool ret = bts(list_mask[qword], bit);
+					assert(!ret);
+					assertf((list_mask[qword] & (1ul << bit)) != 0, "After set %zu:%zu (%u), %zx & %zx", qword, bit, i, list_mask[qword].load(), (1ul << bit));
+				#else
+					numNonEmpty++;
+				#endif
+			}
+			#if VARIANT == VANILLA || VARIANT == BITMASK
+				assert(numNonEmpty <= (int)numLists);
+			#endif
+
+			// Unlock and return
+			lists[i].lock.unlock();
+
+			#ifndef NO_STATS
+				tls.pick.push.success++;
+				#if VARIANT == VANILLA || VARIANT == BITMASK
+					tls.empty.push.value += num;
+					tls.empty.push.count += 1;
+				#endif
+			#endif
+			return;
+		}
+    	}
+
+	__attribute__((noinline, hot)) node_t * pop() {
+		#if VARIANT == DISCOVER
+			assert(numLists <= 64);
+			while(snzm.query()) {
+				tls.pick.pop.mask_attempt++;
+				unsigned i, j;
+				{
+					// Pick first list totally randomly
+					i = tls.rng1.next() % numLists;
+
+					// Pick the other according to the bitmask
+					unsigned r = tls.rng1.next();
+
+					size_t mask = tls.mask.load(std::memory_order_relaxed);
+					if(mask == 0) {
+						tls.pick.pop.mask_reset++;
+						mask = (1ull << numLists) - 1;
+						tls.mask.store(mask, std::memory_order_relaxed);
+					}
+
+					unsigned b = rand_bit(r, mask);
+
+					assertf(b < 64, "%zu %u", mask, b);
+
+					j = b;
+
+					assert(j < numLists);
+				}
+
+				if(auto node = try_pop(i, j)) return node;
+			}
+		#elif VARIANT == SNZI
+			while(snzi.query()) {
+				// Pick two lists at random
+				int i = tls.rng1.next() % numLists;
+				int j = tls.rng1.next() % numLists;
+
+				if(auto node = try_pop(i, j)) return node;
+			}
+
+		#elif VARIANT == BACK
+			while(snzi.query()) {
+				// Pick two lists at random
+				int i = tls.rng2.prev() % numLists;
+				int j = tls.rng2.prev() % numLists;
+
+				if(auto node = try_pop(i, j)) return node;
+			}
+
+		#elif VARIANT == BACKBIAS
+			while(snzi.query()) {
+				// Pick two lists at random
+				int i = idx_from_r(tls.rng2.prev(), true);
+				int j = idx_from_r(tls.rng2.prev(), true);
+
+				if(auto node = try_pop(i, j)) return node;
+			}
+
+		#elif VARIANT == BIAS
+			while(snzi.query()) {
+				// Pick two lists at random
+				unsigned ri = tls.rng1.next();
+				unsigned i;
+				unsigned j = tls.rng1.next();
+				if(0 == (ri & 0xF)) {
+					i = (ri >> 4) % numLists;
+				} else {
+					i = tls.my_queue + ((ri >> 4) % 4);
+					j = tls.my_queue + ((j >> 4) % 4);
+					tls.pick.pop.local++;
+				}
+				i %= numLists;
+				j %= numLists;
+
+				if(auto node = try_pop(i, j)) return node;
+			}
+		#elif VARIANT == SNZM
+			//*
+			while(snzm.query()) {
+				tls.pick.pop.mask_attempt++;
+				unsigned i, j;
+				{
+					// Pick two random number
+					unsigned ri = tls.rng1.next();
+					unsigned rj = tls.rng1.next();
+
+					// Pick two nodes from it
+					unsigned wdxi = ri & snzm.mask;
+					unsigned wdxj = rj & snzm.mask;
+
+					// Get the masks from the nodes
+					size_t maski = snzm.masks(wdxi);
+					size_t maskj = snzm.masks(wdxj);
+
+					if(maski == 0 && maskj == 0) continue;
+
+					#if defined(__BMI2__)
+						uint64_t idxsi = _pext_u64(snzm.indexes, maski);
+						uint64_t idxsj = _pext_u64(snzm.indexes, maskj);
+
+						auto pi = __builtin_popcountll(maski);
+						auto pj = __builtin_popcountll(maskj);
+
+						ri = pi ? ri & ((pi >> 3) - 1) : 0;
+						rj = pj ? rj & ((pj >> 3) - 1) : 0;
+
+						unsigned bi = (idxsi >> (ri << 3)) & 0xff;
+						unsigned bj = (idxsj >> (rj << 3)) & 0xff;
+					#else
+						unsigned bi = rand_bit(ri >> snzm.depth, maski);
+						unsigned bj = rand_bit(rj >> snzm.depth, maskj);
+					#endif
+
+					i = (bi << snzm.depth) | wdxi;
+					j = (bj << snzm.depth) | wdxj;
+
+					/* paranoid */ assertf(i < numLists, "%u %u", bj, wdxi);
+					/* paranoid */ assertf(j < numLists, "%u %u", bj, wdxj);
+				}
+
+				if(auto node = try_pop(i, j)) return node;
+			}
+			/*/
+			while(snzm.query()) {
+				// Pick two lists at random
+				int i = tls.rng1.next() % numLists;
+				int j = tls.rng1.next() % numLists;
+
+				if(auto node = try_pop(i, j)) return node;
+			}
+			//*/
+		#elif VARIANT == BITMASK
+			int nnempty;
+			while(0 != (nnempty = numNonEmpty)) {
+				tls.pick.pop.mask_attempt++;
+				unsigned i, j;
+				{
+					// Pick two lists at random
+					unsigned num = ((numLists - 1) >> 6) + 1;
+
+					unsigned ri = tls.rng1.next();
+					unsigned rj = tls.rng1.next();
+
+					unsigned wdxi = (ri >> 6u) % num;
+					unsigned wdxj = (rj >> 6u) % num;
+
+					size_t maski = list_mask[wdxi].load(std::memory_order_relaxed);
+					size_t maskj = list_mask[wdxj].load(std::memory_order_relaxed);
+
+					if(maski == 0 && maskj == 0) continue;
+
+					unsigned bi = rand_bit(ri, maski);
+					unsigned bj = rand_bit(rj, maskj);
+
+					assertf(bi < 64, "%zu %u", maski, bi);
+					assertf(bj < 64, "%zu %u", maskj, bj);
+
+					i = bi | (wdxi << 6);
+					j = bj | (wdxj << 6);
+
+					assertf(i < numLists, "%u", wdxi << 6);
+					assertf(j < numLists, "%u", wdxj << 6);
+				}
+
+				if(auto node = try_pop(i, j)) return node;
+			}
+		#else
+			while(numNonEmpty != 0) {
+				// Pick two lists at random
+				int i = tls.rng1.next() % numLists;
+				int j = tls.rng1.next() % numLists;
+
+				if(auto node = try_pop(i, j)) return node;
+			}
+		#endif
+
+		return nullptr;
+    	}
+
+private:
+	node_t * try_pop(unsigned i, unsigned j) {
+		#ifndef NO_STATS
+			tls.pick.pop.attempt++;
+		#endif
+
+		#if VARIANT == DISCOVER
+			if(lists[i].ts() > 0) bts(tls.mask, i); else btr(tls.mask, i);
+			if(lists[j].ts() > 0) bts(tls.mask, j); else btr(tls.mask, j);
+		#endif
+
+		// Pick the best list
+		int w = i;
+		if( __builtin_expect(lists[j].ts() != 0, true) ) {
+			w = (lists[i].ts() < lists[j].ts()) ? i : j;
+		}
+
+		auto & list = lists[w];
+		// If list looks empty retry
+		if( list.ts() == 0 ) return nullptr;
+
+		// If we can't get the lock retry
+		if( !list.lock.try_lock() ) return nullptr;
+
+		#if VARIANT == VANILLA || VARIANT == BITMASK
+			__attribute__((unused)) int num = numNonEmpty;
+		#endif
+
+		// If list is empty, unlock and retry
+		if( list.ts() == 0 ) {
+			list.lock.unlock();
+			return nullptr;
+		}
+
+		// Actually pop the list
+		node_t * node;
+		bool emptied;
+		std::tie(node, emptied) = list.pop();
+		assert(node);
+
+		if(emptied) {
+			#if VARIANT == DISCOVER
+				size_t qword = w >> 6ull;
+				size_t bit   = w & 63ull;
+				assert(qword == 0);
+				__attribute__((unused)) bool ret = btr(tls.mask, bit);
+				snzm.depart(w);
+			#elif VARIANT == SNZI || VARIANT == BIAS || VARIANT == BACK || VARIANT == BACKBIAS
+				snzi.depart(w);
+			#elif VARIANT == SNZM
+				snzm.depart(w);
+			#elif VARIANT == BITMASK
+				numNonEmpty--;
+				size_t qword = w >> 6ull;
+				size_t bit   = w & 63ull;
+				assert((list_mask[qword] & (1ul << bit)) != 0);
+				__attribute__((unused)) bool ret = btr(list_mask[qword], bit);
+				assert(ret);
+				assert((list_mask[qword] & (1ul << bit)) == 0);
+			#else
+				numNonEmpty--;
+			#endif
+		}
+
+		// Unlock and return
+		list.lock.unlock();
+		#if VARIANT == VANILLA || VARIANT == BITMASK
+			assert(numNonEmpty >= 0);
+		#endif
+		#ifndef NO_STATS
+			tls.pick.pop.success++;
+			#if VARIANT == VANILLA || VARIANT == BITMASK
+				tls.empty.pop.value += num;
+				tls.empty.pop.count += 1;
+			#endif
+		#endif
+		return node;
+	}
+
+	inline unsigned idx_from_r(unsigned r, bool bias) {
+		unsigned i;
+		if(bias) {
+			if(0 == (r & 0x3F)) {
+				i = r >> 6;
+			} else {
+				i = tls.my_queue + ((r >> 6) % 4);
+				tls.pick.push.local++;
+			}
+		} else {
+			i = r;
+		}
+		return i % numLists;
+	}
+
+public:
+
+	static __attribute__((aligned(128))) thread_local struct TLS {
+		Random     rng1 = { unsigned(std::hash<std::thread::id>{}(std::this_thread::get_id()) ^ rdtscl()) };
+		Random     rng2 = { unsigned(std::hash<std::thread::id>{}(std::this_thread::get_id()) ^ rdtscl()) };
+		unsigned   my_queue = (ticket++) * 4;
+		pick_stat  pick;
+		empty_stat empty;
+		__attribute__((aligned(64))) std::atomic_size_t mask = { 0 };
+	} tls;
+
+private:
+	const unsigned numLists;
+    	__attribute__((aligned(64))) std::unique_ptr<intrusive_queue_t<node_t> []> lists;
+private:
+	#if VARIANT == SNZI || VARIANT == BACK
+		snzi_t snzi;
+	#elif VARIANT == BIAS || VARIANT == BACKBIAS
+		#ifdef SNZI_PACKED
+			snzip_t snzi;
+		#else
+			snzi_t snzi;
+		#endif
+	#elif VARIANT == SNZM || VARIANT == DISCOVER
+		snzm_t snzm;
+	#else
+		std::atomic_int numNonEmpty  = { 0 };  // number of non-empty lists
+	#endif
+	#if VARIANT == BITMASK
+		std::atomic_size_t list_mask[7] = { {0}, {0}, {0}, {0}, {0}, {0}, {0} }; // which queues are empty
+	#endif
+
+public:
+	static const constexpr size_t sizeof_queue = sizeof(intrusive_queue_t<node_t>);
+	static std::atomic_uint32_t ticket;
+
+#ifndef NO_STATS
+	static void stats_tls_tally() {
+		global_stats.pick.push.attempt += tls.pick.push.attempt;
+		global_stats.pick.push.success += tls.pick.push.success;
+		global_stats.pick.push.local += tls.pick.push.local;
+		global_stats.pick.pop .attempt += tls.pick.pop.attempt;
+		global_stats.pick.pop .success += tls.pick.pop.success;
+		global_stats.pick.pop .mask_attempt += tls.pick.pop.mask_attempt;
+		global_stats.pick.pop .mask_reset += tls.pick.pop.mask_reset;
+		global_stats.pick.pop .local += tls.pick.pop.local;
+
+		global_stats.qstat.push.value += tls.empty.push.value;
+		global_stats.qstat.push.count += tls.empty.push.count;
+		global_stats.qstat.pop .value += tls.empty.pop .value;
+		global_stats.qstat.pop .count += tls.empty.pop .count;
+	}
+
+private:
+	static struct GlobalStats {
+		struct {
+			struct {
+				std::atomic_size_t attempt = { 0 };
+				std::atomic_size_t success = { 0 };
+				std::atomic_size_t local = { 0 };
+			} push;
+			struct {
+				std::atomic_size_t attempt = { 0 };
+				std::atomic_size_t success = { 0 };
+				std::atomic_size_t mask_attempt = { 0 };
+				std::atomic_size_t mask_reset = { 0 };
+				std::atomic_size_t local = { 0 };
+			} pop;
+		} pick;
+		struct {
+			struct {
+				std::atomic_size_t value = { 0 };
+				std::atomic_size_t count = { 0 };
+			} push;
+			struct {
+				std::atomic_size_t value = { 0 };
+				std::atomic_size_t count = { 0 };
+			} pop;
+		} qstat;
+	} global_stats;
+
+public:
+	static void stats_print(std::ostream & os ) {
+		std::cout << "----- Relaxed List Stats -----" << std::endl;
+
+		const auto & global = global_stats;
+
+		double push_sur = (100.0 * double(global.pick.push.success) / global.pick.push.attempt);
+		double pop_sur  = (100.0 * double(global.pick.pop .success) / global.pick.pop .attempt);
+		double mpop_sur = (100.0 * double(global.pick.pop .success) / global.pick.pop .mask_attempt);
+		double rpop_sur = (100.0 * double(global.pick.pop .success) / global.pick.pop .mask_reset);
+
+		double push_len = double(global.pick.push.attempt     ) / global.pick.push.success;
+		double pop_len  = double(global.pick.pop .attempt     ) / global.pick.pop .success;
+		double mpop_len = double(global.pick.pop .mask_attempt) / global.pick.pop .success;
+		double rpop_len = double(global.pick.pop .mask_reset  ) / global.pick.pop .success;
+
+		os << "Push   Pick   : " << push_sur << " %, len " << push_len << " (" << global.pick.push.attempt      << " / " << global.pick.push.success << ")\n";
+		os << "Pop    Pick   : " << pop_sur  << " %, len " << pop_len  << " (" << global.pick.pop .attempt      << " / " << global.pick.pop .success << ")\n";
+		os << "TryPop Pick   : " << mpop_sur << " %, len " << mpop_len << " (" << global.pick.pop .mask_attempt << " / " << global.pick.pop .success << ")\n";
+		os << "Pop M Reset   : " << rpop_sur << " %, len " << rpop_len << " (" << global.pick.pop .mask_reset   << " / " << global.pick.pop .success << ")\n";
+
+		double avgQ_push = double(global.qstat.push.value) / global.qstat.push.count;
+		double avgQ_pop  = double(global.qstat.pop .value) / global.qstat.pop .count;
+		double avgQ      = double(global.qstat.push.value + global.qstat.pop .value) / (global.qstat.push.count + global.qstat.pop .count);
+		os << "Push   Avg Qs : " << avgQ_push << " (" << global.qstat.push.count << "ops)\n";
+		os << "Pop    Avg Qs : " << avgQ_pop  << " (" << global.qstat.pop .count << "ops)\n";
+		os << "Global Avg Qs : " << avgQ      << " (" << (global.qstat.push.count + global.qstat.pop .count) << "ops)\n";
+
+		os << "Local Push    : " << global.pick.push.local << "\n";
+		os << "Local Pop     : " << global.pick.pop .local << "\n";
+	}
+#endif
+};
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/relaxed_list_layout.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/relaxed_list_layout.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/relaxed_list_layout.cpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,23 @@
+#define NO_IO
+#define NDEBUG
+#include "relaxed_list.hpp"
+
+struct __attribute__((aligned(64))) Node {
+	static std::atomic_size_t creates;
+	static std::atomic_size_t destroys;
+
+	_LinksFields_t<Node> _links;
+
+	int value;
+	Node(int value): value(value) {
+		creates++;
+	}
+
+	~Node() {
+		destroys++;
+	}
+};
+
+int main() {
+	return sizeof(relaxed_list<Node>) + relaxed_list<Node>::sizeof_queue;
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/runperf.sh
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/runperf.sh	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/runperf.sh	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -e
+
+name=$1
+event=$2
+
+shift 2
+
+echo "perf record -F 99 -a -g -o raw/$name.data -e $event -- $@ > raw/$name.out"
+perf record -F 99 -a -g -o raw/$name.data -e $event -- "$@" > raw/$name.out
+echo "=============================="
+cat raw/$name.out
+echo "=============================="
+./process.sh $name
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/scale.sh
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/scale.sh	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/scale.sh	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,7 @@
+#!/bin/bash
+taskset -c 24-31 ./a.out -t  1 -b churn | grep --color -E "(ns|Ops|Running)"
+taskset -c 24-31 ./a.out -t  2 -b churn | grep --color -E "(ns|Ops|Running)"
+taskset -c 24-31 ./a.out -t  4 -b churn | grep --color -E "(ns|Ops|Running)"
+taskset -c 24-31 ./a.out -t  8 -b churn | grep --color -E "(ns|Ops|Running)"
+taskset -c 16-31 ./a.out -t 16 -b churn | grep --color -E "(ns|Ops|Running)"
+taskset -c  0-31 ./a.out -t 32 -b churn | grep --color -E "(ns|Ops|Running)"
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/snzi-packed.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/snzi-packed.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/snzi-packed.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,179 @@
+#pragma once
+
+#define SNZI_PACKED
+
+#include "utils.hpp"
+
+
+class snzip_t {
+	class node;
+	class node_aligned;
+public:
+	const unsigned mask;
+	const int root;
+	std::unique_ptr<snzip_t::node[]> leafs;
+	std::unique_ptr<snzip_t::node_aligned[]> nodes;
+
+	snzip_t(unsigned depth);
+
+	void arrive(int idx) {
+		// idx >>= 1;
+		idx %= mask;
+		leafs[idx].arrive();
+	}
+
+	void depart(int idx) {
+		// idx >>= 1;
+		idx %= mask;
+		leafs[idx].depart();
+	}
+
+	bool query() const {
+		return nodes[root].query();
+	}
+
+
+private:
+	class __attribute__((aligned(32))) node {
+		friend class snzip_t;
+	private:
+
+		union val_t {
+			static constexpr char Half = -1;
+
+			uint64_t _all;
+			struct __attribute__((packed)) {
+				char cnt;
+				uint64_t ver:56;
+			};
+
+			bool cas(val_t & exp, char _cnt, uint64_t _ver) volatile {
+				val_t t;
+				t.ver = _ver;
+				t.cnt = _cnt;
+				/* paranoid */ assert(t._all == ((_ver << 8) | ((unsigned char)_cnt)));
+				return __atomic_compare_exchange_n(&this->_all, &exp._all, t._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+			}
+
+			bool cas(val_t & exp, const val_t & tar) volatile {
+				return __atomic_compare_exchange_n(&this->_all, &exp._all, tar._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+			}
+
+			val_t() : _all(0) {}
+			val_t(const volatile val_t & o) : _all(o._all) {}
+		};
+
+		//--------------------------------------------------
+		// Hierarchical node
+		void arrive_h() {
+			int undoArr = 0;
+			bool success = false;
+			while(!success) {
+				auto x{ value };
+				/* paranoid */ assert(x.cnt <= 120);
+				if( x.cnt >= 1 ) {
+					if( value.cas(x, x.cnt + 1, x.ver ) ) {
+						success = true;
+					}
+				}
+				/* paranoid */ assert(x.cnt <= 120);
+				if( x.cnt == 0 ) {
+					if( value.cas(x, val_t::Half, x.ver + 1) ) {
+						success = true;
+						x.cnt = val_t::Half;
+						x.ver = x.ver + 1;
+					}
+				}
+				/* paranoid */ assert(x.cnt <= 120);
+				if( x.cnt == val_t::Half ) {
+					/* paranoid */ assert(parent);
+					if(undoArr == 2) {
+						undoArr--;
+					} else {
+						parent->arrive();
+					}
+					if( !value.cas(x, 1, x.ver) ) {
+						undoArr = undoArr + 1;
+					}
+				}
+			}
+
+			for(int i = 0; i < undoArr; i++) {
+				/* paranoid */ assert(parent);
+				parent->depart();
+			}
+		}
+
+		void depart_h() {
+			while(true) {
+				auto x = (const val_t)value;
+				/* paranoid */ assertf(x.cnt >= 1, "%d", x.cnt);
+				if( value.cas( x, x.cnt - 1, x.ver ) ) {
+					if( x.cnt == 1 ) {
+						/* paranoid */ assert(parent);
+						parent->depart();
+					}
+					return;
+				}
+			}
+		}
+
+		//--------------------------------------------------
+		// Root node
+		void arrive_r() {
+			__atomic_fetch_add(&value._all, 1, __ATOMIC_SEQ_CST);
+		}
+
+		void depart_r() {
+			__atomic_fetch_sub(&value._all, 1, __ATOMIC_SEQ_CST);
+		}
+
+	private:
+		volatile val_t value;
+		class node * parent = nullptr;
+
+		bool is_root() {
+			return parent == nullptr;
+		}
+
+	public:
+		void arrive() {
+			if(is_root()) arrive_r();
+			else arrive_h();
+		}
+
+		void depart() {
+			if(is_root()) depart_r();
+			else depart_h();
+		}
+
+		bool query() {
+			/* paranoid */ assert(is_root());
+			return value._all > 0;
+		}
+	};
+
+	class __attribute__((aligned(128))) node_aligned : public node {};
+};
+
+snzip_t::snzip_t(unsigned depth)
+	: mask( std::pow(2, depth) )
+	, root( ((std::pow(2, depth + 1) - 1) / (2 -1)) - 1 - mask )
+	, leafs(new node[ mask ]())
+	, nodes(new node_aligned[ root + 1 ]())
+{
+	int width = std::pow(2, depth);
+	int hwidth = width / 2;
+	std::cout << "SNZI: " << depth << "x" << width << "(" << mask - 1 << ") " << (sizeof(snzip_t::node) * (root + 1)) << " bytes" << std::endl;
+	for(int i = 0; i < width; i++) {
+		int idx = i % hwidth;
+		std::cout << i << " -> " << idx + width << std::endl;
+		leafs[i].parent = &nodes[ idx ];
+	}
+
+	for(int i = 0; i < root; i++) {
+		int idx = (i / 2) + hwidth;
+		std::cout << i + width << " -> " << idx + width << std::endl;
+		nodes[i].parent = &nodes[ idx ];
+	}
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/snzi.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/snzi.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/snzi.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,164 @@
+#pragma once
+
+#include "utils.hpp"
+
+
+class snzi_t {
+	class node;
+public:
+	const unsigned mask;
+	const int root;
+	std::unique_ptr<snzi_t::node[]> nodes;
+
+	snzi_t(unsigned depth, unsigned base = 2);
+
+	void arrive(int idx) {
+		idx >>= 2;
+		idx %= mask;
+		nodes[idx].arrive();
+	}
+
+	void depart(int idx) {
+		idx >>= 2;
+		idx %= mask;
+		nodes[idx].depart();
+	}
+
+	bool query() const {
+		return nodes[root].query();
+	}
+
+
+private:
+	class __attribute__((aligned(128))) node {
+		friend class snzi_t;
+	private:
+
+		union val_t {
+			static constexpr char Half = -1;
+
+			uint64_t _all;
+			struct __attribute__((packed)) {
+				char cnt;
+				uint64_t ver:56;
+			};
+
+			bool cas(val_t & exp, char _cnt, uint64_t _ver) volatile {
+				val_t t;
+				t.ver = _ver;
+				t.cnt = _cnt;
+				/* paranoid */ assert(t._all == ((_ver << 8) | ((unsigned char)_cnt)));
+				return __atomic_compare_exchange_n(&this->_all, &exp._all, t._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+			}
+
+			bool cas(val_t & exp, const val_t & tar) volatile {
+				return __atomic_compare_exchange_n(&this->_all, &exp._all, tar._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+			}
+
+			val_t() : _all(0) {}
+			val_t(const volatile val_t & o) : _all(o._all) {}
+		};
+
+		//--------------------------------------------------
+		// Hierarchical node
+		void arrive_h() {
+			int undoArr = 0;
+			bool success = false;
+			while(!success) {
+				auto x{ value };
+				/* paranoid */ assert(x.cnt <= 120);
+				if( x.cnt >= 1 ) {
+					if( value.cas(x, x.cnt + 1, x.ver ) ) {
+						success = true;
+					}
+				}
+				/* paranoid */ assert(x.cnt <= 120);
+				if( x.cnt == 0 ) {
+					if( value.cas(x, val_t::Half, x.ver + 1) ) {
+						success = true;
+						x.cnt = val_t::Half;
+						x.ver = x.ver + 1;
+					}
+				}
+				/* paranoid */ assert(x.cnt <= 120);
+				if( x.cnt == val_t::Half ) {
+					/* paranoid */ assert(parent);
+					if(undoArr == 2) {
+						undoArr--;
+					} else {
+						parent->arrive();
+					}
+					if( !value.cas(x, 1, x.ver) ) {
+						undoArr = undoArr + 1;
+					}
+				}
+			}
+
+			for(int i = 0; i < undoArr; i++) {
+				/* paranoid */ assert(parent);
+				parent->depart();
+			}
+		}
+
+		void depart_h() {
+			while(true) {
+				auto x = (const val_t)value;
+				/* paranoid */ assertf(x.cnt >= 1, "%d", x.cnt);
+				if( value.cas( x, x.cnt - 1, x.ver ) ) {
+					if( x.cnt == 1 ) {
+						/* paranoid */ assert(parent);
+						parent->depart();
+					}
+					return;
+				}
+			}
+		}
+
+		//--------------------------------------------------
+		// Root node
+		void arrive_r() {
+			__atomic_fetch_add(&value._all, 1, __ATOMIC_SEQ_CST);
+		}
+
+		void depart_r() {
+			__atomic_fetch_sub(&value._all, 1, __ATOMIC_SEQ_CST);
+		}
+
+	private:
+		volatile val_t value;
+		class node * parent = nullptr;
+
+		bool is_root() {
+			return parent == nullptr;
+		}
+
+	public:
+		void arrive() {
+			if(is_root()) arrive_r();
+			else arrive_h();
+		}
+
+		void depart() {
+			if(is_root()) depart_r();
+			else depart_h();
+		}
+
+		bool query() {
+			/* paranoid */ assert(is_root());
+			return value._all > 0;
+		}
+	};
+};
+
+snzi_t::snzi_t(unsigned depth, unsigned base)
+	: mask( std::pow(base, depth) )
+	, root( ((std::pow(base, depth + 1) - 1) / (base -1)) - 1 )
+	, nodes(new node[ root + 1 ]())
+{
+	int width = std::pow(base, depth);
+	std::cout << "SNZI: " << depth << "x" << width << "(" << mask - 1 << ") " << (sizeof(snzi_t::node) * (root + 1)) << " bytes" << std::endl;
+	for(int i = 0; i < root; i++) {
+		std::cout << i << " -> " << (i / base) + width << std::endl;
+		nodes[i].parent = &nodes[(i / base) + width];
+	}
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/snzm.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/snzm.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/snzm.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,213 @@
+#pragma once
+
+#include <cmath>
+#include <cstdint>
+#include <iostream>
+#include <memory>
+
+#include "utils.hpp"
+
+
+// Scalable Non-Zero Indicator (SNZI, Ellen et al.) augmented with a per-leaf
+// bitmask: query() cheaply answers "is any list non-empty?" while masks()
+// exposes which of a leaf's individual lists are non-empty.
+class snzm_t {
+	class node;
+public:
+	const unsigned depth;  // number of binary levels below the leaves
+	const unsigned mask;   // low-bit mask selecting the leaf for an index
+	const int root;        // index of the root node within `nodes`
+	std::unique_ptr<snzm_t::node[]> nodes;  // flattened tree, leaves first
+
+	#if defined(__BMI2__)
+		// Byte-lane identity pattern. NOTE(review): appears unused in this
+		// header — presumably consumed by BMI2 users of masks(); confirm.
+		const uint64_t indexes = 0x0706050403020100;
+	#endif
+
+	snzm_t(unsigned numLists);
+
+	// List `idx` became non-empty: the low `depth` bits select the leaf,
+	// the remaining bits select the bit within that leaf's mask.
+	void arrive(int idx) {
+		int i = idx & mask;
+		nodes[i].arrive( idx >> depth);
+	}
+
+	// List `idx` became empty: mirror of arrive().
+	void depart(int idx) {
+		int i = idx & mask;
+		nodes[i].depart( idx >> depth );
+	}
+
+	// True if any list is non-empty; reads only the root's counter.
+	bool query() const {
+		return nodes[root].query();
+	}
+
+	// Snapshot of the non-empty bits tracked by leaf `node`.
+	uint64_t masks( unsigned node ) {
+		/* paranoid */ assert( (node & mask) == node );
+		#if defined(__BMI2__)
+			return nodes[node].mask_all;
+		#else
+			return nodes[node].mask;
+		#endif
+	}
+
+private:
+	class __attribute__((aligned(128))) node {
+		friend class snzm_t;
+	private:
+
+		// 64-bit word CASed as a unit: an 8-bit surplus counter plus a
+		// 56-bit version that makes counter transitions ABA-safe.
+		union val_t {
+			// Sentinel for the intermediate "arrival in progress" state of
+			// the SNZI protocol, stored in `cnt`.
+			static constexpr char Half = -1;
+
+			uint64_t _all;
+			struct __attribute__((packed)) {
+				char cnt;
+				uint64_t ver:56;
+			};
+
+			// CAS from `exp` to the (cnt, ver) pair, built to match the
+			// packed layout above (cnt in the low byte).
+			bool cas(val_t & exp, char _cnt, uint64_t _ver) volatile {
+				val_t t;
+				t.ver = _ver;
+				t.cnt = _cnt;
+				/* paranoid */ assert(t._all == ((_ver << 8) | ((unsigned char)_cnt)));
+				return __atomic_compare_exchange_n(&this->_all, &exp._all, t._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+			}
+
+			bool cas(val_t & exp, const val_t & tar) volatile {
+				return __atomic_compare_exchange_n(&this->_all, &exp._all, tar._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+			}
+
+			val_t() : _all(0) {}
+			val_t(const volatile val_t & o) : _all(o._all) {}
+		};
+
+		//--------------------------------------------------
+		// Hierarchical node
+		// SNZI Arrive: increment the surplus if it is already positive; if
+		// zero, pass through the Half state, propagate one arrival to the
+		// parent, then settle the counter at 1. If another thread completes
+		// the Half -> 1 transition first, our extra parent arrival is
+		// undone after the loop.
+		void arrive_h() {
+			int undoArr = 0;
+			bool success = false;
+			while(!success) {
+				auto x{ value };
+				/* paranoid */ assert(x.cnt <= 120);
+				if( x.cnt >= 1 ) {
+					if( value.cas(x, x.cnt + 1, x.ver ) ) {
+						success = true;
+					}
+				}
+				/* paranoid */ assert(x.cnt <= 120);
+				if( x.cnt == 0 ) {
+					// Bump the version so a concurrent 1 -> 0 -> Half cycle
+					// cannot be mistaken for the same state (ABA).
+					if( value.cas(x, val_t::Half, x.ver + 1) ) {
+						success = true;
+						x.cnt = val_t::Half;
+						x.ver = x.ver + 1;
+					}
+				}
+				/* paranoid */ assert(x.cnt <= 120);
+				if( x.cnt == val_t::Half ) {
+					/* paranoid */ assert(parent);
+					parent->arrive();
+					if( !value.cas(x, 1, x.ver) ) {
+						undoArr = undoArr + 1;
+					}
+				}
+			}
+
+			// Compensate for parent arrivals whose Half -> 1 CAS lost.
+			for(int i = 0; i < undoArr; i++) {
+				/* paranoid */ assert(parent);
+				parent->depart();
+			}
+		}
+
+		// SNZI Depart: decrement the surplus; the thread that takes it from
+		// 1 to 0 propagates one departure to the parent.
+		void depart_h() {
+			while(true) {
+				auto x = (const val_t)value;
+				/* paranoid */ assertf(x.cnt >= 1, "%d", x.cnt);
+				if( value.cas( x, x.cnt - 1, x.ver ) ) {
+					if( x.cnt == 1 ) {
+						/* paranoid */ assert(parent);
+						parent->depart();
+					}
+					return;
+				}
+			}
+		}
+
+		//--------------------------------------------------
+		// Root node
+		// The root needs no Half protocol: a plain fetch-add counter whose
+		// non-zero-ness query() reads is sufficient.
+		void arrive_r() {
+			__atomic_fetch_add(&value._all, 1, __ATOMIC_SEQ_CST);
+		}
+
+		void depart_r() {
+			__atomic_fetch_sub(&value._all, 1, __ATOMIC_SEQ_CST);
+		}
+
+		//--------------------------------------------------
+		// Interface node
+		// Entry points used by children: dispatch on root vs internal node.
+		void arrive() {
+			/* paranoid */ assert(!is_leaf);
+			if(is_root()) arrive_r();
+			else arrive_h();
+		}
+
+		void depart() {
+			/* paranoid */ assert(!is_leaf);
+			if(is_root()) depart_r();
+			else depart_h();
+		}
+
+	private:
+		volatile val_t value;
+		// Per-leaf bitmask of which of this leaf's lists are non-empty.
+		#if defined(__BMI2__)
+			union __attribute__((packed)) {
+				volatile uint8_t mask[8];
+				volatile uint64_t mask_all;
+			};
+		#else
+			volatile size_t mask = 0;
+		#endif
+
+		class node * parent = nullptr;
+		bool is_leaf = false;
+
+		bool is_root() {
+			return parent == nullptr;
+		}
+
+	public:
+		// Leaf entry points: maintain both the SNZI surplus and the bit for
+		// the specific list. arrive raises the counter before publishing
+		// the bit; depart clears the bit before lowering the counter.
+		void arrive( int bit ) {
+			/* paranoid */ assert( is_leaf );
+
+			arrive_h();
+			#if defined(__BMI2__)
+				/* paranoid */ assert( bit < 8 );
+				mask[bit] = 0xff;
+			#else
+				// NOTE(review): assumes each bit is owned by a single list
+				// and that bit < 8, so the int-width `1 << bit` cannot
+				// overflow — confirm against the layout in snzm_t::arrive.
+				/* paranoid */ assert( (mask & ( 1 << bit )) == 0 );
+				__atomic_fetch_add( &mask, 1 << bit, __ATOMIC_RELAXED );
+			#endif
+
+		}
+
+		void depart( int bit ) {
+			/* paranoid */ assert( is_leaf );
+
+			#if defined(__BMI2__)
+				/* paranoid */ assert( bit < 8 );
+				mask[bit] = 0x00;
+			#else
+				/* paranoid */ assert( (mask & ( 1 << bit )) != 0 );
+				__atomic_fetch_sub( &mask, 1 << bit, __ATOMIC_RELAXED );
+			#endif
+			depart_h();
+		}
+
+		// Root-only: non-zero surplus means some list is non-empty.
+		bool query() {
+			/* paranoid */ assert(is_root());
+			return value._all > 0;
+		}
+	};
+};
+
+// Sizes the tree so that each leaf covers 8 lists: `depth` binary levels,
+// `mask` selects the leaf for an index, `root` is the index of the last
+// node of a full binary tree with 2^depth leaves.
+// NOTE(review): numLists < 8 makes log2(numLists / 8) ill-defined, and the
+// layout presumes numLists is a multiple of 8 — confirm callers guarantee
+// numLists >= 8 (power of two).
+snzm_t::snzm_t(unsigned numLists)
+	: depth( std::log2( numLists / 8 ) )
+	, mask( (1 << depth) - 1 )
+	, root( (1 << (depth + 1)) - 2 )
+	, nodes(new node[ root + 1 ]())
+{
+	int width = 1 << depth;
+	std::cout << "SNZI with Mask: " << depth << "x" << width << "(" << mask << ")" << std::endl;
+	// Nodes [0, width) are leaves; `i / 2 + width` maps each level onto the
+	// next one up, ending at `root`, which keeps parent == nullptr.
+	for(int i = 0; i < root; i++) {
+		nodes[i].is_leaf = i < width;
+		nodes[i].parent = &nodes[(i / 2) + width ];
+	}
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/utils.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/utils.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/utils.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,250 @@
+#pragma once
+
+#include <cassert>
+#include <cstddef>
+#include <atomic>
+#include <chrono>
+#include <fstream>
+#include <iostream>
+
+#include <unistd.h>
+#include <sys/sysinfo.h>
+
+#include <x86intrin.h>
+
+// Barrier: reusable counting spin barrier (the original attribution comment
+// "Barrier from" was left unfinished — source unknown).
+class barrier_t {
+public:
+	// `total` participants must reach wait() before any of them is released.
+	barrier_t(size_t total)
+		: waiting(0)
+		, total(total)
+	{}
+
+	// Reusable spin barrier: `waiting` only ever increases. Each arrival
+	// takes a ticket (the pre-increment value) and spins until the count
+	// reaches the next multiple of `total` above its ticket, so the same
+	// object works for successive rounds. The unsigned parameter (a thread
+	// id in callers) is unused.
+	void wait(unsigned) {
+		size_t target = waiting++;
+		target = (target - (target % total)) + total;
+		while(waiting < target)
+			asm volatile("pause");
+
+		// `waiting` is never reset; guard against (unrealistic) wrap-around.
+		assert(waiting < (1ul << 60));
+    	}
+
+private:
+	std::atomic<size_t> waiting;  // total number of wait() calls so far
+	size_t total;                 // participants per round
+};
+
+// class Random {
+// private:
+// 	unsigned int seed;
+// public:
+// 	Random(int seed) {
+// 		this->seed = seed;
+// 	}
+
+// 	/** returns pseudorandom x satisfying 0 <= x < n. **/
+// 	unsigned int next() {
+// 		seed ^= seed << 6;
+// 		seed ^= seed >> 21;
+// 		seed ^= seed << 7;
+// 		return seed;
+//     	}
+// };
+
+// Bezout coefficients via the recursive extended Euclidean algorithm: for
+// g = gcd(a, b), extendedEuclidX yields x and extendedEuclidY yields y with
+// a*x + b*y == g (arithmetic modulo 2^64). Used below to derive the modular
+// inverse of the LCG multiplier.
+constexpr uint64_t extendedEuclidY(uint64_t a, uint64_t b);
+constexpr uint64_t extendedEuclidX(uint64_t a, uint64_t b){
+    if(b == 0) return 1;
+    return extendedEuclidY(b, a % b);
+}
+constexpr uint64_t extendedEuclidY(uint64_t a, uint64_t b){
+    if(b == 0) return 0;
+    return extendedEuclidX(b, a % b) - (a / b) * extendedEuclidY(b, a % b);
+}
+
+// 48-bit linear congruential PRNG using the drand48/java.util.Random
+// constants. Because the multiplier A is odd, it is invertible modulo
+// M = 2^48, so the sequence can also be stepped backwards via prev().
+class Random {
+private:
+	uint64_t x;  // current state; only the low 48 bits are meaningful
+
+	static constexpr const uint64_t M  = 1ul << 48ul;  // modulus
+	static constexpr const uint64_t A  = 25214903917;  // multiplier
+	static constexpr const uint64_t C  = 11;           // increment
+	static constexpr const uint64_t D  = 16;           // low bits discarded on output
+
+public:
+	static constexpr const uint64_t m  = M;
+	static constexpr const uint64_t a  = A;
+	static constexpr const uint64_t c  = C;
+	static constexpr const uint64_t d  = D;
+	// Modular inverse of A mod M, used by prev() to invert the LCG step.
+	static constexpr const uint64_t ai = extendedEuclidX(A, M);
+public:
+	Random(unsigned int seed) {
+		this->x = seed * a;
+	}
+
+	/** advances the state one LCG step and returns its top 32 bits. **/
+	unsigned int next() {
+		//nextx = (a * x + c) % m;
+		x = (A * x + C) & (M - 1);
+		return x >> D;
+	}
+	// Returns the value derived from the current state, then rewinds the
+	// state by one step (exact inverse of next()).
+	unsigned int prev() {
+		//prevx = (ainverse * (x - c)) mod m
+		unsigned int r = x >> D;
+		x = ai * (x - C) & (M - 1);
+		return r;
+	}
+
+	// Direct state access, used to checkpoint/restore a generator.
+	void set_raw_state(uint64_t _x) {
+		this->x = _x;
+	}
+
+	uint64_t get_raw_state() {
+		return this->x;
+	}
+};
+
+// Reads the x86 time-stamp counter (rdtsc is non-serializing: fine for
+// coarse timestamps and PRNG seeding, not for precise measurement).
+static inline long long rdtscl(void) {
+    unsigned int lo, hi;
+    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
+    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
+}
+
+// Pins the calling thread to one cpu, counting down from the highest cpu.
+// NOTE(review): callers appear to pass 1-based tids; tid == 0 would request
+// cpu == get_nprocs(), which is out of range — confirm.
+static inline void affinity(int tid) {
+	static int cpus = get_nprocs();
+
+	cpu_set_t  mask;
+	CPU_ZERO(&mask);
+	int cpu = cpus - tid;  // Set CPU affinity to tid, starting from the end
+	CPU_SET(cpu, &mask);
+	// Failure is reported but not fatal: the benchmark still runs unpinned.
+	auto result = sched_setaffinity(0, sizeof(mask), &mask);
+	if(result != 0) {
+		std::cerr << "Affinity set failed with " << result<< ", wanted " << cpu << std::endl;
+	}
+}
+
+static const constexpr std::size_t cache_line_size = 64;
+// Sanity check at startup: read the kernel-reported coherency line size of
+// cpu0's first cache and abort if it disagrees with the compile-time
+// cache_line_size constant (exit 2 if the sysfs file is unreadable, exit 1
+// on a mismatch).
+static inline void check_cache_line_size() {
+	std::cout << "Checking cache line size" << std::endl;
+	const std::string cache_file = "/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size";
+
+	std::ifstream in(cache_file, std::ifstream::in);
+	if(!in.good()) {
+		std::cerr << "Could not open file to check cache line size" << std::endl;
+		std::cerr << "Looking for: " << cache_file << std::endl;
+		std::exit(2);
+	}
+
+	size_t reported;
+	in >> reported;
+	in.close();
+
+	if(reported != cache_line_size) {
+		std::cerr << "Cache line has incorrect size : " << reported << std::endl;
+		std::exit(1);
+	}
+
+	std::cout << "Done" << std::endl;
+}
+
+using Clock = std::chrono::high_resolution_clock;
+using duration_t = std::chrono::duration<double>;
+using std::chrono::nanoseconds;
+
+// Converts a scalar count of seconds into `Ratio` units and back to a bare
+// scalar, e.g. duration_cast<std::nano>(1.0) == 1e9. Overloads the name
+// duration_cast for arithmetic (non-duration) inputs.
+template<typename Ratio, typename T>
+T duration_cast(T seconds) {
+	return std::chrono::duration_cast<std::chrono::duration<T, Ratio>>(std::chrono::duration<T>(seconds)).count();
+}
+
+// Picks a pseudo-randomly chosen *set* bit of `mask`, using `rnum` as the
+// entropy source, and returns its position (0 when mask is empty). The
+// fallback path is the branchless "select the bit position with the given
+// rank" routine from Sean Anderson's Bit Twiddling Hacks: it stores each
+// level of a parallel popcount (a, b, c, d) and binary-searches those
+// partial sums for the bit of rank r. The BMI2 path instead deposits a
+// single bit at the chosen rank with PDEP and reads its position back.
+static inline unsigned rand_bit(unsigned rnum, size_t mask) __attribute__((artificial));
+static inline unsigned rand_bit(unsigned rnum, size_t mask) {
+	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
+#if !defined(__BMI2__)
+	uint64_t v = mask;   // Input value to find position with rank r.
+	unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
+	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
+	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
+	unsigned int t;      // Bit count temporary.
+
+	// Do a normal parallel bit count for a 64-bit integer,
+	// but store all intermediate steps.
+	a =  v - ((v >> 1) & ~0UL/3);
+	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
+	c = (b + (b >> 4)) & ~0UL/0x11;
+	d = (c + (c >> 8)) & ~0UL/0x101;
+
+
+	t = (d >> 32) + (d >> 48);
+	// Now do branchless select!
+	s  = 64;
+	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
+	t  = (d >> (s - 16)) & 0xff;
+	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
+	t  = (c >> (s - 8)) & 0xf;
+	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
+	t  = (b >> (s - 4)) & 0x7;
+	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
+	t  = (a >> (s - 2)) & 0x3;
+	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
+	t  = (v >> (s - 1)) & 0x1;
+	s -= ((t - r) & 256) >> 8;
+	return s - 1;
+#else
+	// Deposit a 1 at rank `bit` within mask, then locate it.
+	uint64_t picked = _pdep_u64(1ul << bit, mask);
+	return picked ? __builtin_ctzl(picked) : 0;
+#endif
+}
+
+// Minimal test-and-test-and-set spinlock.
+struct spinlock_t {
+	std::atomic_bool ll = { false };  // true while held
+
+	// Spin until acquired: after a failed exchange, read the flag cheaply
+	// until it looks free before retrying, limiting cache-line traffic.
+	inline void lock() {
+		while( __builtin_expect(ll.exchange(true),false) ) {
+			while(ll.load(std::memory_order_relaxed))
+				asm volatile("pause");
+		}
+	}
+
+	// Single attempt; true if the lock was acquired.
+	inline bool try_lock() {
+		return false == ll.exchange(true);
+	}
+
+	inline void unlock() {
+		ll.store(false, std::memory_order_release);
+	}
+
+	// Racy snapshot of the held flag — debugging/statistics only.
+	inline explicit operator bool() {
+		return ll.load(std::memory_order_relaxed);
+	}
+};
+
+// Atomic bit-test-and-set: sets bit `bit` in `target` and returns the bit's
+// previous value (true if it was already set).
+static inline bool bts(std::atomic_size_t & target, size_t bit ) {
+	//*
+	int result = 0;
+	asm volatile(
+		"LOCK btsq %[bit], %[target]\n\t"
+		// BUG FIX: `target` was declared as an input-only "m" operand even
+		// though btsq writes the memory word; it must be a read-write
+		// ("+m") output or the compiler may assume the value is unchanged
+		// and reorder/cache accesses around it.
+		: "=@ccc" (result), [target] "+m" (target)
+		: [bit] "r" (bit)
+	);
+	return result != 0;
+	/*/
+	size_t mask = 1ul << bit;
+	size_t ret = target.fetch_or(mask, std::memory_order_relaxed);
+	return (ret & mask) != 0;
+	//*/
+}
+
+// Atomic bit-test-and-reset: clears bit `bit` in `target` and returns the
+// bit's previous value (true if it was set).
+static inline bool btr(std::atomic_size_t & target, size_t bit ) {
+	//*
+	int result = 0;
+	asm volatile(
+		"LOCK btrq %[bit], %[target]\n\t"
+		// BUG FIX: `target` must be a read-write ("+m") operand — btrq
+		// modifies the memory word, so declaring it input-only lets the
+		// compiler assume it is unchanged.
+		: "=@ccc" (result), [target] "+m" (target)
+		: [bit] "r" (bit)
+	);
+	return result != 0;
+	/*/
+	size_t mask = 1ul << bit;
+	size_t ret = target.fetch_and(~mask, std::memory_order_relaxed);
+	return (ret & mask) != 0;
+	//*/
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/work_stealing.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/work_stealing.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/work_stealing.hpp	(revision f9f3775fd90af89c066bf99ce0152f83c082b8f0)
@@ -0,0 +1,222 @@
+#pragma once
+#define LIST_VARIANT work_stealing
+
+#include <cmath>
+#include <iomanip>
+#include <memory>
+#include <mutex>
+#include <type_traits>
+
+#include "assert.hpp"
+#include "utils.hpp"
+#include "links.hpp"
+#include "snzi.hpp"
+
+using namespace std;
+
+template<typename node_t>
+class __attribute__((aligned(128))) work_stealing {
+	static_assert(std::is_same<decltype(node_t::_links), _LinksFields_t<node_t>>::value, "Node must have a links field");
+
+public:
+	static const char * name() {
+		return "Work Stealing";
+	}
+
+	// One intrusive queue per thread plus a SNZI summarizing which queues
+	// are non-empty, so pop() can return quickly when everything is empty.
+	// NOTE(review): snzi depth is log2(numThreads / 2); numThreads < 2
+	// makes this ill-defined — confirm callers guarantee numThreads >= 2.
+	work_stealing(unsigned _numThreads, unsigned)
+		: numThreads(_numThreads)
+		, lists(new intrusive_queue_t<node_t>[numThreads])
+		, snzi( std::log2( numThreads / 2 ), 2 )
+
+	{
+		std::cout << "Constructing Work Stealer with " << numThreads << std::endl;
+	}
+
+	~work_stealing() {
+		std::cout << "Destroying Work Stealer" << std::endl;
+		lists.reset();
+	}
+
+	// Timestamps the node and pushes it onto the queue named by its hint,
+	// re-hinting to a random queue when the hint is not a valid index.
+	__attribute__((noinline, hot)) void push(node_t * node) {
+		node->_links.ts = rdtscl();
+		// BUG FIX: was `hint > numThreads`, which accepted
+		// hint == numThreads and indexed one past the end of `lists` below
+		// (lists has exactly numThreads entries).
+		if( node->_links.hint >= numThreads ) {
+			node->_links.hint = tls.rng.next() % numThreads;
+			tls.stat.push.nhint++;
+		}
+
+		unsigned i = node->_links.hint;
+		auto & list = lists[i];
+		list.lock.lock();
+
+		// push() reports whether the queue was empty: the first element
+		// must raise this queue's bit in the SNZI.
+		if(list.push( node )) {
+			snzi.arrive(i);
+		}
+
+		list.lock.unlock();
+	}
+
+	// Pops from the local queue when it looks non-empty, otherwise tries to
+	// steal from a random victim; returns nullptr only when the SNZI says
+	// the whole structure is empty.
+	__attribute__((noinline, hot)) node_t * pop() {
+		node_t * node;
+		while(true) {
+			if(!snzi.query()) {
+				return nullptr;
+			}
+
+			{
+				// Fast path: our own queue, peeked without the lock first.
+				unsigned i = tls.my_queue;
+				auto & list = lists[i];
+				if( list.ts() != 0 ) {
+					list.lock.lock();
+					if((node = try_pop(i))) {
+						tls.stat.pop.local.success++;
+						break;
+					}
+					else {
+						tls.stat.pop.local.elock++;
+					}
+				}
+				else {
+					tls.stat.pop.local.espec++;
+				}
+			}
+
+			tls.stat.pop.steal.tried++;
+
+			// Steal path: random victim, give up on any sign of contention.
+			int i = tls.rng.next() % numThreads;
+			auto & list = lists[i];
+			if( list.ts() == 0 ) {
+				tls.stat.pop.steal.empty++;
+				continue;
+			}
+
+			if( !list.lock.try_lock() ) {
+				tls.stat.pop.steal.locked++;
+				continue;
+			}
+
+			if((node = try_pop(i))) {
+				tls.stat.pop.steal.success++;
+				break;
+			}
+		}
+
+		#if defined(READ)
+			// Optional experiment: touch one queue timestamp every READ
+			// pops to measure the cost of the extra cache traffic.
+			const unsigned f = READ;
+			if(0 == (tls.it % f)) {
+				unsigned i = tls.it / f;
+				lists[i % numThreads].ts();
+			}
+			// lists[tls.it].ts();
+			tls.it++;
+		#endif
+
+
+		return node;
+	}
+
+private:
+	// Pops one node from queue `i`; the caller holds the queue's lock and
+	// this function always releases it before returning. Departs the SNZI
+	// when the pop empties the queue.
+	node_t * try_pop(unsigned i) {
+		auto & list = lists[i];
+
+		// If list is empty, unlock and retry
+		if( list.ts() == 0 ) {
+			list.lock.unlock();
+			return nullptr;
+		}
+
+		// Actually pop the list
+		// NOTE(review): std::tie needs <tuple>, which this header only gets
+		// transitively — consider including it directly.
+		node_t * node;
+		bool emptied;
+		std::tie(node, emptied) = list.pop();
+		assert(node);
+
+		if(emptied) {
+			snzi.depart(i);
+		}
+
+		// Unlock and return
+		list.lock.unlock();
+		return node;
+	}
+
+
+public:
+
+	static std::atomic_uint32_t ticket;
+	// Per-thread state: PRNG for victim selection, this thread's own queue
+	// id (handed out by `ticket`), and locally accumulated statistics that
+	// stats_tls_tally() folds into the global counters.
+	static __attribute__((aligned(128))) thread_local struct TLS {
+		Random     rng = { int(rdtscl()) };
+		unsigned   my_queue = ticket++;
+		#if defined(READ)
+			unsigned it = 0;
+		#endif
+		struct {
+			struct {
+				std::size_t nhint = { 0 };
+			} push;
+			struct {
+				struct {
+					std::size_t success = { 0 };
+					std::size_t espec = { 0 };
+					std::size_t elock = { 0 };
+				} local;
+				struct {
+					std::size_t tried   = { 0 };
+					std::size_t locked  = { 0 };
+					std::size_t empty   = { 0 };
+					std::size_t success = { 0 };
+				} steal;
+			} pop;
+		} stat;
+	} tls;
+
+private:
+	const unsigned numThreads;
+	std::unique_ptr<intrusive_queue_t<node_t> []> lists;
+	__attribute__((aligned(64))) snzi_t snzi;
+
+#ifndef NO_STATS
+private:
+	// Process-wide tallies, filled from each thread's TLS at shutdown.
+	static struct GlobalStats {
+		struct {
+			std::atomic_size_t nhint = { 0 };
+		} push;
+		struct {
+			struct {
+				std::atomic_size_t success = { 0 };
+				std::atomic_size_t espec = { 0 };
+				std::atomic_size_t elock = { 0 };
+			} local;
+			struct {
+				std::atomic_size_t tried   = { 0 };
+				std::atomic_size_t locked  = { 0 };
+				std::atomic_size_t empty   = { 0 };
+				std::atomic_size_t success = { 0 };
+			} steal;
+		} pop;
+	} global_stats;
+
+public:
+	// Folds the calling thread's local counters into the global tallies.
+	static void stats_tls_tally() {
+		global_stats.push.nhint += tls.stat.push.nhint;
+		global_stats.pop.local.success += tls.stat.pop.local.success;
+		global_stats.pop.local.espec   += tls.stat.pop.local.espec  ;
+		global_stats.pop.local.elock   += tls.stat.pop.local.elock  ;
+		global_stats.pop.steal.tried   += tls.stat.pop.steal.tried  ;
+		global_stats.pop.steal.locked  += tls.stat.pop.steal.locked ;
+		global_stats.pop.steal.empty   += tls.stat.pop.steal.empty  ;
+		global_stats.pop.steal.success += tls.stat.pop.steal.success;
+	}
+
+	static void stats_print(std::ostream & os ) {
+		std::cout << "----- Work Stealing Stats -----" << std::endl;
+
+		double stealSucc = double(global_stats.pop.steal.success) / global_stats.pop.steal.tried;
+		os << "Push to new Q : " << std::setw(15) << global_stats.push.nhint << "\n";
+		os << "Local Pop     : " << std::setw(15) << global_stats.pop.local.success << "\n";
+		os << "Steal Pop     : " << std::setw(15) << global_stats.pop.steal.success << "(" << global_stats.pop.local.espec << "s, " << global_stats.pop.local.elock << "l)\n";
+		os << "Steal Success : " << std::setw(15) << stealSucc << "(" << global_stats.pop.steal.tried << " tries)\n";
+		os << "Steal Fails   : " << std::setw(15) << global_stats.pop.steal.empty << "e, " << global_stats.pop.steal.locked << "l\n";
+	}
+private:
+#endif
+};
Index: doc/theses/thierry_delisle_PhD/code/relaxed_list.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/relaxed_list.cpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,1141 +1,0 @@
-#if !defined(LIST_VARIANT_HPP)
-#define LIST_VARIANT_HPP "relaxed_list.hpp"
-#endif
-
-#include LIST_VARIANT_HPP
-#if !defined(LIST_VARIANT)
-#error not variant selected
-#endif
-
-#include <array>
-#include <iomanip>
-#include <iostream>
-#include <locale>
-#include <string>
-#include <thread>
-#include <vector>
-
-#include <getopt.h>
-#include <unistd.h>
-#include <sys/sysinfo.h>
-
-#include "utils.hpp"
-
-struct __attribute__((aligned(64))) Node {
-	static std::atomic_size_t creates;
-	static std::atomic_size_t destroys;
-
-	_LinksFields_t<Node> _links;
-
-	int value;
-	int id;
-
-	Node() { creates++; }
-	Node(int value): value(value) { creates++; }
-	~Node() { destroys++; }
-};
-
-std::atomic_size_t Node::creates  = { 0 };
-std::atomic_size_t Node::destroys = { 0 };
-
-bool enable_stats = false;
-
-template<>
-thread_local LIST_VARIANT<Node>::TLS LIST_VARIANT<Node>::tls = {};
-
-template<>
-std::atomic_uint32_t LIST_VARIANT<Node>::ticket = { 0 };
-
-#ifndef NO_STATS
-template<>
-LIST_VARIANT<Node>::GlobalStats LIST_VARIANT<Node>::global_stats = {};
-#endif
-
-// ================================================================================================
-//                        UTILS
-// ================================================================================================
-
-struct local_stat_t {
-	size_t in  = 0;
-	size_t out = 0;
-	size_t empty = 0;
-	size_t crc_in  = 0;
-	size_t crc_out = 0;
-	size_t valmax = 0;
-	size_t valmin = 100000000ul;
-	struct {
-		size_t val = 0;
-		size_t cnt = 0;
-	} comp;
-	struct {
-		size_t val = 0;
-		size_t cnt = 0;
-	} subm;
-};
-
-struct global_stat_t {
-	std::atomic_size_t in  = { 0 };
-	std::atomic_size_t out = { 0 };
-	std::atomic_size_t empty = { 0 };
-	std::atomic_size_t crc_in  = { 0 };
-	std::atomic_size_t crc_out = { 0 };
-	std::atomic_size_t valmax = { 0 };
-	std::atomic_size_t valmin = { 100000000ul };
-	struct {
-		std::atomic_size_t val = { 0 };
-		std::atomic_size_t cnt = { 0 };
-	} comp;
-	struct {
-		std::atomic_size_t val = { 0 };
-		std::atomic_size_t cnt = { 0 };
-	} subm;
-};
-
-void atomic_max(std::atomic_size_t & target, size_t value) {
-	for(;;) {
-		size_t expect = target.load(std::memory_order_relaxed);
-		if(value <= expect) return;
-		bool success = target.compare_exchange_strong(expect, value);
-		if(success) return;
-	}
-}
-
-void atomic_min(std::atomic_size_t & target, size_t value) {
-	for(;;) {
-		size_t expect = target.load(std::memory_order_relaxed);
-		if(value >= expect) return;
-		bool success = target.compare_exchange_strong(expect, value);
-		if(success) return;
-	}
-}
-
-void tally_stats(global_stat_t & global, local_stat_t & local) {
-
-	global.in    += local.in;
-	global.out   += local.out;
-	global.empty += local.empty;
-
-	global.crc_in  += local.crc_in;
-	global.crc_out += local.crc_out;
-
-	global.comp.val += local.comp.val;
-	global.comp.cnt += local.comp.cnt;
-	global.subm.val += local.subm.val;
-	global.subm.cnt += local.subm.cnt;
-
-	atomic_max(global.valmax, local.valmax);
-	atomic_min(global.valmin, local.valmin);
-
-	LIST_VARIANT<Node>::stats_tls_tally();
-}
-
-void waitfor(double & duration, barrier_t & barrier, std::atomic_bool & done) {
-	std::cout << "Starting" << std::endl;
-	auto before = Clock::now();
-	barrier.wait(0);
-	bool is_tty = isatty(STDOUT_FILENO);
-
-	while(true) {
-		usleep(100000);
-		auto now = Clock::now();
-		duration_t durr = now - before;
-		if( durr.count() > duration ) {
-			done = true;
-			break;
-		}
-		if(is_tty) {
-			std::cout << "\r" << std::setprecision(4) << durr.count();
-			std::cout.flush();
-		}
-	}
-
-	barrier.wait(0);
-	auto after = Clock::now();
-	duration_t durr = after - before;
-	duration = durr.count();
-	std::cout << "\rClosing down" << std::endl;
-}
-
-void waitfor(double & duration, barrier_t & barrier, const std::atomic_size_t & count) {
-	std::cout << "Starting" << std::endl;
-	auto before = Clock::now();
-	barrier.wait(0);
-
-	while(true) {
-		usleep(100000);
-		size_t c = count.load();
-		if( c == 0 ) {
-			break;
-		}
-		std::cout << "\r" << c;
-		std::cout.flush();
-	}
-
-	barrier.wait(0);
-	auto after = Clock::now();
-	duration_t durr = after - before;
-	duration = durr.count();
-	std::cout << "\rClosing down" << std::endl;
-}
-
-void print_stats(double duration, unsigned nthread, global_stat_t & global) {
-	assert(Node::creates == Node::destroys);
-	assert(global.crc_in == global.crc_out);
-
-	std::cout << "Done" << std::endl;
-
-	size_t ops = global.in + global.out;
-	size_t ops_sec = size_t(double(ops) / duration);
-	size_t ops_thread = ops_sec / nthread;
-	auto dur_nano = duration_cast<std::nano>(1.0);
-
-	if(global.valmax != 0) {
-		std::cout << "Max runs      : " << global.valmax << "\n";
-		std::cout << "Min runs      : " << global.valmin << "\n";
-	}
-	if(global.comp.cnt != 0) {
-		std::cout << "Submit count  : " << global.subm.cnt << "\n";
-		std::cout << "Submit average: " << ((double(global.subm.val)) / global.subm.cnt) << "\n";
-		std::cout << "Complete count: " << global.comp.cnt << "\n";
-		std::cout << "Complete avg  : " << ((double(global.comp.val)) / global.comp.cnt) << "\n";
-	}
-	std::cout << "Duration      : " << duration << "s\n";
-	std::cout << "ns/Op         : " << ( dur_nano / ops_thread )<< "\n";
-	std::cout << "Ops/sec/thread: " << ops_thread << "\n";
-	std::cout << "Ops/sec       : " << ops_sec << "\n";
-	std::cout << "Total ops     : " << ops << "(" << global.in << "i, " << global.out << "o, " << global.empty << "e)\n";
-	#ifndef NO_STATS
-		LIST_VARIANT<Node>::stats_print(std::cout);
-	#endif
-}
-
-void save_fairness(const int data[], int factor, unsigned nthreads, size_t columns, size_t rows, const std::string & output);
-
-// ================================================================================================
-//                        EXPERIMENTS
-// ================================================================================================
-
-// ================================================================================================
-__attribute__((noinline)) void runChurn_body(
-	std::atomic<bool>& done,
-	Random & rand,
-	Node * my_nodes[],
-	unsigned nslots,
-	local_stat_t & local,
-	LIST_VARIANT<Node> & list
-) {
-	while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) {
-		int idx = rand.next() % nslots;
-		if (auto node = my_nodes[idx]) {
-			local.crc_in += node->value;
-			list.push(node);
-			my_nodes[idx] = nullptr;
-			local.in++;
-		}
-		else if(auto node = list.pop()) {
-			local.crc_out += node->value;
-			my_nodes[idx] = node;
-			local.out++;
-		}
-		else {
-			local.empty++;
-		}
-	}
-}
-
-void runChurn(unsigned nthread, unsigned nqueues, double duration, unsigned nnodes, const unsigned nslots) {
-	std::cout << "Churn Benchmark" << std::endl;
-	assert(nnodes <= nslots);
-	// List being tested
-
-	// Barrier for synchronization
-	barrier_t barrier(nthread + 1);
-
-	// Data to check everything is OK
-	global_stat_t global;
-
-	// Flag to signal termination
-	std::atomic_bool done  = { false };
-
-	// Prep nodes
-	std::cout << "Initializing ";
-	size_t npushed = 0;
-	LIST_VARIANT<Node> list = { nthread, nqueues };
-	{
-		Node** all_nodes[nthread];
-		for(auto & nodes : all_nodes) {
-			nodes = new __attribute__((aligned(64))) Node*[nslots + 8];
-			Random rand(rdtscl());
-			for(unsigned i = 0; i < nnodes; i++) {
-				nodes[i] = new Node(rand.next() % 100);
-			}
-
-			for(unsigned i = nnodes; i < nslots; i++) {
-				nodes[i] = nullptr;
-			}
-
-			for(int i = 0; i < 10 && i < (int)nslots; i++) {
-				int idx = rand.next() % nslots;
-				if (auto node = nodes[idx]) {
-					global.crc_in += node->value;
-					list.push(node);
-					npushed++;
-					nodes[idx] = nullptr;
-				}
-			}
-		}
-
-		std::cout << nnodes << " nodes (" << nslots << " slots)" << std::endl;
-
-		enable_stats = true;
-
-		std::thread * threads[nthread];
-		unsigned i = 1;
-		for(auto & t : threads) {
-			auto & my_nodes = all_nodes[i - 1];
-			t = new std::thread([&done, &list, &barrier, &global, &my_nodes, nslots](unsigned tid) {
-				Random rand(tid + rdtscl());
-
-				local_stat_t local;
-
-				// affinity(tid);
-
-				barrier.wait(tid);
-
-				// EXPERIMENT START
-
-				runChurn_body(done, rand, my_nodes, nslots, local, list);
-
-				// EXPERIMENT END
-
-				barrier.wait(tid);
-
-				tally_stats(global, local);
-
-				for(unsigned i = 0; i < nslots; i++) {
-					delete my_nodes[i];
-				}
-			}, i++);
-		}
-
-		waitfor(duration, barrier, done);
-
-		for(auto t : threads) {
-			t->join();
-			delete t;
-		}
-
-		enable_stats = false;
-
-		while(auto node = list.pop()) {
-			global.crc_out += node->value;
-			delete node;
-		}
-
-		for(auto nodes : all_nodes) {
-			delete[] nodes;
-		}
-	}
-
-	print_stats(duration, nthread, global);
-}
-
-// ================================================================================================
-__attribute__((noinline)) void runPingPong_body(
-	std::atomic<bool>& done,
-	Node initial_nodes[],
-	unsigned nnodes,
-	local_stat_t & local,
-	LIST_VARIANT<Node> & list
-) {
-	Node * nodes[nnodes];
-	{
-		unsigned i = 0;
-		for(auto & n : nodes) {
-			n = &initial_nodes[i++];
-		}
-	}
-
-	while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) {
-
-		for(Node * & node : nodes) {
-			local.crc_in += node->value;
-			list.push(node);
-			local.in++;
-		}
-
-		// -----
-
-		for(Node * & node : nodes) {
-			node = list.pop();
-			assert(node);
-			local.crc_out += node->value;
-			local.out++;
-		}
-	}
-}
-
-void runPingPong(unsigned nthread, unsigned nqueues, double duration, unsigned nnodes) {
-	std::cout << "PingPong Benchmark" << std::endl;
-
-
-	// Barrier for synchronization
-	barrier_t barrier(nthread + 1);
-
-	// Data to check everything is OK
-	global_stat_t global;
-
-	// Flag to signal termination
-	std::atomic_bool done  = { false };
-
-	std::cout << "Initializing ";
-	// List being tested
-	LIST_VARIANT<Node> list = { nthread, nqueues };
-	{
-		enable_stats = true;
-
-		std::thread * threads[nthread];
-		unsigned i = 1;
-		for(auto & t : threads) {
-			t = new std::thread([&done, &list, &barrier, &global, nnodes](unsigned tid) {
-				Random rand(tid + rdtscl());
-
-				Node nodes[nnodes];
-				for(auto & n : nodes) {
-					n.value = (int)rand.next() % 100;
-				}
-
-				local_stat_t local;
-
-				// affinity(tid);
-
-				barrier.wait(tid);
-
-				// EXPERIMENT START
-
-				runPingPong_body(done, nodes, nnodes, local, list);
-
-				// EXPERIMENT END
-
-				barrier.wait(tid);
-
-				tally_stats(global, local);
-			}, i++);
-		}
-
-		waitfor(duration, barrier, done);
-
-		for(auto t : threads) {
-			t->join();
-			delete t;
-		}
-
-		enable_stats = false;
-	}
-
-	print_stats(duration, nthread, global);
-}
-
-// ================================================================================================
-struct __attribute__((aligned(64))) Slot {
-	Node * volatile node;
-};
-
-__attribute__((noinline)) void runProducer_body(
-	std::atomic<bool>& done,
-	Random & rand,
-	Slot * slots,
-	int nslots,
-	local_stat_t & local,
-	LIST_VARIANT<Node> & list
-) {
-	while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) {
-
-		Node * node = list.pop();
-		if(!node) {
-			local.empty ++;
-			continue;
-		}
-
-		local.crc_out += node->value;
-		local.out++;
-
-		if(node->id == 0) {
-			unsigned cnt = 0;
-			for(int i = 0; i < nslots; i++) {
-				Node * found = __atomic_exchange_n( &slots[i].node, nullptr, __ATOMIC_SEQ_CST );
-				if( found ) {
-					local.crc_in += found->value;
-					local.in++;
-					cnt++;
-					list.push( found );
-				}
-			}
-
-			local.crc_in += node->value;
-			local.in++;
-			list.push( node );
-
-			local.comp.cnt++;
-			local.comp.val += cnt;
-		}
-		else {
-			unsigned len = 0;
-			while(true) {
-				auto off = rand.next();
-				for(int i = 0; i < nslots; i++) {
-					Node * expected = nullptr;
-					int idx = (i + off) % nslots;
-					Slot & slot = slots[ idx ];
-					if(
-						slot.node == nullptr &&
-						__atomic_compare_exchange_n( &slot.node, &expected, node, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST )
-					) {
-						local.subm.cnt++;
-						local.subm.val += len;
-						goto LOOP;
-					}
-					assert( expected != node );
-					len++;
-				}
-			}
-		}
-
-		LOOP:;
-	}
-}
-
-void runProducer(unsigned nthread, unsigned nqueues, double duration, unsigned nnodes) {
-	std::cout << "Producer Benchmark" << std::endl;
-
-	// Barrier for synchronization
-	barrier_t barrier(nthread + 1);
-
-	// Data to check everything is OK
-	global_stat_t global;
-
-	// Flag to signal termination
-	std::atomic_bool done  = { false };
-
-	std::cout << "Initializing ";
-
-	int nslots = nnodes * 4;
-	Slot * slots = new Slot[nslots];
-	std::cout << nnodes << " nodes (" << nslots << " slots)" << std::endl;
-
-	// List being tested
-	LIST_VARIANT<Node> list = { nthread, nqueues };
-	{
-		Random rand(rdtscl());
-		for(unsigned i = 0; i < nnodes; i++) {
-			Node * node = new Node(rand.next() % 100);
-			node->id = i;
-			global.crc_in += node->value;
-			list.push(node);
-		}
-
-		for(int i = 0; i < nslots; i++) {
-			slots[i].node = nullptr;
-		}
-	}
-
-	{
-		enable_stats = true;
-
-		std::thread * threads[nthread];
-		unsigned i = 1;
-		for(auto & t : threads) {
-			t = new std::thread([&done, &list, &barrier, &global, slots, nslots](unsigned tid) {
-				Random rand(tid + rdtscl());
-
-				local_stat_t local;
-				barrier.wait(tid);
-
-				// EXPERIMENT START
-
-				runProducer_body(done, rand, slots, nslots, local, list);
-
-				// EXPERIMENT END
-
-				barrier.wait(tid);
-
-				tally_stats(global, local);
-			}, i++);
-		}
-
-		waitfor(duration, barrier, done);
-
-		for(auto t : threads) {
-			t->join();
-			delete t;
-		}
-
-		enable_stats = false;
-	}
-
-	{
-		while(Node * node = list.pop()) {
-			global.crc_out += node->value;
-			delete node;
-		}
-
-		for(int i = 0; i < nslots; i++) {
-			delete slots[i].node;
-		}
-
-		delete [] slots;
-	}
-
-	print_stats(duration, nthread, global);
-}
-
-// ================================================================================================
-__attribute__((noinline)) void runFairness_body(
-	unsigned tid,
-	size_t width,
-	size_t length,
-	int output[],
-	std::atomic_size_t & count,
-	Node initial_nodes[],
-	unsigned nnodes,
-	local_stat_t & local,
-	LIST_VARIANT<Node> & list
-) {
-	Node * nodes[nnodes];
-	{
-		unsigned i = 0;
-		for(auto & n : nodes) {
-			n = &initial_nodes[i++];
-		}
-	}
-
-	while(__builtin_expect(0 != count.load(std::memory_order_relaxed), true)) {
-
-		for(Node * & node : nodes) {
-			local.crc_in += node->id;
-			list.push(node);
-			local.in++;
-		}
-
-		// -----
-
-		for(Node * & node : nodes) {
-			node = list.pop();
-			assert(node);
-
-			if (unsigned(node->value) < length) {
-				size_t idx = (node->value * width) + node->id;
-				assert(idx < (width * length));
-				output[idx] = tid;
-			}
-
-			node->value++;
-			if(unsigned(node->value) == length) count--;
-
-			local.crc_out += node->id;
-			local.out++;
-		}
-	}
-}
-
-void runFairness(unsigned nthread, unsigned nqueues, double duration, unsigned nnodes, const std::string & output) {
-	std::cout << "Fairness Benchmark, outputing to : " << output << std::endl;
-
-	// Barrier for synchronization
-	barrier_t barrier(nthread + 1);
-
-	// Data to check everything is OK
-	global_stat_t global;
-
-	std::cout << "Initializing ";
-
-	// Check fairness by creating a png of where the threads ran
-	size_t width = nthread * nnodes;
-	size_t length = 100000;
-
-	std::unique_ptr<int[]> data_out { new int[width * length] };
-
-	// Flag to signal termination
-	std::atomic_size_t count = width;
-
-	// List being tested
-	LIST_VARIANT<Node> list = { nthread, nqueues };
-	{
-		enable_stats = true;
-
-		std::thread * threads[nthread];
-		unsigned i = 1;
-		for(auto & t : threads) {
-			t = new std::thread([&count, &list, &barrier, &global, nnodes, width, length, data_out = data_out.get()](unsigned tid) {
-				unsigned int start = (tid - 1) * nnodes;
-				Node nodes[nnodes];
-				for(auto & n : nodes) {
-					n.id = start;
-					n.value = 0;
-					start++;
-				}
-
-				local_stat_t local;
-
-				// affinity(tid);
-
-				barrier.wait(tid);
-
-				// EXPERIMENT START
-
-				runFairness_body(tid, width, length, data_out, count, nodes, nnodes, local, list);
-
-				// EXPERIMENT END
-
-				barrier.wait(tid);
-
-				for(const auto & n : nodes) {
-					local.valmax = max(local.valmax, size_t(n.value));
-					local.valmin = min(local.valmin, size_t(n.value));
-				}
-
-				tally_stats(global, local);
-			}, i++);
-		}
-
-		waitfor(duration, barrier, count);
-
-		for(auto t : threads) {
-			t->join();
-			delete t;
-		}
-
-		enable_stats = false;
-	}
-
-	print_stats(duration, nthread, global);
-
-	// save_fairness(data_out.get(), 100, nthread, width, length, output);
-}
-
-// ================================================================================================
-
-bool iequals(const std::string& a, const std::string& b)
-{
-    return std::equal(a.begin(), a.end(),
-                      b.begin(), b.end(),
-                      [](char a, char b) {
-                          return std::tolower(a) == std::tolower(b);
-                      });
-}
-
-int main(int argc, char * argv[]) {
-
-	double duration   = 5.0;
-	unsigned nthreads = 2;
-	unsigned nqueues  = 4;
-	unsigned nnodes   = 100;
-	unsigned nslots   = 100;
-	std::string out   = "fairness.png";
-
-	enum {
-		Churn,
-		PingPong,
-		Producer,
-		Fairness,
-		NONE
-	} benchmark = NONE;
-
-	std::cout.imbue(std::locale(""));
-
-	for(;;) {
-		static struct option options[] = {
-			{"duration",  required_argument, 0, 'd'},
-			{"nthreads",  required_argument, 0, 't'},
-			{"nqueues",   required_argument, 0, 'q'},
-			{"benchmark", required_argument, 0, 'b'},
-			{0, 0, 0, 0}
-		};
-
-		int idx = 0;
-		int opt = getopt_long(argc, argv, "d:t:q:b:", options, &idx);
-
-		std::string arg = optarg ? optarg : "";
-		size_t len = 0;
-		switch(opt) {
-			// Exit Case
-			case -1:
-				/* paranoid */ assert(optind <= argc);
-				switch(benchmark) {
-				case NONE:
-					std::cerr << "Must specify a benchmark" << std::endl;
-					goto usage;
-				case PingPong:
-					nnodes = 1;
-					switch(argc - optind) {
-					case 0: break;
-					case 1:
-						try {
-							arg = optarg = argv[optind];
-							nnodes = stoul(optarg, &len);
-							if(len != arg.size()) { throw std::invalid_argument(""); }
-						} catch(std::invalid_argument &) {
-							std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl;
-							goto usage;
-						}
-						break;
-					default:
-						std::cerr << "'PingPong' benchmark doesn't accept more than 1 extra arguments" << std::endl;
-						goto usage;
-					}
-					break;
-				case Producer:
-					nnodes = 32;
-					switch(argc - optind) {
-					case 0: break;
-					case 1:
-						try {
-							arg = optarg = argv[optind];
-							nnodes = stoul(optarg, &len);
-							if(len != arg.size()) { throw std::invalid_argument(""); }
-						} catch(std::invalid_argument &) {
-							std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl;
-							goto usage;
-						}
-						break;
-					default:
-						std::cerr << "'Producer' benchmark doesn't accept more than 1 extra arguments" << std::endl;
-						goto usage;
-					}
-					break;
-				case Churn:
-					nnodes = 100;
-					nslots = 100;
-					switch(argc - optind) {
-					case 0: break;
-					case 1:
-						try {
-							arg = optarg = argv[optind];
-							nnodes = stoul(optarg, &len);
-							if(len != arg.size()) { throw std::invalid_argument(""); }
-							nslots = nnodes;
-						} catch(std::invalid_argument &) {
-							std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl;
-							goto usage;
-						}
-						break;
-					case 2:
-						try {
-							arg = optarg = argv[optind];
-							nnodes = stoul(optarg, &len);
-							if(len != arg.size()) { throw std::invalid_argument(""); }
-						} catch(std::invalid_argument &) {
-							std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl;
-							goto usage;
-						}
-						try {
-							arg = optarg = argv[optind + 1];
-							nslots = stoul(optarg, &len);
-							if(len != arg.size()) { throw std::invalid_argument(""); }
-						} catch(std::invalid_argument &) {
-							std::cerr << "Number of slots must be a positive integer, was " << arg << std::endl;
-							goto usage;
-						}
-						break;
-					default:
-						std::cerr << "'Churn' benchmark doesn't accept more than 2 extra arguments" << std::endl;
-						goto usage;
-					}
-					break;
-				case Fairness:
-					nnodes = 1;
-					switch(argc - optind) {
-					case 0: break;
-					case 1:
-						arg = optarg = argv[optind];
-						out = arg;
-						break;
-					default:
-						std::cerr << "'Churn' benchmark doesn't accept more than 2 extra arguments" << std::endl;
-						goto usage;
-					}
-				}
-				goto run;
-			// Benchmarks
-			case 'b':
-				if(benchmark != NONE) {
-					std::cerr << "Only when benchmark can be run" << std::endl;
-					goto usage;
-				}
-				if(iequals(arg, "churn")) {
-					benchmark = Churn;
-					break;
-				}
-				if(iequals(arg, "pingpong")) {
-					benchmark = PingPong;
-					break;
-				}
-				if(iequals(arg, "producer")) {
-					benchmark = Producer;
-					break;
-				}
-				if(iequals(arg, "fairness")) {
-					benchmark = Fairness;
-					break;
-				}
-				std::cerr << "Unkown benchmark " << arg << std::endl;
-				goto usage;
-			// Numeric Arguments
-			case 'd':
-				try {
-					duration = stod(optarg, &len);
-					if(len != arg.size()) { throw std::invalid_argument(""); }
-				} catch(std::invalid_argument &) {
-					std::cerr << "Duration must be a valid double, was " << arg << std::endl;
-					goto usage;
-				}
-				break;
-			case 't':
-				try {
-					nthreads = stoul(optarg, &len);
-					if(len != arg.size()) { throw std::invalid_argument(""); }
-				} catch(std::invalid_argument &) {
-					std::cerr << "Number of threads must be a positive integer, was " << arg << std::endl;
-					goto usage;
-				}
-				break;
-			case 'q':
-				try {
-					nqueues = stoul(optarg, &len);
-					if(len != arg.size()) { throw std::invalid_argument(""); }
-				} catch(std::invalid_argument &) {
-					std::cerr << "Number of queues must be a positive integer, was " << arg << std::endl;
-					goto usage;
-				}
-				break;
-			// Other cases
-			default: /* ? */
-				std::cerr << opt << std::endl;
-			usage:
-				std::cerr << "Usage: " << argv[0] << ": [options] -b churn [NNODES] [NSLOTS = NNODES]" << std::endl;
-				std::cerr << "  or:  " << argv[0] << ": [options] -b pingpong [NNODES]" << std::endl;
-				std::cerr << "  or:  " << argv[0] << ": [options] -b producer [NNODES]" << std::endl;
-				std::cerr << std::endl;
-				std::cerr << "  -d, --duration=DURATION  Duration of the experiment, in seconds" << std::endl;
-				std::cerr << "  -t, --nthreads=NTHREADS  Number of kernel threads" << std::endl;
-				std::cerr << "  -q, --nqueues=NQUEUES    Number of queues per threads" << std::endl;
-				std::exit(1);
-		}
-	}
-	run:
-
-	check_cache_line_size();
-
-	std::cout << "Running " << nthreads << " threads (" << (nthreads * nqueues) << " queues) for " << duration << " seconds" << std::endl;
-	std::cout << "Relaxed list variant: " << LIST_VARIANT<Node>::name() << std::endl;
-	switch(benchmark) {
-		case Churn:
-			runChurn(nthreads, nqueues, duration, nnodes, nslots);
-			break;
-		case PingPong:
-			runPingPong(nthreads, nqueues, duration, nnodes);
-			break;
-		case Producer:
-			runProducer(nthreads, nqueues, duration, nnodes);
-			break;
-		case Fairness:
-			runFairness(nthreads, nqueues, duration, nnodes, out);
-			break;
-		default:
-			abort();
-	}
-	return 0;
-}
-
-const char * __my_progname = "Relaxed List";
-
-struct rgb_t {
-    double r;       // a fraction between 0 and 1
-    double g;       // a fraction between 0 and 1
-    double b;       // a fraction between 0 and 1
-};
-
-struct hsv_t {
-    double h;       // angle in degrees
-    double s;       // a fraction between 0 and 1
-    double v;       // a fraction between 0 and 1
-};
-
-rgb_t hsv2rgb(hsv_t in) {
-	double hh, p, q, t, ff;
-	long   i;
-	rgb_t  out;
-
-	if(in.s <= 0.0) {       // < is bogus, just shuts up warnings
-		out.r = in.v;
-		out.g = in.v;
-		out.b = in.v;
-		return out;
-	}
-	hh = in.h;
-	if(hh >= 360.0) hh = 0.0;
-	hh /= 60.0;
-	i = (long)hh;
-	ff = hh - i;
-	p = in.v * (1.0 - in.s);
-	q = in.v * (1.0 - (in.s * ff));
-	t = in.v * (1.0 - (in.s * (1.0 - ff)));
-
-	switch(i) {
-	case 0:
-		out.r = in.v;
-		out.g = t;
-		out.b = p;
-		break;
-	case 1:
-		out.r = q;
-		out.g = in.v;
-		out.b = p;
-		break;
-	case 2:
-		out.r = p;
-		out.g = in.v;
-		out.b = t;
-		break;
-
-	case 3:
-		out.r = p;
-		out.g = q;
-		out.b = in.v;
-		break;
-	case 4:
-		out.r = t;
-		out.g = p;
-		out.b = in.v;
-		break;
-	case 5:
-	default:
-		out.r = in.v;
-		out.g = p;
-		out.b = q;
-		break;
-	}
-	return out;
-}
-
-// void save_fairness(const int data[], int factor, unsigned nthreads, size_t columns, size_t rows, const std::string & output) {
-// 	std::ofstream os(output);
-// 	os << "<html>\n";
-// 	os << "<head>\n";
-// 	os << "<style>\n";
-// 	os << "</style>\n";
-// 	os << "</head>\n";
-// 	os << "<body>\n";
-// 	os << "<table style=\"width=100%\">\n";
-
-// 	size_t idx = 0;
-// 	for(size_t r = 0ul; r < rows; r++) {
-// 		os << "<tr>\n";
-// 		for(size_t c = 0ul; c < columns; c++) {
-// 			os << "<td class=\"custom custom" << data[idx] << "\"></td>\n";
-// 			idx++;
-// 		}
-// 		os << "</tr>\n";
-// 	}
-
-// 	os << "</table>\n";
-// 	os << "</body>\n";
-// 	os << "</html>\n";
-// 	os << std::endl;
-// }
-
-// #include <png.h>
-// #include <setjmp.h>
-
-/*
-void save_fairness(const int data[], int factor, unsigned nthreads, size_t columns, size_t rows, const std::string & output) {
-	int width  = columns * factor;
-	int height = rows / factor;
-
-	int code = 0;
-	int idx = 0;
-	FILE *fp = NULL;
-	png_structp png_ptr = NULL;
-	png_infop info_ptr = NULL;
-	png_bytep row = NULL;
-
-	// Open file for writing (binary mode)
-	fp = fopen(output.c_str(), "wb");
-	if (fp == NULL) {
-		fprintf(stderr, "Could not open file %s for writing\n", output.c_str());
-		code = 1;
-		goto finalise;
-	}
-
-	   // Initialize write structure
-	png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
-	if (png_ptr == NULL) {
-		fprintf(stderr, "Could not allocate write struct\n");
-		code = 1;
-		goto finalise;
-	}
-
-	// Initialize info structure
-	info_ptr = png_create_info_struct(png_ptr);
-	if (info_ptr == NULL) {
-		fprintf(stderr, "Could not allocate info struct\n");
-		code = 1;
-		goto finalise;
-	}
-
-	// Setup Exception handling
-	if (setjmp(png_jmpbuf(png_ptr))) {
-		fprintf(stderr, "Error during png creation\n");
-		code = 1;
-		goto finalise;
-	}
-
-	png_init_io(png_ptr, fp);
-
-	// Write header (8 bit colour depth)
-	png_set_IHDR(png_ptr, info_ptr, width, height,
-		8, PNG_COLOR_TYPE_RGB, PNG_INTERLACE_NONE,
-		PNG_COMPRESSION_TYPE_BASE, PNG_FILTER_TYPE_BASE);
-
-	png_write_info(png_ptr, info_ptr);
-
-	// Allocate memory for one row (3 bytes per pixel - RGB)
-	row = (png_bytep) malloc(3 * width * sizeof(png_byte));
-
-	// Write image data
-	int x, y;
-	for (y=0 ; y<height ; y++) {
-		for (x=0 ; x<width ; x++) {
-			auto & r = row[(x * 3) + 0];
-			auto & g = row[(x * 3) + 1];
-			auto & b = row[(x * 3) + 2];
-			assert(idx < (rows * columns));
-			int color = data[idx] - 1;
-			assert(color < nthreads);
-			assert(color >= 0);
-			idx++;
-
-			double angle = double(color) / double(nthreads);
-
-			auto c = hsv2rgb({ 360.0 * angle, 0.8, 0.8 });
-
-			r = char(c.r * 255.0);
-			g = char(c.g * 255.0);
-			b = char(c.b * 255.0);
-
-		}
-		png_write_row(png_ptr, row);
-	}
-
-	assert(idx == (rows * columns));
-
-	// End write
-	png_write_end(png_ptr, NULL);
-
-	finalise:
-	if (fp != NULL) fclose(fp);
-	if (info_ptr != NULL) png_free_data(png_ptr, info_ptr, PNG_FREE_ALL, -1);
-	if (png_ptr != NULL) png_destroy_write_struct(&png_ptr, (png_infopp)NULL);
-	if (row != NULL) free(row);
-}
-*/
Index: doc/theses/thierry_delisle_PhD/code/relaxed_list.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/relaxed_list.hpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,555 +1,0 @@
-#pragma once
-#define LIST_VARIANT relaxed_list
-
-#define VANILLA 0
-#define SNZI 1
-#define BITMASK 2
-#define DISCOVER 3
-#define SNZM 4
-#define BIAS 5
-#define BACK 6
-#define BACKBIAS 7
-
-#ifndef VARIANT
-#define VARIANT VANILLA
-#endif
-
-#ifndef NO_STATS
-#include <iostream>
-#endif
-
-#include <cmath>
-#include <functional>
-#include <memory>
-#include <mutex>
-#include <thread>
-#include <type_traits>
-
-#include "assert.hpp"
-#include "utils.hpp"
-#include "links.hpp"
-#include "snzi.hpp"
-#include "snzi-packed.hpp"
-#include "snzm.hpp"
-
-using namespace std;
-
-struct pick_stat {
-	struct {
-		size_t attempt = 0;
-		size_t success = 0;
-		size_t local = 0;
-	} push;
-	struct {
-		size_t attempt = 0;
-		size_t success = 0;
-		size_t mask_attempt = 0;
-		size_t mask_reset = 0;
-		size_t local = 0;
-	} pop;
-};
-
-struct empty_stat {
-	struct {
-		size_t value = 0;
-		size_t count = 0;
-	} push;
-	struct {
-		size_t value = 0;
-		size_t count = 0;
-	} pop;
-};
-
-template<typename node_t>
-class __attribute__((aligned(128))) relaxed_list {
-	static_assert(std::is_same<decltype(node_t::_links), _LinksFields_t<node_t>>::value, "Node must have a links field");
-
-public:
-	static const char * name() {
-		const char * names[] = {
-			"RELAXED: VANILLA",
-			"RELAXED: SNZI",
-			"RELAXED: BITMASK",
-			"RELAXED: SNZI + DISCOVERED MASK",
-			"RELAXED: SNZI + MASK",
-			"RELAXED: SNZI + LOCAL BIAS",
-			"RELAXED: SNZI + REVERSE RNG",
-			"RELAXED: SNZI + LOCAL BIAS + REVERSE RNG"
-		};
-		return names[VARIANT];
-	}
-
-	relaxed_list(unsigned numThreads, unsigned numQueues)
-		: numLists(numThreads * numQueues)
-	  	, lists(new intrusive_queue_t<node_t>[numLists])
-		#if VARIANT == SNZI || VARIANT == BACK
-			, snzi( std::log2( numLists / (2 * numQueues) ), 2 )
-		#elif VARIANT == BIAS || VARIANT == BACKBIAS
-			#ifdef SNZI_PACKED
-				, snzi( std::ceil( std::log2(numLists) ) )
-			#else
-				, snzi( std::log2( numLists / (2 * numQueues) ), 2 )
-			#endif
-		#elif VARIANT == SNZM || VARIANT == DISCOVER
-			, snzm( numLists )
-		#endif
-	{
-		assertf(7 * 8 * 8 >= numLists, "List currently only supports 448 sublists");
-		std::cout << "Constructing Relaxed List with " << numLists << std::endl;
-	}
-
-	~relaxed_list() {
-		std::cout << "Destroying Relaxed List" << std::endl;
-		lists.reset();
-	}
-
-    	__attribute__((noinline, hot)) void push(node_t * node) {
-		node->_links.ts = rdtscl();
-
-		while(true) {
-			// Pick a random list
-			unsigned i = idx_from_r(tls.rng1.next(), VARIANT == BIAS || VARIANT == BACKBIAS);
-
-			#ifndef NO_STATS
-				tls.pick.push.attempt++;
-			#endif
-
-			// If we can't lock it retry
-			if( !lists[i].lock.try_lock() ) continue;
-
-			#if VARIANT == VANILLA || VARIANT == BITMASK
-				__attribute__((unused)) int num = numNonEmpty;
-			#endif
-
-			// Actually push it
-			if(lists[i].push(node)) {
-				#if VARIANT == DISCOVER
-					size_t qword = i >> 6ull;
-					size_t bit   = i & 63ull;
-					assert(qword == 0);
-					bts(tls.mask, bit);
-					snzm.arrive(i);
-				#elif VARIANT == SNZI || VARIANT == BIAS
-					snzi.arrive(i);
-				#elif VARIANT == BACK || VARIANT == BACKBIAS
-					snzi.arrive(i);
-					tls.rng2.set_raw_state( tls.rng1.get_raw_state());
-				#elif VARIANT == SNZM
-					snzm.arrive(i);
-				#elif VARIANT == BITMASK
-					numNonEmpty++;
-					size_t qword = i >> 6ull;
-					size_t bit   = i & 63ull;
-					assertf((list_mask[qword] & (1ul << bit)) == 0, "Before set %zu:%zu (%u), %zx & %zx", qword, bit, i, list_mask[qword].load(), (1ul << bit));
-					__attribute__((unused)) bool ret = bts(list_mask[qword], bit);
-					assert(!ret);
-					assertf((list_mask[qword] & (1ul << bit)) != 0, "After set %zu:%zu (%u), %zx & %zx", qword, bit, i, list_mask[qword].load(), (1ul << bit));
-				#else
-					numNonEmpty++;
-				#endif
-			}
-			#if VARIANT == VANILLA || VARIANT == BITMASK
-				assert(numNonEmpty <= (int)numLists);
-			#endif
-
-			// Unlock and return
-			lists[i].lock.unlock();
-
-			#ifndef NO_STATS
-				tls.pick.push.success++;
-				#if VARIANT == VANILLA || VARIANT == BITMASK
-					tls.empty.push.value += num;
-					tls.empty.push.count += 1;
-				#endif
-			#endif
-			return;
-		}
-    	}
-
-	__attribute__((noinline, hot)) node_t * pop() {
-		#if VARIANT == DISCOVER
-			assert(numLists <= 64);
-			while(snzm.query()) {
-				tls.pick.pop.mask_attempt++;
-				unsigned i, j;
-				{
-					// Pick first list totally randomly
-					i = tls.rng1.next() % numLists;
-
-					// Pick the other according to the bitmask
-					unsigned r = tls.rng1.next();
-
-					size_t mask = tls.mask.load(std::memory_order_relaxed);
-					if(mask == 0) {
-						tls.pick.pop.mask_reset++;
-						mask = (1U << numLists) - 1;
-						tls.mask.store(mask, std::memory_order_relaxed);
-					}
-
-					unsigned b = rand_bit(r, mask);
-
-					assertf(b < 64, "%zu %u", mask, b);
-
-					j = b;
-
-					assert(j < numLists);
-				}
-
-				if(auto node = try_pop(i, j)) return node;
-			}
-		#elif VARIANT == SNZI
-			while(snzi.query()) {
-				// Pick two lists at random
-				int i = tls.rng1.next() % numLists;
-				int j = tls.rng1.next() % numLists;
-
-				if(auto node = try_pop(i, j)) return node;
-			}
-
-		#elif VARIANT == BACK
-			while(snzi.query()) {
-				// Pick two lists at random
-				int i = tls.rng2.prev() % numLists;
-				int j = tls.rng2.prev() % numLists;
-
-				if(auto node = try_pop(i, j)) return node;
-			}
-
-		#elif VARIANT == BACKBIAS
-			while(snzi.query()) {
-				// Pick two lists at random
-				int i = idx_from_r(tls.rng2.prev(), true);
-				int j = idx_from_r(tls.rng2.prev(), true);
-
-				if(auto node = try_pop(i, j)) return node;
-			}
-
-		#elif VARIANT == BIAS
-			while(snzi.query()) {
-				// Pick two lists at random
-				unsigned ri = tls.rng1.next();
-				unsigned i;
-				unsigned j = tls.rng1.next();
-				if(0 == (ri & 0xF)) {
-					i = (ri >> 4) % numLists;
-				} else {
-					i = tls.my_queue + ((ri >> 4) % 4);
-					j = tls.my_queue + ((j >> 4) % 4);
-					tls.pick.pop.local++;
-				}
-				i %= numLists;
-				j %= numLists;
-
-				if(auto node = try_pop(i, j)) return node;
-			}
-		#elif VARIANT == SNZM
-			//*
-			while(snzm.query()) {
-				tls.pick.pop.mask_attempt++;
-				unsigned i, j;
-				{
-					// Pick two random number
-					unsigned ri = tls.rng1.next();
-					unsigned rj = tls.rng1.next();
-
-					// Pick two nodes from it
-					unsigned wdxi = ri & snzm.mask;
-					// unsigned wdxj = rj & snzm.mask;
-
-					// Get the masks from the nodes
-					// size_t maski = snzm.masks(wdxi);
-					size_t maskj = snzm.masks(wdxj);
-
-					if(maski == 0 && maskj == 0) continue;
-
-					#if defined(__BMI2__)
-						uint64_t idxsi = _pext_u64(snzm.indexes, maski);
-						// uint64_t idxsj = _pext_u64(snzm.indexes, maskj);
-
-						auto pi = __builtin_popcountll(maski);
-						// auto pj = __builtin_popcountll(maskj);
-
-						ri = pi ? ri & ((pi >> 3) - 1) : 0;
-						rj = pj ? rj & ((pj >> 3) - 1) : 0;
-
-						unsigned bi = (idxsi >> (ri << 3)) & 0xff;
-						unsigned bj = (idxsj >> (rj << 3)) & 0xff;
-					#else
-						unsigned bi = rand_bit(ri >> snzm.depth, maski);
-						unsigned bj = rand_bit(rj >> snzm.depth, maskj);
-					#endif
-
-					i = (bi << snzm.depth) | wdxi;
-					j = (bj << snzm.depth) | wdxj;
-
-					/* paranoid */ assertf(i < numLists, "%u %u", bj, wdxi);
-					/* paranoid */ assertf(j < numLists, "%u %u", bj, wdxj);
-				}
-
-				if(auto node = try_pop(i, j)) return node;
-			}
-			/*/
-			while(snzm.query()) {
-				// Pick two lists at random
-				int i = tls.rng1.next() % numLists;
-				int j = tls.rng1.next() % numLists;
-
-				if(auto node = try_pop(i, j)) return node;
-			}
-			//*/
-		#elif VARIANT == BITMASK
-			int nnempty;
-			while(0 != (nnempty = numNonEmpty)) {
-				tls.pick.pop.mask_attempt++;
-				unsigned i, j;
-				{
-					// Pick two lists at random
-					unsigned num = ((numLists - 1) >> 6) + 1;
-
-					unsigned ri = tls.rng1.next();
-					unsigned rj = tls.rng1.next();
-
-					unsigned wdxi = (ri >> 6u) % num;
-					unsigned wdxj = (rj >> 6u) % num;
-
-					size_t maski = list_mask[wdxi].load(std::memory_order_relaxed);
-					size_t maskj = list_mask[wdxj].load(std::memory_order_relaxed);
-
-					if(maski == 0 && maskj == 0) continue;
-
-					unsigned bi = rand_bit(ri, maski);
-					unsigned bj = rand_bit(rj, maskj);
-
-					assertf(bi < 64, "%zu %u", maski, bi);
-					assertf(bj < 64, "%zu %u", maskj, bj);
-
-					i = bi | (wdxi << 6);
-					j = bj | (wdxj << 6);
-
-					assertf(i < numLists, "%u", wdxi << 6);
-					assertf(j < numLists, "%u", wdxj << 6);
-				}
-
-				if(auto node = try_pop(i, j)) return node;
-			}
-		#else
-			while(numNonEmpty != 0) {
-				// Pick two lists at random
-				int i = tls.rng1.next() % numLists;
-				int j = tls.rng1.next() % numLists;
-
-				if(auto node = try_pop(i, j)) return node;
-			}
-		#endif
-
-		return nullptr;
-    	}
-
-private:
-	node_t * try_pop(unsigned i, unsigned j) {
-		#ifndef NO_STATS
-			tls.pick.pop.attempt++;
-		#endif
-
-		#if VARIANT == DISCOVER
-			if(lists[i].ts() > 0) bts(tls.mask, i); else btr(tls.mask, i);
-			if(lists[j].ts() > 0) bts(tls.mask, j); else btr(tls.mask, j);
-		#endif
-
-		// Pick the bet list
-		int w = i;
-		if( __builtin_expect(lists[j].ts() != 0, true) ) {
-			w = (lists[i].ts() < lists[j].ts()) ? i : j;
-		}
-
-		auto & list = lists[w];
-		// If list looks empty retry
-		if( list.ts() == 0 ) return nullptr;
-
-		// If we can't get the lock retry
-		if( !list.lock.try_lock() ) return nullptr;
-
-		#if VARIANT == VANILLA || VARIANT == BITMASK
-			__attribute__((unused)) int num = numNonEmpty;
-		#endif
-
-		// If list is empty, unlock and retry
-		if( list.ts() == 0 ) {
-			list.lock.unlock();
-			return nullptr;
-		}
-
-		// Actually pop the list
-		node_t * node;
-		bool emptied;
-		std::tie(node, emptied) = list.pop();
-		assert(node);
-
-		if(emptied) {
-			#if VARIANT == DISCOVER
-				size_t qword = w >> 6ull;
-				size_t bit   = w & 63ull;
-				assert(qword == 0);
-				__attribute__((unused)) bool ret = btr(tls.mask, bit);
-				snzm.depart(w);
-			#elif VARIANT == SNZI || VARIANT == BIAS || VARIANT == BACK || VARIANT == BACKBIAS
-				snzi.depart(w);
-			#elif VARIANT == SNZM
-				snzm.depart(w);
-			#elif VARIANT == BITMASK
-				numNonEmpty--;
-				size_t qword = w >> 6ull;
-				size_t bit   = w & 63ull;
-				assert((list_mask[qword] & (1ul << bit)) != 0);
-				__attribute__((unused)) bool ret = btr(list_mask[qword], bit);
-				assert(ret);
-				assert((list_mask[qword] & (1ul << bit)) == 0);
-			#else
-				numNonEmpty--;
-			#endif
-		}
-
-		// Unlock and return
-		list.lock.unlock();
-		#if VARIANT == VANILLA || VARIANT == BITMASK
-			assert(numNonEmpty >= 0);
-		#endif
-		#ifndef NO_STATS
-			tls.pick.pop.success++;
-			#if VARIANT == VANILLA || VARIANT == BITMASK
-				tls.empty.pop.value += num;
-				tls.empty.pop.count += 1;
-			#endif
-		#endif
-		return node;
-	}
-
-	inline unsigned idx_from_r(unsigned r, bool bias) {
-		unsigned i;
-		if(bias) {
-			if(0 == (r & 0x3F)) {
-				i = r >> 6;
-			} else {
-				i = tls.my_queue + ((r >> 6) % 4);
-				tls.pick.push.local++;
-			}
-		} else {
-			i = r;
-		}
-		return i % numLists;
-	}
-
-public:
-
-	static __attribute__((aligned(128))) thread_local struct TLS {
-		Random     rng1 = { unsigned(std::hash<std::thread::id>{}(std::this_thread::get_id()) ^ rdtscl()) };
-		Random     rng2 = { unsigned(std::hash<std::thread::id>{}(std::this_thread::get_id()) ^ rdtscl()) };
-		unsigned   my_queue = (ticket++) * 4;
-		pick_stat  pick;
-		empty_stat empty;
-		__attribute__((aligned(64))) std::atomic_size_t mask = { 0 };
-	} tls;
-
-private:
-	const unsigned numLists;
-    	__attribute__((aligned(64))) std::unique_ptr<intrusive_queue_t<node_t> []> lists;
-private:
-	#if VARIANT == SNZI || VARIANT == BACK
-		snzi_t snzi;
-	#elif VARIANT == BIAS || VARIANT == BACKBIAS
-		#ifdef SNZI_PACKED
-			snzip_t snzi;
-		#else
-			snzi_t snzi;
-		#endif
-	#elif VARIANT == SNZM || VARIANT == DISCOVER
-		snzm_t snzm;
-	#else
-		std::atomic_int numNonEmpty  = { 0 };  // number of non-empty lists
-	#endif
-	#if VARIANT == BITMASK
-		std::atomic_size_t list_mask[7] = { {0}, {0}, {0}, {0}, {0}, {0}, {0} }; // which queues are empty
-	#endif
-
-public:
-	static const constexpr size_t sizeof_queue = sizeof(intrusive_queue_t<node_t>);
-	static std::atomic_uint32_t ticket;
-
-#ifndef NO_STATS
-	static void stats_tls_tally() {
-		global_stats.pick.push.attempt += tls.pick.push.attempt;
-		global_stats.pick.push.success += tls.pick.push.success;
-		global_stats.pick.push.local += tls.pick.push.local;
-		global_stats.pick.pop .attempt += tls.pick.pop.attempt;
-		global_stats.pick.pop .success += tls.pick.pop.success;
-		global_stats.pick.pop .mask_attempt += tls.pick.pop.mask_attempt;
-		global_stats.pick.pop .mask_reset += tls.pick.pop.mask_reset;
-		global_stats.pick.pop .local += tls.pick.pop.local;
-
-		global_stats.qstat.push.value += tls.empty.push.value;
-		global_stats.qstat.push.count += tls.empty.push.count;
-		global_stats.qstat.pop .value += tls.empty.pop .value;
-		global_stats.qstat.pop .count += tls.empty.pop .count;
-	}
-
-private:
-	static struct GlobalStats {
-		struct {
-			struct {
-				std::atomic_size_t attempt = { 0 };
-				std::atomic_size_t success = { 0 };
-				std::atomic_size_t local = { 0 };
-			} push;
-			struct {
-				std::atomic_size_t attempt = { 0 };
-				std::atomic_size_t success = { 0 };
-				std::atomic_size_t mask_attempt = { 0 };
-				std::atomic_size_t mask_reset = { 0 };
-				std::atomic_size_t local = { 0 };
-			} pop;
-		} pick;
-		struct {
-			struct {
-				std::atomic_size_t value = { 0 };
-				std::atomic_size_t count = { 0 };
-			} push;
-			struct {
-				std::atomic_size_t value = { 0 };
-				std::atomic_size_t count = { 0 };
-			} pop;
-		} qstat;
-	} global_stats;
-
-public:
-	static void stats_print(std::ostream & os ) {
-		std::cout << "----- Relaxed List Stats -----" << std::endl;
-
-		const auto & global = global_stats;
-
-		double push_sur = (100.0 * double(global.pick.push.success) / global.pick.push.attempt);
-		double pop_sur  = (100.0 * double(global.pick.pop .success) / global.pick.pop .attempt);
-		double mpop_sur = (100.0 * double(global.pick.pop .success) / global.pick.pop .mask_attempt);
-		double rpop_sur = (100.0 * double(global.pick.pop .success) / global.pick.pop .mask_reset);
-
-		double push_len = double(global.pick.push.attempt     ) / global.pick.push.success;
-		double pop_len  = double(global.pick.pop .attempt     ) / global.pick.pop .success;
-		double mpop_len = double(global.pick.pop .mask_attempt) / global.pick.pop .success;
-		double rpop_len = double(global.pick.pop .mask_reset  ) / global.pick.pop .success;
-
-		os << "Push   Pick   : " << push_sur << " %, len " << push_len << " (" << global.pick.push.attempt      << " / " << global.pick.push.success << ")\n";
-		os << "Pop    Pick   : " << pop_sur  << " %, len " << pop_len  << " (" << global.pick.pop .attempt      << " / " << global.pick.pop .success << ")\n";
-		os << "TryPop Pick   : " << mpop_sur << " %, len " << mpop_len << " (" << global.pick.pop .mask_attempt << " / " << global.pick.pop .success << ")\n";
-		os << "Pop M Reset   : " << rpop_sur << " %, len " << rpop_len << " (" << global.pick.pop .mask_reset   << " / " << global.pick.pop .success << ")\n";
-
-		double avgQ_push = double(global.qstat.push.value) / global.qstat.push.count;
-		double avgQ_pop  = double(global.qstat.pop .value) / global.qstat.pop .count;
-		double avgQ      = double(global.qstat.push.value + global.qstat.pop .value) / (global.qstat.push.count + global.qstat.pop .count);
-		os << "Push   Avg Qs : " << avgQ_push << " (" << global.qstat.push.count << "ops)\n";
-		os << "Pop    Avg Qs : " << avgQ_pop  << " (" << global.qstat.pop .count << "ops)\n";
-		os << "Global Avg Qs : " << avgQ      << " (" << (global.qstat.push.count + global.qstat.pop .count) << "ops)\n";
-
-		os << "Local Push    : " << global.pick.push.local << "\n";
-		os << "Local Pop     : " << global.pick.pop .local << "\n";
-	}
-#endif
-};
Index: doc/theses/thierry_delisle_PhD/code/relaxed_list_layout.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/relaxed_list_layout.cpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,23 +1,0 @@
-#define NO_IO
-#define NDEBUG
-#include "relaxed_list.hpp"
-
-struct __attribute__((aligned(64))) Node {
-	static std::atomic_size_t creates;
-	static std::atomic_size_t destroys;
-
-	_LinksFields_t<Node> _links;
-
-	int value;
-	Node(int value): value(value) {
-		creates++;
-	}
-
-	~Node() {
-		destroys++;
-	}
-};
-
-int main() {
-	return sizeof(relaxed_list<Node>) + relaxed_list<Node>::sizeof_queue;
-}
Index: doc/theses/thierry_delisle_PhD/code/runperf.sh
===================================================================
--- doc/theses/thierry_delisle_PhD/code/runperf.sh	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,14 +1,0 @@
-#!/bin/bash
-set -e
-
-name=$1
-event=$2
-
-shift 2
-
-echo "perf record -F 99 -a -g -o raw/$name.data -e $event -- $@ > raw/$name.out"
-perf record -F 99 -a -g -o raw/$name.data -e $event -- $@ > raw/$name.out
-echo "=============================="
-cat raw/$name.out
-echo "=============================="
-./process.sh $name
Index: doc/theses/thierry_delisle_PhD/code/scale.sh
===================================================================
--- doc/theses/thierry_delisle_PhD/code/scale.sh	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,7 +1,0 @@
-#!/bin/bash
-taskset -c 24-31 ./a.out -t  1 -b churn | grep --color -E "(ns|Ops|Running)"
-taskset -c 24-31 ./a.out -t  2 -b churn | grep --color -E "(ns|Ops|Running)"
-taskset -c 24-31 ./a.out -t  4 -b churn | grep --color -E "(ns|Ops|Running)"
-taskset -c 24-31 ./a.out -t  8 -b churn | grep --color -E "(ns|Ops|Running)"
-taskset -c 16-31 ./a.out -t 16 -b churn | grep --color -E "(ns|Ops|Running)"
-taskset -c  0-31 ./a.out -t 32 -b churn | grep --color -E "(ns|Ops|Running)"
Index: doc/theses/thierry_delisle_PhD/code/snzi-packed.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/snzi-packed.hpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,179 +1,0 @@
-#pragma once
-
-#define SNZI_PACKED
-
-#include "utils.hpp"
-
-
-class snzip_t {
-	class node;
-	class node_aligned;
-public:
-	const unsigned mask;
-	const int root;
-	std::unique_ptr<snzip_t::node[]> leafs;
-	std::unique_ptr<snzip_t::node_aligned[]> nodes;
-
-	snzip_t(unsigned depth);
-
-	void arrive(int idx) {
-		// idx >>= 1;
-		idx %= mask;
-		leafs[idx].arrive();
-	}
-
-	void depart(int idx) {
-		// idx >>= 1;
-		idx %= mask;
-		leafs[idx].depart();
-	}
-
-	bool query() const {
-		return nodes[root].query();
-	}
-
-
-private:
-	class __attribute__((aligned(32))) node {
-		friend class snzip_t;
-	private:
-
-		union val_t {
-			static constexpr char Half = -1;
-
-			uint64_t _all;
-			struct __attribute__((packed)) {
-				char cnt;
-				uint64_t ver:56;
-			};
-
-			bool cas(val_t & exp, char _cnt, uint64_t _ver) volatile {
-				val_t t;
-				t.ver = _ver;
-				t.cnt = _cnt;
-				/* paranoid */ assert(t._all == ((_ver << 8) | ((unsigned char)_cnt)));
-				return __atomic_compare_exchange_n(&this->_all, &exp._all, t._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
-			}
-
-			bool cas(val_t & exp, const val_t & tar) volatile {
-				return __atomic_compare_exchange_n(&this->_all, &exp._all, tar._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
-			}
-
-			val_t() : _all(0) {}
-			val_t(const volatile val_t & o) : _all(o._all) {}
-		};
-
-		//--------------------------------------------------
-		// Hierarchical node
-		void arrive_h() {
-			int undoArr = 0;
-			bool success = false;
-			while(!success) {
-				auto x{ value };
-				/* paranoid */ assert(x.cnt <= 120);
-				if( x.cnt >= 1 ) {
-					if( value.cas(x, x.cnt + 1, x.ver ) ) {
-						success = true;
-					}
-				}
-				/* paranoid */ assert(x.cnt <= 120);
-				if( x.cnt == 0 ) {
-					if( value.cas(x, val_t::Half, x.ver + 1) ) {
-						success = true;
-						x.cnt = val_t::Half;
-						x.ver = x.ver + 1;
-					}
-				}
-				/* paranoid */ assert(x.cnt <= 120);
-				if( x.cnt == val_t::Half ) {
-					/* paranoid */ assert(parent);
-					if(undoArr == 2) {
-						undoArr--;
-					} else {
-						parent->arrive();
-					}
-					if( !value.cas(x, 1, x.ver) ) {
-						undoArr = undoArr + 1;
-					}
-				}
-			}
-
-			for(int i = 0; i < undoArr; i++) {
-				/* paranoid */ assert(parent);
-				parent->depart();
-			}
-		}
-
-		void depart_h() {
-			while(true) {
-				auto x = (const val_t)value;
-				/* paranoid */ assertf(x.cnt >= 1, "%d", x.cnt);
-				if( value.cas( x, x.cnt - 1, x.ver ) ) {
-					if( x.cnt == 1 ) {
-						/* paranoid */ assert(parent);
-						parent->depart();
-					}
-					return;
-				}
-			}
-		}
-
-		//--------------------------------------------------
-		// Root node
-		void arrive_r() {
-			__atomic_fetch_add(&value._all, 1, __ATOMIC_SEQ_CST);
-		}
-
-		void depart_r() {
-			__atomic_fetch_sub(&value._all, 1, __ATOMIC_SEQ_CST);
-		}
-
-	private:
-		volatile val_t value;
-		class node * parent = nullptr;
-
-		bool is_root() {
-			return parent == nullptr;
-		}
-
-	public:
-		void arrive() {
-			if(is_root()) arrive_r();
-			else arrive_h();
-		}
-
-		void depart() {
-			if(is_root()) depart_r();
-			else depart_h();
-		}
-
-		bool query() {
-			/* paranoid */ assert(is_root());
-			return value._all > 0;
-		}
-	};
-
-	class __attribute__((aligned(128))) node_aligned : public node {};
-};
-
-snzip_t::snzip_t(unsigned depth)
-	: mask( std::pow(2, depth) )
-	, root( ((std::pow(2, depth + 1) - 1) / (2 -1)) - 1 - mask )
-	, leafs(new node[ mask ]())
-	, nodes(new node_aligned[ root + 1 ]())
-{
-	int width = std::pow(2, depth);
-	int hwdith = width / 2;
-	std::cout << "SNZI: " << depth << "x" << width << "(" << mask - 1 << ") " << (sizeof(snzip_t::node) * (root + 1)) << " bytes" << std::endl;
-	for(int i = 0; i < width; i++) {
-		int idx = i % hwdith;
-		std::cout << i << " -> " << idx + width << std::endl;
-		leafs[i].parent = &nodes[ idx ];
-	}
-
-	for(int i = 0; i < root; i++) {
-		int idx = (i / 2) + hwdith;
-		std::cout << i + width << " -> " << idx + width << std::endl;
-		nodes[i].parent = &nodes[ idx ];
-	}
-}
Index: doc/theses/thierry_delisle_PhD/code/snzi.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/snzi.hpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,164 +1,0 @@
-#pragma once
-
-#include "utils.hpp"
-
-
-class snzi_t {
-	class node;
-public:
-	const unsigned mask;
-	const int root;
-	std::unique_ptr<snzi_t::node[]> nodes;
-
-	snzi_t(unsigned depth, unsigned base = 2);
-
-	void arrive(int idx) {
-		idx >>= 2;
-		idx %= mask;
-		nodes[idx].arrive();
-	}
-
-	void depart(int idx) {
-		idx >>= 2;
-		idx %= mask;
-		nodes[idx].depart();
-	}
-
-	bool query() const {
-		return nodes[root].query();
-	}
-
-
-private:
-	class __attribute__((aligned(128))) node {
-		friend class snzi_t;
-	private:
-
-		union val_t {
-			static constexpr char Half = -1;
-
-			uint64_t _all;
-			struct __attribute__((packed)) {
-				char cnt;
-				uint64_t ver:56;
-			};
-
-			bool cas(val_t & exp, char _cnt, uint64_t _ver) volatile {
-				val_t t;
-				t.ver = _ver;
-				t.cnt = _cnt;
-				/* paranoid */ assert(t._all == ((_ver << 8) | ((unsigned char)_cnt)));
-				return __atomic_compare_exchange_n(&this->_all, &exp._all, t._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
-			}
-
-			bool cas(val_t & exp, const val_t & tar) volatile {
-				return __atomic_compare_exchange_n(&this->_all, &exp._all, tar._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
-			}
-
-			val_t() : _all(0) {}
-			val_t(const volatile val_t & o) : _all(o._all) {}
-		};
-
-		//--------------------------------------------------
-		// Hierarchical node
-		void arrive_h() {
-			int undoArr = 0;
-			bool success = false;
-			while(!success) {
-				auto x{ value };
-				/* paranoid */ assert(x.cnt <= 120);
-				if( x.cnt >= 1 ) {
-					if( value.cas(x, x.cnt + 1, x.ver ) ) {
-						success = true;
-					}
-				}
-				/* paranoid */ assert(x.cnt <= 120);
-				if( x.cnt == 0 ) {
-					if( value.cas(x, val_t::Half, x.ver + 1) ) {
-						success = true;
-						x.cnt = val_t::Half;
-						x.ver = x.ver + 1;
-					}
-				}
-				/* paranoid */ assert(x.cnt <= 120);
-				if( x.cnt == val_t::Half ) {
-					/* paranoid */ assert(parent);
-					if(undoArr == 2) {
-						undoArr--;
-					} else {
-						parent->arrive();
-					}
-					if( !value.cas(x, 1, x.ver) ) {
-						undoArr = undoArr + 1;
-					}
-				}
-			}
-
-			for(int i = 0; i < undoArr; i++) {
-				/* paranoid */ assert(parent);
-				parent->depart();
-			}
-		}
-
-		void depart_h() {
-			while(true) {
-				auto x = (const val_t)value;
-				/* paranoid */ assertf(x.cnt >= 1, "%d", x.cnt);
-				if( value.cas( x, x.cnt - 1, x.ver ) ) {
-					if( x.cnt == 1 ) {
-						/* paranoid */ assert(parent);
-						parent->depart();
-					}
-					return;
-				}
-			}
-		}
-
-		//--------------------------------------------------
-		// Root node
-		void arrive_r() {
-			__atomic_fetch_add(&value._all, 1, __ATOMIC_SEQ_CST);
-		}
-
-		void depart_r() {
-			__atomic_fetch_sub(&value._all, 1, __ATOMIC_SEQ_CST);
-		}
-
-	private:
-		volatile val_t value;
-		class node * parent = nullptr;
-
-		bool is_root() {
-			return parent == nullptr;
-		}
-
-	public:
-		void arrive() {
-			if(is_root()) arrive_r();
-			else arrive_h();
-		}
-
-		void depart() {
-			if(is_root()) depart_r();
-			else depart_h();
-		}
-
-		bool query() {
-			/* paranoid */ assert(is_root());
-			return value._all > 0;
-		}
-	};
-};
-
-snzi_t::snzi_t(unsigned depth, unsigned base)
-	: mask( std::pow(base, depth) )
-	, root( ((std::pow(base, depth + 1) - 1) / (base -1)) - 1 )
-	, nodes(new node[ root + 1 ]())
-{
-	int width = std::pow(base, depth);
-	std::cout << "SNZI: " << depth << "x" << width << "(" << mask - 1 << ") " << (sizeof(snzi_t::node) * (root + 1)) << " bytes" << std::endl;
-	for(int i = 0; i < root; i++) {
-		std::cout << i << " -> " << (i / base) + width << std::endl;
-		nodes[i].parent = &nodes[(i / base) + width];
-	}
-}
Index: doc/theses/thierry_delisle_PhD/code/snzm.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/snzm.hpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,213 +1,0 @@
-#pragma once
-
-#include "utils.hpp"
-
-
-class snzm_t {
-	class node;
-public:
-	const unsigned depth;
-	const unsigned mask;
-	const int root;
-	std::unique_ptr<snzm_t::node[]> nodes;
-
-	#if defined(__BMI2__)
-		const uint64_t indexes = 0x0706050403020100;
-	#endif
-
-	snzm_t(unsigned numLists);
-
-	void arrive(int idx) {
-		int i = idx & mask;
-		nodes[i].arrive( idx >> depth);
-	}
-
-	void depart(int idx) {
-		int i = idx & mask;
-		nodes[i].depart( idx >> depth );
-	}
-
-	bool query() const {
-		return nodes[root].query();
-	}
-
-	uint64_t masks( unsigned node ) {
-		/* paranoid */ assert( (node & mask) == node );
-		#if defined(__BMI2__)
-			return nodes[node].mask_all;
-		#else
-			return nodes[node].mask;
-		#endif
-	}
-
-private:
-	class __attribute__((aligned(128))) node {
-		friend class snzm_t;
-	private:
-
-		union val_t {
-			static constexpr char Half = -1;
-
-			uint64_t _all;
-			struct __attribute__((packed)) {
-				char cnt;
-				uint64_t ver:56;
-			};
-
-			bool cas(val_t & exp, char _cnt, uint64_t _ver) volatile {
-				val_t t;
-				t.ver = _ver;
-				t.cnt = _cnt;
-				/* paranoid */ assert(t._all == ((_ver << 8) | ((unsigned char)_cnt)));
-				return __atomic_compare_exchange_n(&this->_all, &exp._all, t._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
-			}
-
-			bool cas(val_t & exp, const val_t & tar) volatile {
-				return __atomic_compare_exchange_n(&this->_all, &exp._all, tar._all, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
-			}
-
-			val_t() : _all(0) {}
-			val_t(const volatile val_t & o) : _all(o._all) {}
-		};
-
-		//--------------------------------------------------
-		// Hierarchical node
-		void arrive_h() {
-			int undoArr = 0;
-			bool success = false;
-			while(!success) {
-				auto x{ value };
-				/* paranoid */ assert(x.cnt <= 120);
-				if( x.cnt >= 1 ) {
-					if( value.cas(x, x.cnt + 1, x.ver ) ) {
-						success = true;
-					}
-				}
-				/* paranoid */ assert(x.cnt <= 120);
-				if( x.cnt == 0 ) {
-					if( value.cas(x, val_t::Half, x.ver + 1) ) {
-						success = true;
-						x.cnt = val_t::Half;
-						x.ver = x.ver + 1;
-					}
-				}
-				/* paranoid */ assert(x.cnt <= 120);
-				if( x.cnt == val_t::Half ) {
-					/* paranoid */ assert(parent);
-					parent->arrive();
-					if( !value.cas(x, 1, x.ver) ) {
-						undoArr = undoArr + 1;
-					}
-				}
-			}
-
-			for(int i = 0; i < undoArr; i++) {
-				/* paranoid */ assert(parent);
-				parent->depart();
-			}
-		}
-
-		void depart_h() {
-			while(true) {
-				auto x = (const val_t)value;
-				/* paranoid */ assertf(x.cnt >= 1, "%d", x.cnt);
-				if( value.cas( x, x.cnt - 1, x.ver ) ) {
-					if( x.cnt == 1 ) {
-						/* paranoid */ assert(parent);
-						parent->depart();
-					}
-					return;
-				}
-			}
-		}
-
-		//--------------------------------------------------
-		// Root node
-		void arrive_r() {
-			__atomic_fetch_add(&value._all, 1, __ATOMIC_SEQ_CST);
-		}
-
-		void depart_r() {
-			__atomic_fetch_sub(&value._all, 1, __ATOMIC_SEQ_CST);
-		}
-
-		//--------------------------------------------------
-		// Interface node
-		void arrive() {
-			/* paranoid */ assert(!is_leaf);
-			if(is_root()) arrive_r();
-			else arrive_h();
-		}
-
-		void depart() {
-			/* paranoid */ assert(!is_leaf);
-			if(is_root()) depart_r();
-			else depart_h();
-		}
-
-	private:
-		volatile val_t value;
-		#if defined(__BMI2__)
-			union __attribute__((packed)) {
-				volatile uint8_t mask[8];
-				volatile uint64_t mask_all;
-			};
-		#else
-			volatile size_t mask = 0;
-		#endif
-
-		class node * parent = nullptr;
-		bool is_leaf = false;
-
-		bool is_root() {
-			return parent == nullptr;
-		}
-
-	public:
-		void arrive( int bit ) {
-			/* paranoid */ assert( is_leaf );
-
-			arrive_h();
-			#if defined(__BMI2__)
-				/* paranoid */ assert( bit < 8 );
-				mask[bit] = 0xff;
-			#else
-				/* paranoid */ assert( (mask & ( 1 << bit )) == 0 );
-				__atomic_fetch_add( &mask, 1 << bit, __ATOMIC_RELAXED );
-			#endif
-
-		}
-
-		void depart( int bit ) {
-			/* paranoid */ assert( is_leaf );
-
-			#if defined(__BMI2__)
-				/* paranoid */ assert( bit < 8 );
-				mask[bit] = 0x00;
-			#else
-				/* paranoid */ assert( (mask & ( 1 << bit )) != 0 );
-				__atomic_fetch_sub( &mask, 1 << bit, __ATOMIC_RELAXED );
-			#endif
-			depart_h();
-		}
-
-		bool query() {
-			/* paranoid */ assert(is_root());
-			return value._all > 0;
-		}
-	};
-};
-
-snzm_t::snzm_t(unsigned numLists)
-	: depth( std::log2( numLists / 8 ) )
-	, mask( (1 << depth) - 1 )
-	, root( (1 << (depth + 1)) - 2 )
-	, nodes(new node[ root + 1 ]())
-{
-	int width = 1 << depth;
-	std::cout << "SNZI with Mask: " << depth << "x" << width << "(" << mask << ")" << std::endl;
-	for(int i = 0; i < root; i++) {
-		nodes[i].is_leaf = i < width;
-		nodes[i].parent = &nodes[(i / 2) + width ];
-	}
-}
Index: doc/theses/thierry_delisle_PhD/code/utils.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/utils.hpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,250 +1,0 @@
-#pragma once
-
-#include <cassert>
-#include <cstddef>
-#include <atomic>
-#include <chrono>
-#include <fstream>
-#include <iostream>
-
-#include <unistd.h>
-#include <sys/sysinfo.h>
-
-#include <x86intrin.h>
-
-// Barrier from
-class barrier_t {
-public:
-	barrier_t(size_t total)
-		: waiting(0)
-		, total(total)
-	{}
-
-	void wait(unsigned) {
-		size_t target = waiting++;
-		target = (target - (target % total)) + total;
-		while(waiting < target)
-			asm volatile("pause");
-
-		assert(waiting < (1ul << 60));
-    	}
-
-private:
-	std::atomic<size_t> waiting;
-	size_t total;
-};
-
-// class Random {
-// private:
-// 	unsigned int seed;
-// public:
-// 	Random(int seed) {
-// 		this->seed = seed;
-// 	}
-
-// 	/** returns pseudorandom x satisfying 0 <= x < n. **/
-// 	unsigned int next() {
-// 		seed ^= seed << 6;
-// 		seed ^= seed >> 21;
-// 		seed ^= seed << 7;
-// 		return seed;
-//     	}
-// };
-
-constexpr uint64_t extendedEuclidY(uint64_t a, uint64_t b);
-constexpr uint64_t extendedEuclidX(uint64_t a, uint64_t b){
-    return (b==0) ? 1 : extendedEuclidY(b, a - b * (a / b));
-}
-constexpr uint64_t extendedEuclidY(uint64_t a, uint64_t b){
-    return (b==0) ? 0 : extendedEuclidX(b, a - b * (a / b)) - (a / b) * extendedEuclidY(b, a - b * (a / b));
-}
-
-class Random {
-private:
-	uint64_t x;
-
-	static constexpr const uint64_t M  = 1ul << 48ul;
-	static constexpr const uint64_t A  = 25214903917;
-	static constexpr const uint64_t C  = 11;
-	static constexpr const uint64_t D  = 16;
-
-public:
-	static constexpr const uint64_t m  = M;
-	static constexpr const uint64_t a  = A;
-	static constexpr const uint64_t c  = C;
-	static constexpr const uint64_t d  = D;
-	static constexpr const uint64_t ai = extendedEuclidX(A, M);
-public:
-	Random(unsigned int seed) {
-		this->x = seed * a;
-	}
-
-	/** returns pseudorandom x satisfying 0 <= x < n. **/
-	unsigned int next() {
-		//nextx = (a * x + c) % m;
-		x = (A * x + C) & (M - 1);
-		return x >> D;
-	}
-	unsigned int prev() {
-		//prevx = (ainverse * (x - c)) mod m
-		unsigned int r = x >> D;
-		x = ai * (x - C) & (M - 1);
-		return r;
-	}
-
-	void set_raw_state(uint64_t _x) {
-		this->x = _x;
-	}
-
-	uint64_t get_raw_state() {
-		return this->x;
-	}
-};
-
-static inline long long rdtscl(void) {
-    unsigned int lo, hi;
-    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
-    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
-}
-
-static inline void affinity(int tid) {
-	static int cpus = get_nprocs();
-
-	cpu_set_t  mask;
-	CPU_ZERO(&mask);
-	int cpu = cpus - tid;  // Set CPU affinity to tid, starting from the end
-	CPU_SET(cpu, &mask);
-	auto result = sched_setaffinity(0, sizeof(mask), &mask);
-	if(result != 0) {
-		std::cerr << "Affinity set failed with " << result<< ", wanted " << cpu << std::endl;
-	}
-}
-
-static const constexpr std::size_t cache_line_size = 64;
-static inline void check_cache_line_size() {
-	std::cout << "Checking cache line size" << std::endl;
-	const std::string cache_file = "/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size";
-
-	std::ifstream ifs (cache_file, std::ifstream::in);
-
-	if(!ifs.good()) {
-		std::cerr << "Could not open file to check cache line size" << std::endl;
-		std::cerr << "Looking for: " << cache_file << std::endl;
-		std::exit(2);
-	}
-
-	size_t got;
-	ifs >> got;
-
-	ifs.close();
-
-	if(cache_line_size != got) {
-		std::cerr << "Cache line has incorrect size : " << got << std::endl;
-		std::exit(1);
-	}
-
-	std::cout << "Done" << std::endl;
-}
-
-using Clock = std::chrono::high_resolution_clock;
-using duration_t = std::chrono::duration<double>;
-using std::chrono::nanoseconds;
-
-template<typename Ratio, typename T>
-T duration_cast(T seconds) {
-	return std::chrono::duration_cast<std::chrono::duration<T, Ratio>>(std::chrono::duration<T>(seconds)).count();
-}
-
-static inline unsigned rand_bit(unsigned rnum, size_t mask) __attribute__((artificial));
-static inline unsigned rand_bit(unsigned rnum, size_t mask) {
-	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
-#if !defined(__BMI2__)
-	uint64_t v = mask;   // Input value to find position with rank r.
-	unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
-	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
-	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
-	unsigned int t;      // Bit count temporary.
-
-	// Do a normal parallel bit count for a 64-bit integer,
-	// but store all intermediate steps.
-	a =  v - ((v >> 1) & ~0UL/3);
-	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
-	c = (b + (b >> 4)) & ~0UL/0x11;
-	d = (c + (c >> 8)) & ~0UL/0x101;
-
-
-	t = (d >> 32) + (d >> 48);
-	// Now do branchless select!
-	s  = 64;
-	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
-	t  = (d >> (s - 16)) & 0xff;
-	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
-	t  = (c >> (s - 8)) & 0xf;
-	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
-	t  = (b >> (s - 4)) & 0x7;
-	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
-	t  = (a >> (s - 2)) & 0x3;
-	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
-	t  = (v >> (s - 1)) & 0x1;
-	s -= ((t - r) & 256) >> 8;
-	return s - 1;
-#else
-	uint64_t picked = _pdep_u64(1ul << bit, mask);
-	return picked ? __builtin_ctzl(picked) : 0;
-#endif
-}
-
-struct spinlock_t {
-	std::atomic_bool ll = { false };
-
-	inline void lock() {
-		while( __builtin_expect(ll.exchange(true),false) ) {
-			while(ll.load(std::memory_order_relaxed))
-				asm volatile("pause");
-		}
-	}
-
-	inline bool try_lock() {
-		return false == ll.exchange(true);
-	}
-
-	inline void unlock() {
-		ll.store(false, std::memory_order_release);
-	}
-
-	inline explicit operator bool() {
-		return ll.load(std::memory_order_relaxed);
-	}
-};
-
-static inline bool bts(std::atomic_size_t & target, size_t bit ) {
-	//*
-	int result = 0;
-	asm volatile(
-		"LOCK btsq %[bit], %[target]\n\t"
-		:"=@ccc" (result)
-		: [target] "m" (target), [bit] "r" (bit)
-	);
-	return result != 0;
-	/*/
-	size_t mask = 1ul << bit;
-	size_t ret = target.fetch_or(mask, std::memory_order_relaxed);
-	return (ret & mask) != 0;
-	//*/
-}
-
-static inline bool btr(std::atomic_size_t & target, size_t bit ) {
-	//*
-	int result = 0;
-	asm volatile(
-		"LOCK btrq %[bit], %[target]\n\t"
-		:"=@ccc" (result)
-		: [target] "m" (target), [bit] "r" (bit)
-	);
-	return result != 0;
-	/*/
-	size_t mask = 1ul << bit;
-	size_t ret = target.fetch_and(~mask, std::memory_order_relaxed);
-	return (ret & mask) != 0;
-	//*/
-}
Index: doc/theses/thierry_delisle_PhD/code/work_stealing.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/work_stealing.hpp	(revision f0c3120ef113f46364e3c1cf53b2ccae87a6da3b)
+++ 	(revision )
@@ -1,222 +1,0 @@
-#pragma once
-#define LIST_VARIANT work_stealing
-
-#include <cmath>
-#include <iomanip>
-#include <memory>
-#include <mutex>
-#include <type_traits>
-
-#include "assert.hpp"
-#include "utils.hpp"
-#include "links.hpp"
-#include "snzi.hpp"
-
-using namespace std;
-
-template<typename node_t>
-class __attribute__((aligned(128))) work_stealing {
-	static_assert(std::is_same<decltype(node_t::_links), _LinksFields_t<node_t>>::value, "Node must have a links field");
-
-public:
-	static const char * name() {
-		return "Work Stealing";
-	}
-
-	work_stealing(unsigned _numThreads, unsigned)
-		: numThreads(_numThreads)
-		, lists(new intrusive_queue_t<node_t>[numThreads])
-		, snzi( std::log2( numThreads / 2 ), 2 )
-
-	{
-		std::cout << "Constructing Work Stealer with " << numThreads << std::endl;
-	}
-
-	~work_stealing() {
-		std::cout << "Destroying Work Stealer" << std::endl;
-		lists.reset();
-	}
-
-	__attribute__((noinline, hot)) void push(node_t * node) {
-		node->_links.ts = rdtscl();
-		if( node->_links.hint > numThreads ) {
-			node->_links.hint = tls.rng.next() % numThreads;
-			tls.stat.push.nhint++;
-		}
-
-		unsigned i = node->_links.hint;
-		auto & list = lists[i];
-		list.lock.lock();
-
-		if(list.push( node )) {
-			snzi.arrive(i);
-		}
-
-		list.lock.unlock();
-	}
-
-	__attribute__((noinline, hot)) node_t * pop() {
-		node_t * node;
-		while(true) {
-			if(!snzi.query()) {
-				return nullptr;
-			}
-
-			{
-				unsigned i = tls.my_queue;
-				auto & list = lists[i];
-				if( list.ts() != 0 ) {
-					list.lock.lock();
-					if((node = try_pop(i))) {
-						tls.stat.pop.local.success++;
-						break;
-					}
-					else {
-						tls.stat.pop.local.elock++;
-					}
-				}
-				else {
-					tls.stat.pop.local.espec++;
-				}
-			}
-
-			tls.stat.pop.steal.tried++;
-
-			int i = tls.rng.next() % numThreads;
-			auto & list = lists[i];
-			if( list.ts() == 0 ) {
-				tls.stat.pop.steal.empty++;
-				continue;
-			}
-
-			if( !list.lock.try_lock() ) {
-				tls.stat.pop.steal.locked++;
-				continue;
-			}
-
-			if((node = try_pop(i))) {
-				tls.stat.pop.steal.success++;
-				break;
-			}
-		}
-
-		#if defined(READ)
-			const unsigned f = READ;
-			if(0 == (tls.it % f)) {
-				unsigned i = tls.it / f;
-				lists[i % numThreads].ts();
-			}
-			// lists[tls.it].ts();
-			tls.it++;
-		#endif
-
-
-		return node;
-	}
-
-private:
-	node_t * try_pop(unsigned i) {
-		auto & list = lists[i];
-
-		// If list is empty, unlock and retry
-		if( list.ts() == 0 ) {
-			list.lock.unlock();
-			return nullptr;
-		}
-
-			// Actually pop the list
-		node_t * node;
-		bool emptied;
-		std::tie(node, emptied) = list.pop();
-		assert(node);
-
-		if(emptied) {
-			snzi.depart(i);
-		}
-
-		// Unlock and return
-		list.lock.unlock();
-		return node;
-	}
-
-
-public:
-
-	static std::atomic_uint32_t ticket;
-	static __attribute__((aligned(128))) thread_local struct TLS {
-		Random     rng = { int(rdtscl()) };
-		unsigned   my_queue = ticket++;
-		#if defined(READ)
-			unsigned it = 0;
-		#endif
-		struct {
-			struct {
-				std::size_t nhint = { 0 };
-			} push;
-			struct {
-				struct {
-					std::size_t success = { 0 };
-					std::size_t espec = { 0 };
-					std::size_t elock = { 0 };
-				} local;
-				struct {
-					std::size_t tried   = { 0 };
-					std::size_t locked  = { 0 };
-					std::size_t empty   = { 0 };
-					std::size_t success = { 0 };
-				} steal;
-			} pop;
-		} stat;
-	} tls;
-
-private:
-	const unsigned numThreads;
-    	std::unique_ptr<intrusive_queue_t<node_t> []> lists;
-	__attribute__((aligned(64))) snzi_t snzi;
-
-#ifndef NO_STATS
-private:
-	static struct GlobalStats {
-		struct {
-			std::atomic_size_t nhint = { 0 };
-		} push;
-		struct {
-			struct {
-				std::atomic_size_t success = { 0 };
-				std::atomic_size_t espec = { 0 };
-				std::atomic_size_t elock = { 0 };
-			} local;
-			struct {
-				std::atomic_size_t tried   = { 0 };
-				std::atomic_size_t locked  = { 0 };
-				std::atomic_size_t empty   = { 0 };
-				std::atomic_size_t success = { 0 };
-			} steal;
-		} pop;
-	} global_stats;
-
-public:
-	static void stats_tls_tally() {
-		global_stats.push.nhint += tls.stat.push.nhint;
-		global_stats.pop.local.success += tls.stat.pop.local.success;
-		global_stats.pop.local.espec   += tls.stat.pop.local.espec  ;
-		global_stats.pop.local.elock   += tls.stat.pop.local.elock  ;
-		global_stats.pop.steal.tried   += tls.stat.pop.steal.tried  ;
-		global_stats.pop.steal.locked  += tls.stat.pop.steal.locked ;
-		global_stats.pop.steal.empty   += tls.stat.pop.steal.empty  ;
-		global_stats.pop.steal.success += tls.stat.pop.steal.success;
-	}
-
-	static void stats_print(std::ostream & os ) {
-		std::cout << "----- Work Stealing Stats -----" << std::endl;
-
-		double stealSucc = double(global_stats.pop.steal.success) / global_stats.pop.steal.tried;
-		os << "Push to new Q : " << std::setw(15) << global_stats.push.nhint << "\n";
-		os << "Local Pop     : " << std::setw(15) << global_stats.pop.local.success << "\n";
-		os << "Steal Pop     : " << std::setw(15) << global_stats.pop.steal.success << "(" << global_stats.pop.local.espec << "s, " << global_stats.pop.local.elock << "l)\n";
-		os << "Steal Success : " << std::setw(15) << stealSucc << "(" << global_stats.pop.steal.tried << " tries)\n";
-		os << "Steal Fails   : " << std::setw(15) << global_stats.pop.steal.empty << "e, " << global_stats.pop.steal.locked << "l\n";
-	}
-private:
-#endif
-};
