Index: doc/theses/thierry_delisle_PhD/.gitignore
===================================================================
--- doc/theses/thierry_delisle_PhD/.gitignore	(revision 9421f3d8aa85bb55de754037f302d2e96c392242)
+++ doc/theses/thierry_delisle_PhD/.gitignore	(revision 9421f3d8aa85bb55de754037f302d2e96c392242)
@@ -0,0 +1,9 @@
+*/a.out
+
+code/*.s
+code/relaxed_list
+code/layout.ast
+code/build/
+code/raw/
+
+!Makefile
Index: doc/theses/thierry_delisle_PhD/code/Makefile
===================================================================
--- doc/theses/thierry_delisle_PhD/code/Makefile	(revision 9421f3d8aa85bb55de754037f302d2e96c392242)
+++ doc/theses/thierry_delisle_PhD/code/Makefile	(revision 9421f3d8aa85bb55de754037f302d2e96c392242)
@@ -0,0 +1,22 @@
+
+
+CXXFLAGS = -O3 -g -Wall -Wextra -std=c++17 -DNO_STATS
+LDFLAGS = -pthread -latomic
+
+push:
+	clang++ relaxed_list.cpp -g -Wall -Wextra -std=c++17 -fsyntax-only &&  rsync -av relaxed_list.cpp relaxed_list.hpp utils.hpp assert.hpp scale.sh plg7a:~/workspace/sched/.
+
+relaxed_list: $(firstword $(MAKEFILE_LIST)) | build
+	clang++ relaxed_list.cpp $(CXXFLAGS) $(LDFLAGS) -MMD -MF build/$(@).d -o $(@)
+
+-include build/relaxed_list.d
+
+layout.ast: $(firstword $(MAKEFILE_LIST)) | build
+	clang++ relaxed_list_layout.cpp $(CXXFLAGS) -MMD -MF build/$(@).d -MT $(@) -E -o build/$(@).ii
+	clang++ -Xclang -fdump-record-layouts -fsyntax-only $(CXXFLAGS) build/$(@).ii > build/layout.ast.raw
+	cat build/$(@).raw > $(@)
+
+-include build/layout.ast.d
+
+build:
+	mkdir -p build
Index: doc/theses/thierry_delisle_PhD/code/bts_test.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/bts_test.cpp	(revision 9421f3d8aa85bb55de754037f302d2e96c392242)
+++ doc/theses/thierry_delisle_PhD/code/bts_test.cpp	(revision 9421f3d8aa85bb55de754037f302d2e96c392242)
@@ -0,0 +1,32 @@
+#include <cassert>
+#include <iostream>
+
+bool bts(volatile size_t & target, size_t bit ) {
+	bool result = false;
+	asm volatile(
+		"LOCK btsq %[bit], %[target]\n\t"
+		:"=c" (result)
+		: [target] "m" (target), [bit] "r" (bit)
+	);
+ 	return result;
+}
+
+bool btr(volatile size_t & target, size_t bit ) {
+	bool result = false;
+	asm volatile(
+		"LOCK btrq %[bit], %[target]\n\t"
+		:"=c" (result)
+		: [target] "m" (target), [bit] "r" (bit)
+	);
+ 	return result;
+}
+
+int main() {
+	volatile size_t i = 0;
+	std::cout << std::hex << i << std::endl;
+	assert(bts(i, 31));
+	std::cout << std::hex << i << std::endl;
+	assert(btr(i, 31));
+	std::cout << std::hex << i << std::endl;
+	return 0;
+}
Index: doc/theses/thierry_delisle_PhD/code/relaxed_list.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/relaxed_list.cpp	(revision b067d9b9c445bf3cee86e9d129e51949e669d4f5)
+++ doc/theses/thierry_delisle_PhD/code/relaxed_list.cpp	(revision 9421f3d8aa85bb55de754037f302d2e96c392242)
@@ -9,4 +9,5 @@
 #include <vector>
 
+#include <getopt.h>
 #include <unistd.h>
 #include <sys/sysinfo.h>
@@ -21,11 +22,8 @@
 
 	int value;
-	Node(int value): value(value) {
-		creates++;
-	}
-
-	~Node() {
-		destroys++;
-	}
+
+	Node() { creates++; }
+	Node(int value): value(value) { creates++; }
+	~Node() { destroys++; }
 };
 
@@ -33,11 +31,15 @@
 std::atomic_size_t Node::destroys = { 0 };
 
-static const constexpr int nodes_per_threads = 128;
-struct NodeArray {
-	__attribute__((aligned(64))) Node * array[nodes_per_threads];
-	__attribute__((aligned(64))) char pad;
-};
-
 bool enable_stats = false;
+
+template<>
+thread_local relaxed_list<Node>::TLS relaxed_list<Node>::tls = {};
+
+template<>
+relaxed_list<Node>::intrusive_queue_t::stat::Dif relaxed_list<Node>::intrusive_queue_t::stat::dif = {};
+
+// ================================================================================================
+//                        UTILS
+// ================================================================================================
 
 struct local_stat_t {
@@ -49,130 +51,54 @@
 };
 
-__attribute__((noinline)) void run_body(
-	std::atomic<bool>& done,
-	Random & rand,
-	Node * (&my_nodes)[128],
-	local_stat_t & local,
-	relaxed_list<Node> & list
-) {
-	while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) {
-		int idx = rand.next() % nodes_per_threads;
-		if (auto node = my_nodes[idx]) {
-			local.crc_in += node->value;
-			list.push(node);
-			my_nodes[idx] = nullptr;
-			local.in++;
-		}
-		else if(auto node = list.pop()) {
-			local.crc_out += node->value;
-			my_nodes[idx] = node;
-			local.out++;
-		}
-		else {
-			local.empty++;
-		}
-	}
-}
-
-void run(unsigned nthread, unsigned nqueues, unsigned fill, double duration) {
-	// List being tested
-	relaxed_list<Node> list = { nthread * nqueues };
-
-	// Barrier for synchronization
-	barrier_t barrier(nthread + 1);
-
-	// Data to check everything is OK
+struct global_stat_t {
+	std::atomic_size_t in  = { 0 };
+	std::atomic_size_t out = { 0 };
+	std::atomic_size_t empty = { 0 };
+	std::atomic_size_t crc_in  = { 0 };
+	std::atomic_size_t crc_out = { 0 };
 	struct {
-		std::atomic_size_t in  = { 0 };
-		std::atomic_size_t out = { 0 };
-		std::atomic_size_t empty = { 0 };
-		std::atomic_size_t crc_in  = { 0 };
-		std::atomic_size_t crc_out = { 0 };
 		struct {
-			struct {
-				std::atomic_size_t attempt = { 0 };
-				std::atomic_size_t success = { 0 };
-			} push;
-			struct {
-				std::atomic_size_t attempt = { 0 };
-				std::atomic_size_t success = { 0 };
-			} pop;
-		} pick;
-	} global;
-
-	// Flag to signal termination
-	std::atomic_bool done  = { false };
-
-	// Prep nodes
-	std::cout << "Initializing ";
-	size_t nnodes  = 0;
-	size_t npushed = 0;
-	NodeArray all_nodes[nthread];
-	for(auto & nodes : all_nodes) {
-		Random rand(rdtscl());
-		for(auto & node : nodes.array) {
-			auto r = rand.next() % 100;
-			if(r < fill) {
-				node = new Node(rand.next() % 100);
-				nnodes++;
-			} else {
-				node = nullptr;
-			}
-		}
-
-		for(int i = 0; i < 10; i++) {
-			int idx = rand.next() % nodes_per_threads;
-			if (auto node = nodes.array[idx]) {
-				global.crc_in += node->value;
-				list.push(node);
-				npushed++;
-				nodes.array[idx] = nullptr;
-			}
-		}
-	}
-
-	std::cout << nnodes << " nodes " << fill << "% (" << npushed << " pushed)" << std::endl;
-
-	enable_stats = true;
-
-	std::thread * threads[nthread];
-	unsigned i = 1;
-	for(auto & t : threads) {
-		auto & my_nodes = all_nodes[i - 1].array;
-		t = new std::thread([&done, &list, &barrier, &global, &my_nodes](unsigned tid) {
-			Random rand(tid + rdtscl());
-
-			local_stat_t local;
-
-			// affinity(tid);
-
-			barrier.wait(tid);
-
-			// EXPERIMENT START
-
-			run_body(done, rand, my_nodes, local, list);
-
-			// EXPERIMENT END
-
-			barrier.wait(tid);
-
-			global.in    += local.in;
-			global.out   += local.out;
-			global.empty += local.empty;
-
-			for(auto node : my_nodes) {
-				delete node;
-			}
-
-			global.crc_in  += local.crc_in;
-			global.crc_out += local.crc_out;
-
-			global.pick.push.attempt += relaxed_list<Node>::tls.pick.push.attempt;
-			global.pick.push.success += relaxed_list<Node>::tls.pick.push.success;
-			global.pick.pop .attempt += relaxed_list<Node>::tls.pick.pop.attempt;
-			global.pick.pop .success += relaxed_list<Node>::tls.pick.pop.success;
-		}, i++);
-	}
-
+			std::atomic_size_t attempt = { 0 };
+			std::atomic_size_t success = { 0 };
+		} push;
+		struct {
+			std::atomic_size_t attempt = { 0 };
+			std::atomic_size_t success = { 0 };
+			std::atomic_size_t mask_attempt = { 0 };
+		} pop;
+	} pick;
+	struct {
+		struct {
+			std::atomic_size_t value = { 0 };
+			std::atomic_size_t count = { 0 };
+		} push;
+		struct {
+			std::atomic_size_t value = { 0 };
+			std::atomic_size_t count = { 0 };
+		} pop;
+	} qstat;
+};
+
+void tally_stats(global_stat_t & global, local_stat_t & local) {
+	global.in    += local.in;
+	global.out   += local.out;
+	global.empty += local.empty;
+
+	global.crc_in  += local.crc_in;
+	global.crc_out += local.crc_out;
+
+	global.pick.push.attempt += relaxed_list<Node>::tls.pick.push.attempt;
+	global.pick.push.success += relaxed_list<Node>::tls.pick.push.success;
+	global.pick.pop .attempt += relaxed_list<Node>::tls.pick.pop.attempt;
+	global.pick.pop .success += relaxed_list<Node>::tls.pick.pop.success;
+	global.pick.pop .mask_attempt += relaxed_list<Node>::tls.pick.pop.mask_attempt;
+
+	global.qstat.push.value += relaxed_list<Node>::tls.empty.push.value;
+	global.qstat.push.count += relaxed_list<Node>::tls.empty.push.count;
+	global.qstat.pop .value += relaxed_list<Node>::tls.empty.pop .value;
+	global.qstat.pop .count += relaxed_list<Node>::tls.empty.pop .count;
+}
+
+void waitfor(double & duration, barrier_t & barrier, std::atomic_bool & done) {
 	std::cout << "Starting" << std::endl;
 	auto before = Clock::now();
@@ -196,17 +122,7 @@
 	duration = durr.count();
 	std::cout << "\rClosing down" << std::endl;
-
-	for(auto t : threads) {
-		t->join();
-		delete t;
-	}
-
-	enable_stats = false;
-
-	while(auto node = list.pop()) {
-		global.crc_out += node->value;
-		delete node;
-	}
-
+}
+
+void print_stats(double duration, unsigned nthread, global_stat_t & global) {
 	assert(Node::creates == Node::destroys);
 	assert(global.crc_in == global.crc_out);
@@ -227,12 +143,247 @@
 		double push_sur = (100.0 * double(global.pick.push.success) / global.pick.push.attempt);
 		double pop_sur  = (100.0 * double(global.pick.pop .success) / global.pick.pop .attempt);
+
 		std::cout << "Push Pick %   : " << push_sur << "(" << global.pick.push.success << " / " << global.pick.push.attempt << ")\n";
 		std::cout << "Pop  Pick %   : " << pop_sur  << "(" << global.pick.pop .success << " / " << global.pick.pop .attempt << ")\n";
+		std::cout << "Pop mask trys : " << global.pick.pop.mask_attempt << std::endl;
+
+		double avgQ_push = double(global.qstat.push.value) / global.qstat.push.count;
+		double avgQ_pop  = double(global.qstat.pop .value) / global.qstat.pop .count;
+		double avgQ      = double(global.qstat.push.value + global.qstat.pop .value) / (global.qstat.push.count + global.qstat.pop .count);
+		std::cout << "Push   Avg Qs : " << avgQ_push << " (" << global.qstat.push.count << "ops)\n";
+		std::cout << "Pop    Avg Qs : " << avgQ_pop  << " (" << global.qstat.pop .count << "ops)\n";
+		std::cout << "Global Avg Qs : " << avgQ      << " (" << (global.qstat.push.count + global.qstat.pop .count) << "ops)\n";
 	#endif
 }
 
-void usage(char * argv[]) {
-	std::cerr << argv[0] << ": [DURATION (FLOAT:SEC)] [NTHREADS] [NQUEUES] [FILL]" << std::endl;;
-	std::exit(1);
+// ================================================================================================
+//                        EXPERIMENTS
+// ================================================================================================
+
+// ================================================================================================
+__attribute__((noinline)) void runChurn_body(
+	std::atomic<bool>& done,
+	Random & rand,
+	Node * my_nodes[],
+	unsigned nslots,
+	local_stat_t & local,
+	relaxed_list<Node> & list
+) {
+	while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) {
+		int idx = rand.next() % nslots;
+		if (auto node = my_nodes[idx]) {
+			local.crc_in += node->value;
+			list.push(node);
+			my_nodes[idx] = nullptr;
+			local.in++;
+		}
+		else if(auto node = list.pop()) {
+			local.crc_out += node->value;
+			my_nodes[idx] = node;
+			local.out++;
+		}
+		else {
+			local.empty++;
+		}
+	}
+}
+
+void runChurn(unsigned nthread, unsigned nqueues, double duration, unsigned nnodes, const unsigned nslots) {
+	std::cout << "Churn Benchmark" << std::endl;
+	assert(nnodes <= nslots);
+	// List being tested
+	relaxed_list<Node> list = { nthread * nqueues };
+
+	// Barrier for synchronization
+	barrier_t barrier(nthread + 1);
+
+	// Data to check everything is OK
+	global_stat_t global;
+
+	// Flag to signal termination
+	std::atomic_bool done  = { false };
+
+	// Prep nodes
+	std::cout << "Initializing ";
+	size_t npushed = 0;
+
+	Node** all_nodes[nthread];
+	for(auto & nodes : all_nodes) {
+		nodes = new __attribute__((aligned(64))) Node*[nslots + 8];
+		Random rand(rdtscl());
+		for(unsigned i = 0; i < nnodes; i++) {
+			nodes[i] = new Node(rand.next() % 100);
+		}
+
+		for(unsigned i = nnodes; i < nslots; i++) {
+			nodes[i] = nullptr;
+		}
+
+		for(int i = 0; i < 10 && i < (int)nslots; i++) {
+			int idx = rand.next() % nslots;
+			if (auto node = nodes[idx]) {
+				global.crc_in += node->value;
+				list.push(node);
+				npushed++;
+				nodes[idx] = nullptr;
+			}
+		}
+	}
+
+	std::cout << nnodes << " nodes (" << nslots << " slots)" << std::endl;
+
+	enable_stats = true;
+
+	std::thread * threads[nthread];
+	unsigned i = 1;
+	for(auto & t : threads) {
+		auto & my_nodes = all_nodes[i - 1];
+		t = new std::thread([&done, &list, &barrier, &global, &my_nodes, nslots](unsigned tid) {
+			Random rand(tid + rdtscl());
+
+			local_stat_t local;
+
+			// affinity(tid);
+
+			barrier.wait(tid);
+
+			// EXPERIMENT START
+
+			runChurn_body(done, rand, my_nodes, nslots, local, list);
+
+			// EXPERIMENT END
+
+			barrier.wait(tid);
+
+			tally_stats(global, local);
+
+			for(unsigned i = 0; i < nslots; i++) {
+				delete my_nodes[i];
+			}
+		}, i++);
+	}
+
+	waitfor(duration, barrier, done);
+
+	for(auto t : threads) {
+		t->join();
+		delete t;
+	}
+
+	enable_stats = false;
+
+	while(auto node = list.pop()) {
+		global.crc_out += node->value;
+		delete node;
+	}
+
+	for(auto nodes : all_nodes) {
+		delete[] nodes;
+	}
+
+	print_stats(duration, nthread, global);
+}
+
+// ================================================================================================
+__attribute__((noinline)) void runPingPong_body(
+	std::atomic<bool>& done,
+	Node initial_nodes[],
+	unsigned nnodes,
+	local_stat_t & local,
+	relaxed_list<Node> & list
+) {
+	Node * nodes[nnodes];
+	{
+		unsigned i = 0;
+		for(auto & n : nodes) {
+			n = &initial_nodes[i++];
+		}
+	}
+
+	while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) {
+
+		for(Node * & node : nodes) {
+			local.crc_in += node->value;
+			list.push(node);
+			local.in++;
+		}
+
+		// -----
+
+		for(Node * & node : nodes) {
+			node = list.pop();
+			assert(node);
+			local.crc_out += node->value;
+			local.out++;
+		}
+	}
+}
+
+void runPingPong(unsigned nthread, unsigned nqueues, double duration, unsigned nnodes) {
+	std::cout << "PingPong Benchmark" << std::endl;
+
+	// List being tested
+	relaxed_list<Node> list = { nthread * nqueues };
+
+	// Barrier for synchronization
+	barrier_t barrier(nthread + 1);
+
+	// Data to check everything is OK
+	global_stat_t global;
+
+	// Flag to signal termination
+	std::atomic_bool done  = { false };
+
+	std::cout << "Initializing ";
+	enable_stats = true;
+
+	std::thread * threads[nthread];
+	unsigned i = 1;
+	for(auto & t : threads) {
+		t = new std::thread([&done, &list, &barrier, &global, nnodes](unsigned tid) {
+			Random rand(tid + rdtscl());
+
+			Node nodes[nnodes];
+			for(auto & n : nodes) {
+				n.value = (int)rand.next() % 100;
+			}
+
+			local_stat_t local;
+
+			// affinity(tid);
+
+			barrier.wait(tid);
+
+			// EXPERIMENT START
+
+			runPingPong_body(done, nodes, nnodes, local, list);
+
+			// EXPERIMENT END
+
+			barrier.wait(tid);
+
+			tally_stats(global, local);
+		}, i++);
+	}
+
+	waitfor(duration, barrier, done);
+
+	for(auto t : threads) {
+		t->join();
+		delete t;
+	}
+
+	enable_stats = false;
+
+	print_stats(duration, nthread, global);
+}
+
+bool iequals(const std::string& a, const std::string& b)
+{
+    return std::equal(a.begin(), a.end(),
+                      b.begin(), b.end(),
+                      [](char a, char b) {
+                          return std::tolower(a) == std::tolower(b);
+                      });
 }
 
@@ -241,47 +392,173 @@
 	double duration   = 5.0;
 	unsigned nthreads = 2;
-	unsigned nqueues  = 2;
-	unsigned fill     = 100;
+	unsigned nqueues  = 4;
+	unsigned nnodes   = 100;
+	unsigned nslots   = 100;
+
+	enum {
+		Churn,
+		PingPong,
+		NONE
+	} benchmark = NONE;
 
 	std::cout.imbue(std::locale(""));
 
-	switch (argc)
-	{
-	case 5:
-		fill = std::stoul(argv[4]);
-		[[fallthrough]];
-	case 4:
-		nqueues = std::stoul(argv[3]);
-		[[fallthrough]];
-	case 3:
-		nthreads = std::stoul(argv[2]);
-		[[fallthrough]];
-	case 2:
-		duration = std::stod(argv[1]);
-		if( duration <= 0.0 ) {
-			std::cerr << "Duration must be positive, was " << argv[1] << "(" << duration << ")" << std::endl;
-			usage(argv);
-		}
-		[[fallthrough]];
-	case 1:
-		break;
-	default:
-		usage(argv);
-		break;
-	}
+	for(;;) {
+		static struct option options[] = {
+			{"duration",  required_argument, 0, 'd'},
+			{"nthreads",  required_argument, 0, 't'},
+			{"nqueues",   required_argument, 0, 'q'},
+			{"benchmark", required_argument, 0, 'b'},
+			{0, 0, 0, 0}
+		};
+
+		int idx = 0;
+		int opt = getopt_long(argc, argv, "d:t:q:b:", options, &idx);
+
+		std::string arg = optarg ? optarg : "";
+		size_t len = 0;
+		switch(opt) {
+			// Exit Case
+			case -1:
+				/* paranoid */ assert(optind <= argc);
+				switch(benchmark) {
+				case NONE:
+					std::cerr << "Must specify a benchmark" << std::endl;
+					goto usage;
+				case PingPong:
+					nnodes = 1;
+					nslots = 1;
+					switch(argc - optind) {
+					case 0: break;
+					case 1:
+						try {
+							arg = optarg = argv[optind];
+							nnodes = stoul(optarg, &len);
+							if(len != arg.size()) { throw std::invalid_argument(""); }
+						} catch(std::invalid_argument &) {
+							std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl;
+							goto usage;
+						}
+						break;
+					default:
+						std::cerr << "'PingPong' benchmark doesn't accept more than 2 extra arguments" << std::endl;
+						goto usage;
+					}
+					break;
+				case Churn:
+					nnodes = 100;
+					nslots = 100;
+					switch(argc - optind) {
+					case 0: break;
+					case 1:
+						try {
+							arg = optarg = argv[optind];
+							nnodes = stoul(optarg, &len);
+							if(len != arg.size()) { throw std::invalid_argument(""); }
+							nslots = nnodes;
+						} catch(std::invalid_argument &) {
+							std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl;
+							goto usage;
+						}
+						break;
+					case 2:
+						try {
+							arg = optarg = argv[optind];
+							nnodes = stoul(optarg, &len);
+							if(len != arg.size()) { throw std::invalid_argument(""); }
+						} catch(std::invalid_argument &) {
+							std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl;
+							goto usage;
+						}
+						try {
+							arg = optarg = argv[optind + 1];
+							nslots = stoul(optarg, &len);
+							if(len != arg.size()) { throw std::invalid_argument(""); }
+						} catch(std::invalid_argument &) {
+							std::cerr << "Number of slots must be a positive integer, was " << arg << std::endl;
+							goto usage;
+						}
+						break;
+					default:
+						std::cerr << "'Churn' benchmark doesn't accept more than 2 extra arguments" << std::endl;
+						goto usage;
+					}
+					break;
+				}
+				goto run;
+			// Benchmarks
+			case 'b':
+				if(benchmark != NONE) {
+					std::cerr << "Only when benchmark can be run" << std::endl;
+					goto usage;
+				}
+				if(iequals(arg, "churn")) {
+					benchmark = Churn;
+					break;
+				}
+				if(iequals(arg, "pingpong")) {
+					benchmark = PingPong;
+					break;
+				}
+				std::cerr << "Unkown benchmark " << arg << std::endl;
+				goto usage;
+			// Numeric Arguments
+			case 'd':
+				try {
+					duration = stod(optarg, &len);
+					if(len != arg.size()) { throw std::invalid_argument(""); }
+				} catch(std::invalid_argument &) {
+					std::cerr << "Duration must be a valid double, was " << arg << std::endl;
+					goto usage;
+				}
+				break;
+			case 't':
+				try {
+					nthreads = stoul(optarg, &len);
+					if(len != arg.size()) { throw std::invalid_argument(""); }
+				} catch(std::invalid_argument &) {
+					std::cerr << "Number of threads must be a positive integer, was " << arg << std::endl;
+					goto usage;
+				}
+				break;
+			case 'q':
+				try {
+					nqueues = stoul(optarg, &len);
+					if(len != arg.size()) { throw std::invalid_argument(""); }
+				} catch(std::invalid_argument &) {
+					std::cerr << "Number of queues must be a positive integer, was " << arg << std::endl;
+					goto usage;
+				}
+				break;
+			// Other cases
+			default: /* ? */
+				std::cerr << opt << std::endl;
+			usage:
+				std::cerr << "Usage: " << argv[0] << ": [options] -b churn [NNODES] [NSLOTS = NNODES]" << std::endl;
+				std::cerr << "  or:  " << argv[0] << ": [options] -b pingpong [NNODES]" << std::endl;
+				std::cerr << std::endl;
+				std::cerr << "  -d, --duration=DURATION  Duration of the experiment, in seconds" << std::endl;
+				std::cerr << "  -t, --nthreads=NTHREADS  Number of kernel threads" << std::endl;
+				std::cerr << "  -q, --nqueues=NQUEUES    Number of queues per threads" << std::endl;
+				std::exit(1);
+		}
+	}
+	run:
 
 	check_cache_line_size();
 
 	std::cout << "Running " << nthreads << " threads (" << (nthreads * nqueues) << " queues) for " << duration << " seconds" << std::endl;
-	run(nthreads, nqueues, fill, duration);
-
+	switch(benchmark) {
+		case Churn:
+			runChurn(nthreads, nqueues, duration, nnodes, nslots);
+			break;
+		case PingPong:
+			runPingPong(nthreads, nqueues, duration, nnodes);
+			break;
+		default:
+			abort();
+	}
 	return 0;
 }
 
-template<>
-thread_local relaxed_list<Node>::TLS relaxed_list<Node>::tls = {};
-
-template<>
-relaxed_list<Node>::intrusive_queue_t::stat::Dif relaxed_list<Node>::intrusive_queue_t::stat::dif = {};
-
 const char * __my_progname = "Relaxed List";
Index: doc/theses/thierry_delisle_PhD/code/relaxed_list.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/relaxed_list.hpp	(revision b067d9b9c445bf3cee86e9d129e51949e669d4f5)
+++ doc/theses/thierry_delisle_PhD/code/relaxed_list.hpp	(revision 9421f3d8aa85bb55de754037f302d2e96c392242)
@@ -37,4 +37,35 @@
 };
 
+static inline bool bts(std::atomic_size_t & target, size_t bit ) {
+	//*
+	int result = 0;
+	asm volatile(
+		"LOCK btsq %[bit], %[target]\n\t"
+		:"=@ccc" (result)
+		: [target] "m" (target), [bit] "r" (bit)
+	);
+ 	return result != 0;
+	/*/
+	size_t mask = 1ul << bit;
+	size_t ret = target.fetch_or(mask, std::memory_order_relaxed);
+	return (ret & mask) != 0;
+	//*/
+}
+
+static inline bool btr(std::atomic_size_t & target, size_t bit ) {
+	//*
+	int result = 0;
+	asm volatile(
+		"LOCK btrq %[bit], %[target]\n\t"
+		:"=@ccc" (result)
+		: [target] "m" (target), [bit] "r" (bit)
+	);
+ 	return result != 0;
+	/*/
+	size_t mask = 1ul << bit;
+	size_t ret = target.fetch_and(~mask, std::memory_order_relaxed);
+	return (ret & mask) != 0;
+	//*/
+}
 
 extern bool enable_stats;
@@ -48,4 +79,16 @@
 		size_t attempt = 0;
 		size_t success = 0;
+		size_t mask_attempt = 0;
+	} pop;
+};
+
+struct empty_stat {
+	struct {
+		size_t value = 0;
+		size_t count = 0;
+	} push;
+	struct {
+		size_t value = 0;
+		size_t count = 0;
 	} pop;
 };
@@ -65,8 +108,10 @@
 public:
 	relaxed_list(unsigned numLists)
-	  	: numNonEmpty{0}
-		, lists(new intrusive_queue_t[numLists])
+	  	: lists(new intrusive_queue_t[numLists])
 		, numLists(numLists)
-	{}
+	{
+		assertf(7 * 8 * 8 >= numLists, "List currently only supports 448 sublists");
+		assert(sizeof(*this) == 128);
+	}
 
 	~relaxed_list() {
@@ -84,5 +129,5 @@
 		while(true) {
 			// Pick a random list
-			int i = tls.rng.next() % numLists;
+			unsigned i = tls.rng.next() % numLists;
 
 			#ifndef NO_STATS
@@ -93,6 +138,16 @@
 			if( !lists[i].lock.try_lock() ) continue;
 
+			__attribute__((unused)) int num = numNonEmpty;
+
 			// Actually push it
-			lists[i].push(node, numNonEmpty);
+			if(lists[i].push(node)) {
+				numNonEmpty++;
+				size_t qword = i >> 6ull;
+				size_t bit   = i & 63ull;
+				assertf((list_mask[qword] & (1ul << bit)) == 0, "Before set %zu:%zu (%u), %zx & %zx", qword, bit, i, list_mask[qword].load(), (1ul << bit));
+				__attribute__((unused)) bool ret = bts(list_mask[qword], bit);
+				assert(!ret);
+				assertf((list_mask[qword] & (1ul << bit)) != 0, "After set %zu:%zu (%u), %zx & %zx", qword, bit, i, list_mask[qword].load(), (1ul << bit));
+			}
 			assert(numNonEmpty <= (int)numLists);
 
@@ -102,4 +157,6 @@
 			#ifndef NO_STATS
 				tls.pick.push.success++;
+				tls.empty.push.value += num;
+				tls.empty.push.count += 1;
 			#endif
 			return;
@@ -108,47 +165,122 @@
 
 	__attribute__((noinline, hot)) node_t * pop() {
-		while(numNonEmpty != 0) {
-			// Pick two lists at random
-			int i = tls.rng.next() % numLists;
-			int j = tls.rng.next() % numLists;
-
-			#ifndef NO_STATS
-				tls.pick.pop.attempt++;
-			#endif
-
-			// Pick the bet list
-			int w = i;
-			if( __builtin_expect(lists[j].ts() != 0, true) ) {
-				w = (lists[i].ts() < lists[j].ts()) ? i : j;
-			}
-
-			auto & list = lists[w];
-			// If list looks empty retry
-			if( list.ts() == 0 ) continue;
-
-			// If we can't get the lock retry
-			if( !list.lock.try_lock() ) continue;
-
-			// If list is empty, unlock and retry
-			if( list.ts() == 0 ) {
-				list.lock.unlock();
-				continue;
-			}
-
-			// Actually pop the list
-			auto node = list.pop(numNonEmpty);
-			assert(node);
-
-			// Unlock and return
-			list.lock.unlock();
-			assert(numNonEmpty >= 0);
-			#ifndef NO_STATS
-				tls.pick.pop.success++;
-			#endif
-			return node;
-		}
+		#if !defined(NO_BITMASK)
+			// for(int r = 0; r < 10 && numNonEmpty != 0; r++) {
+			// 	// Pick two lists at random
+			// 	unsigned i = tls.rng.next() % numLists;
+			// 	unsigned j = tls.rng.next() % numLists;
+
+			// 	if(auto node = try_pop(i, j)) return node;
+			// }
+			int nnempty;
+			while(0 != (nnempty = numNonEmpty)) {
+				unsigned i, j;
+				if( numLists < 4 || (numLists / nnempty) < 4 ) {
+					// Pick two lists at random
+					i = tls.rng.next() % numLists;
+					j = tls.rng.next() % numLists;
+				} else {
+					#ifndef NO_STATS
+						// tls.pick.push.mask_attempt++;
+					#endif
+
+					// Pick two lists at random
+					unsigned num = ((numLists - 1) >> 6) + 1;
+
+					unsigned ri = tls.rng.next();
+					unsigned rj = tls.rng.next();
+
+					unsigned wdxi = (ri >> 6u) % num;
+					unsigned wdxj = (rj >> 6u) % num;
+
+					size_t maski = list_mask[wdxi].load(std::memory_order_relaxed);
+					size_t maskj = list_mask[wdxj].load(std::memory_order_relaxed);
+
+					if(maski == 0 && maskj == 0) continue;
+
+					unsigned biti = maski ? ri % __builtin_popcountl(maski) : 0;
+					unsigned bitj = maskj ? rj % __builtin_popcountl(maskj) : 0;
+
+					unsigned bi = 64 - nthSetBit(maski, biti + 1);
+					unsigned bj = 64 - nthSetBit(maskj, bitj + 1);
+
+					assertf(bi < 64, "%zu %u %u", maski, biti, bi);
+					assertf(bj < 64, "%zu %u %u", maskj, bitj, bj);
+
+					i = bi | (wdxi << 6);
+					j = bj | (wdxj << 6);
+
+					assertf(i < numLists, "%u", wdxi << 6);
+					assertf(j < numLists, "%u", wdxj << 6);
+				}
+
+				if(auto node = try_pop(i, j)) return node;
+			}
+		#else
+			while(numNonEmpty != 0) {
+				// Pick two lists at random
+				int i = tls.rng.next() % numLists;
+				int j = tls.rng.next() % numLists;
+
+				if(auto node = try_pop(i, j)) return node;
+			}
+		#endif
 
 		return nullptr;
     	}
+
+private:
+	node_t * try_pop(unsigned i, unsigned j) {
+		#ifndef NO_STATS
+			tls.pick.pop.attempt++;
+		#endif
+
+		// Pick the bet list
+		int w = i;
+		if( __builtin_expect(lists[j].ts() != 0, true) ) {
+			w = (lists[i].ts() < lists[j].ts()) ? i : j;
+		}
+
+		auto & list = lists[w];
+		// If list looks empty retry
+		if( list.ts() == 0 ) return nullptr;
+
+		// If we can't get the lock retry
+		if( !list.lock.try_lock() ) return nullptr;
+
+		__attribute__((unused)) int num = numNonEmpty;
+
+		// If list is empty, unlock and retry
+		if( list.ts() == 0 ) {
+			list.lock.unlock();
+			return nullptr;
+		}
+
+		// Actually pop the list
+		node_t * node;
+		bool emptied;
+		std::tie(node, emptied) = list.pop();
+		assert(node);
+
+		if(emptied) {
+			numNonEmpty--;
+			size_t qword = w >> 6ull;
+			size_t bit   = w & 63ull;
+			assert((list_mask[qword] & (1ul << bit)) != 0);
+			__attribute__((unused)) bool ret = btr(list_mask[qword], bit);
+			assert(ret);
+			assert((list_mask[qword] & (1ul << bit)) == 0);
+		}
+
+		// Unlock and return
+		list.lock.unlock();
+		assert(numNonEmpty >= 0);
+		#ifndef NO_STATS
+			tls.pick.pop.success++;
+			tls.empty.pop.value += num;
+			tls.empty.pop.count += 1;
+		#endif
+		return node;
+	}
 
 private:
@@ -178,5 +310,7 @@
 		sentinel_t before;
 		sentinel_t after;
-		stat s;
+		#ifndef NO_STATS
+			stat s;
+		#endif
 
 		static constexpr auto fields_offset = offsetof( node_t, _links );
@@ -186,12 +320,12 @@
 			, after {{ head(), nullptr }}
 		{
-			assert((reinterpret_cast<uintptr_t>( head() ) + fields_offset) == reinterpret_cast<uintptr_t>(&before));
-			assert((reinterpret_cast<uintptr_t>( tail() ) + fields_offset) == reinterpret_cast<uintptr_t>(&after ));
-			assert(head()->_links.prev == nullptr);
-			assert(head()->_links.next == tail() );
-			assert(tail()->_links.next == nullptr);
-			assert(tail()->_links.prev == head() );
-			assert(sizeof(*this) == 128);
-			assert((intptr_t(this) % 128) == 0);
+			/* paranoid */ assert((reinterpret_cast<uintptr_t>( head() ) + fields_offset) == reinterpret_cast<uintptr_t>(&before));
+			/* paranoid */ assert((reinterpret_cast<uintptr_t>( tail() ) + fields_offset) == reinterpret_cast<uintptr_t>(&after ));
+			/* paranoid */ assert(head()->_links.prev == nullptr);
+			/* paranoid */ assert(head()->_links.next == tail() );
+			/* paranoid */ assert(tail()->_links.next == nullptr);
+			/* paranoid */ assert(tail()->_links.prev == head() );
+			/* paranoid */ assert(sizeof(*this) == 128);
+			/* paranoid */ assert((intptr_t(this) % 128) == 0);
 		}
 
@@ -220,5 +354,5 @@
 		}
 
-		inline void push(node_t * node, std::atomic_int & nonEmpty) {
+		inline bool push(node_t * node) {
 			assert(lock);
 			assert(node->_links.ts != 0);
@@ -232,14 +366,16 @@
 			prev->_links.next = node;
 			tail->_links.prev = node;
-			if(before._links.ts == 0l) {
-				nonEmpty += 1;
-				before._links.ts = node->_links.ts;
-			}
 			#ifndef NO_STATS
 				if(enable_stats) s.diff++;
 			#endif
-		}
-
-		inline node_t * pop(std::atomic_int & nonEmpty) {
+			if(before._links.ts == 0l) {
+				before._links.ts = node->_links.ts;
+				assert(node->_links.prev == this->head());
+				return true;
+			}
+			return false;
+		}
+
+		inline std::pair<node_t *, bool> pop() {
 			assert(lock);
 			node_t * head = this->head();
@@ -248,12 +384,15 @@
 			node_t * node = head->_links.next;
 			node_t * next = node->_links.next;
-			if(node == tail) return nullptr;
+			if(node == tail) return {nullptr, false};
 
 			head->_links.next = next;
 			next->_links.prev = head;
 
+			#ifndef NO_STATS
+				if(enable_stats) s.diff--;
+			#endif
 			if(next == tail) {
 				before._links.ts = 0l;
-				nonEmpty -= 1;
+				return {node, true};
 			}
 			else {
@@ -261,9 +400,6 @@
 				before._links.ts = next->_links.ts;
 				assert(before._links.ts != 0);
-			}
-			#ifndef NO_STATS
-				if(enable_stats) s.diff--;
-			#endif
-			return node;
+				return {node, false};
+			}
 		}
 
@@ -277,10 +413,13 @@
 
 	static __attribute__((aligned(128))) thread_local struct TLS {
-		Random    rng = { int(rdtscl()) };
-		pick_stat pick;
+		Random     rng = { int(rdtscl()) };
+		pick_stat  pick;
+		empty_stat empty;
 	} tls;
 
+public:
+	std::atomic_int numNonEmpty  = 0;  // number of non-empty lists
+	std::atomic_size_t list_mask[7] = { 0 }; // which queues are empty
 private:
-	std::atomic_int numNonEmpty; // number of non-empty lists
     	__attribute__((aligned(64))) std::unique_ptr<intrusive_queue_t []> lists;
 	const unsigned numLists;
Index: doc/theses/thierry_delisle_PhD/code/relaxed_list_layout.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/relaxed_list_layout.cpp	(revision 9421f3d8aa85bb55de754037f302d2e96c392242)
+++ doc/theses/thierry_delisle_PhD/code/relaxed_list_layout.cpp	(revision 9421f3d8aa85bb55de754037f302d2e96c392242)
@@ -0,0 +1,23 @@
+#define NO_IO
+#define NDEBUG
+#include "relaxed_list.hpp"
+
+struct __attribute__((aligned(64))) Node {
+	static std::atomic_size_t creates;
+	static std::atomic_size_t destroys;
+
+	_LinksFields_t<Node> _links;
+
+	int value;
+	Node(int value): value(value) {
+		creates++;
+	}
+
+	~Node() {
+		destroys++;
+	}
+};
+
+int main() {
+	return sizeof(relaxed_list<Node>) + relaxed_list<Node>::sizeof_queue;
+}
Index: doc/theses/thierry_delisle_PhD/code/scale.sh
===================================================================
--- doc/theses/thierry_delisle_PhD/code/scale.sh	(revision 9421f3d8aa85bb55de754037f302d2e96c392242)
+++ doc/theses/thierry_delisle_PhD/code/scale.sh	(revision 9421f3d8aa85bb55de754037f302d2e96c392242)
@@ -0,0 +1,7 @@
+#!/bin/bash
+taskset -c 24-31 ./a.out -t  1 -b churn | grep --color -E "(ns|Ops|Running)"
+taskset -c 24-31 ./a.out -t  2 -b churn | grep --color -E "(ns|Ops|Running)"
+taskset -c 24-31 ./a.out -t  4 -b churn | grep --color -E "(ns|Ops|Running)"
+taskset -c 24-31 ./a.out -t  8 -b churn | grep --color -E "(ns|Ops|Running)"
+taskset -c 16-31 ./a.out -t 16 -b churn | grep --color -E "(ns|Ops|Running)"
+taskset -c  0-31 ./a.out -t 32 -b churn | grep --color -E "(ns|Ops|Running)"
Index: doc/theses/thierry_delisle_PhD/code/utils.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/utils.hpp	(revision b067d9b9c445bf3cee86e9d129e51949e669d4f5)
+++ doc/theses/thierry_delisle_PhD/code/utils.hpp	(revision 9421f3d8aa85bb55de754037f302d2e96c392242)
@@ -10,4 +10,6 @@
 #include <unistd.h>
 #include <sys/sysinfo.h>
+
+#include <x86intrin.h>
 
 // Barrier from
@@ -103,2 +105,105 @@
 	return std::chrono::duration_cast<std::chrono::duration<T, Ratio>>(std::chrono::duration<T>(seconds)).count();
 }
+
+// from geeksforgeeks
+const constexpr unsigned num_to_bits[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
+
+/* Recursively get nibble of a given number
+and map them in the array */
+__attribute__((noinline)) unsigned countSetBits(size_t num) {
+	unsigned nibble = 0;
+	if (0 == num)
+		return num_to_bits[0];
+
+	// Find last nibble
+	nibble = num & 0xf;
+
+	// Use pre-stored values to find count
+	// in last nibble plus recursively add
+	// remaining nibbles.
+	return num_to_bits[nibble] + countSetBits(num >> 4);
+}
+
+__attribute__((noinline)) unsigned nthSetBit(size_t mask, unsigned bit) {
+	uint64_t v = mask;   // Input value to find position with rank r.
+	unsigned int r = bit;// Input: bit's desired rank [1-64].
+	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
+	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
+	unsigned int t;      // Bit count temporary.
+
+	// Do a normal parallel bit count for a 64-bit integer,
+	// but store all intermediate steps.
+	// a = (v & 0x5555...) + ((v >> 1) & 0x5555...);
+	a =  v - ((v >> 1) & ~0UL/3);
+	// b = (a & 0x3333...) + ((a >> 2) & 0x3333...);
+	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
+	// c = (b & 0x0f0f...) + ((b >> 4) & 0x0f0f...);
+	c = (b + (b >> 4)) & ~0UL/0x11;
+	// d = (c & 0x00ff...) + ((c >> 8) & 0x00ff...);
+	d = (c + (c >> 8)) & ~0UL/0x101;
+
+
+	t = (d >> 32) + (d >> 48);
+	// Now do branchless select!
+	s  = 64;
+	// if (r > t) {s -= 32; r -= t;}
+	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
+	t  = (d >> (s - 16)) & 0xff;
+	// if (r > t) {s -= 16; r -= t;}
+	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
+	t  = (c >> (s - 8)) & 0xf;
+	// if (r > t) {s -= 8; r -= t;}
+	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
+	t  = (b >> (s - 4)) & 0x7;
+	// if (r > t) {s -= 4; r -= t;}
+	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
+	t  = (a >> (s - 2)) & 0x3;
+	// if (r > t) {s -= 2; r -= t;}
+	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
+	t  = (v >> (s - 1)) & 0x1;
+	// if (r > t) s--;
+	s -= ((t - r) & 256) >> 8;
+	s = 65 - s;
+	return s;
+}
+
+// __attribute__((noinline)) unsigned int nth_bit_set(uint64_t value, unsigned int n)
+// {
+// 	uint64_t      mask = 0x00000000FFFFFFFFul;
+// 	unsigned int  size = 32u;
+// 	unsigned int  base = 0u;
+
+// 	if(value == 0) return 0;
+// 	assertf(n < (unsigned)__builtin_popcountl(value), "%u < %d (%zu)", n, __builtin_popcountl(value), value);
+// 	n++;
+// 	while (size > 0) {
+// 		const unsigned int  count = __builtin_popcountl(value & mask);
+// 		if (n > count) {
+// 			base += size;
+// 			size >>= 1;
+// 			mask |= mask << size;
+// 		} else {
+// 			size >>= 1;
+// 			mask >>= size;
+// 		}
+//     }
+
+//     return base;
+// }
+
+
+static inline uint64_t rotl64 (uint64_t n, unsigned int c) {
+	const unsigned int mask = (8*sizeof(n) - 1);  // assumes width is a power of 2.
+
+	c &= mask;
+	return (n<<c) | (n>>( (-c)&mask ));
+}
+
+static inline uint64_t rotr64 (uint64_t n, unsigned int c)
+{
+	const unsigned int mask = (8*sizeof(n) - 1);
+
+	// assert ( (c<=mask) &&"rotate by type width or more");
+	c &= mask;
+	return (n>>c) | (n<<( (-c)&mask ));
+}
