Index: doc/theses/thierry_delisle_PhD/code/assert.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/assert.hpp	(revision b2a37b045b22a2a00392f054434cf6298517bec0)
+++ doc/theses/thierry_delisle_PhD/code/assert.hpp	(revision 50aeb6ffc5d975fd3a5272488078f72cb24797e3)
@@ -1,4 +1,5 @@
 #pragma once
 
+#ifndef NDEBUG
 #include <cassert>
 #include <cstdlib>
@@ -17,2 +18,5 @@
 	}                                   \
 })
+#else
+#define assertf(cond, ...)
+#endif
Index: doc/theses/thierry_delisle_PhD/code/processor_list.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/processor_list.hpp	(revision b2a37b045b22a2a00392f054434cf6298517bec0)
+++ doc/theses/thierry_delisle_PhD/code/processor_list.hpp	(revision 50aeb6ffc5d975fd3a5272488078f72cb24797e3)
@@ -188,4 +188,26 @@
 	}
 
+	//-----------------------------------------------------------------------
+	// Checking support
+	uint_fast32_t epoch_check() {
+		// Step 1 : lock global lock
+		// It is needed to avoid processors that register mid Critical-Section
+		//   to simply lock their own lock and enter.
+		while(lock.load(std::memory_order_relaxed))
+			asm volatile("pause");
+
+		// Step 2 : lock per-proc lock
+		// Processors that are currently being registered aren't counted
+		//   but can't be in read_lock or in the critical section.
+		// All other processors are counted
+		uint_fast32_t s = ready;
+		for(uint_fast32_t i = 0; i < s; i++) {
+			while(data[i].lock.load(std::memory_order_relaxed))
+				asm volatile("pause");
+		}
+
+		return s;
+	}
+
 public:
 };
Index: doc/theses/thierry_delisle_PhD/code/processor_list_fast.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/processor_list_fast.cpp	(revision b2a37b045b22a2a00392f054434cf6298517bec0)
+++ doc/theses/thierry_delisle_PhD/code/processor_list_fast.cpp	(revision 50aeb6ffc5d975fd3a5272488078f72cb24797e3)
@@ -19,5 +19,5 @@
 	unsigned id;
 };
-void run(unsigned nthread, double duration, unsigned writes) {
+void run(unsigned nthread, double duration, unsigned writes, unsigned epochs) {
 	assert(writes < 100);
 
@@ -30,6 +30,9 @@
 	// Data to check everything is OK
 	size_t write_committed = 0ul;
-	std::atomic_size_t lock_cnt_write = { 0ul };
-	std::atomic_size_t lock_cnt_read  = { 0ul };
+	struct {
+		std::atomic_size_t write = { 0ul };
+		std::atomic_size_t read  = { 0ul };
+		std::atomic_size_t epoch = { 0ul };
+	} lock_cnt;
 
 	// Flag to signal termination
@@ -39,5 +42,5 @@
 	unsigned i = 1;
 	for(auto & t : threads) {
-		t = new std::thread([&done, &list, &barrier, &write_committed, &lock_cnt_write, &lock_cnt_read, writes](unsigned tid) {
+		t = new std::thread([&done, &list, &barrier, &write_committed, &lock_cnt, writes, epochs](unsigned tid) {
 			Random rand(tid + rdtscl());
 			processor proc;
@@ -45,4 +48,5 @@
 			size_t writes_cnt = 0;
 			size_t reads_cnt = 0;
+			size_t epoch_cnt = 0;
 
 			affinity(tid);
@@ -51,5 +55,6 @@
 
 			while(__builtin_expect(!done, true)) {
-				if ((rand.next() % 100) < writes) {
+				auto r = rand.next() % 100;
+				if (r < writes) {
 					auto n = list.write_lock();
 					write_committed++;
@@ -57,4 +62,8 @@
 					assert(writes_cnt < -2ul);
 					list.write_unlock(n);
+				}
+				else if(r < epochs) {
+					list.epoch_check();
+					epoch_cnt++;
 				}
 				else {
@@ -70,6 +79,7 @@
 			auto p = list.unregister(proc.id);
 			assert(&proc == p);
-			lock_cnt_write += writes_cnt;
-			lock_cnt_read  += reads_cnt;
+			lock_cnt.write += writes_cnt;
+			lock_cnt.read  += reads_cnt;
+			lock_cnt.epoch += epoch_cnt;
 		}, i++);
 	}
@@ -98,12 +108,13 @@
 	}
 
-	assert(write_committed == lock_cnt_write);
+	assert(write_committed == lock_cnt.write);
 
-	size_t ops_sec = size_t(double(lock_cnt_read + lock_cnt_write) / duration);
+	size_t totalop = lock_cnt.read + lock_cnt.write + lock_cnt.epoch;
+	size_t ops_sec = size_t(double(totalop) / duration);
 	size_t ops_thread = ops_sec / nthread;
 	double dur_nano = duration_cast<std::nano>(1.0);
 
 	std::cout << "Duration      : " << duration << "s\n";
-	std::cout << "Total ops     : " << (lock_cnt_read + lock_cnt_write) << "(" << lock_cnt_read << "r, " << lock_cnt_write << "w)\n";
+	std::cout << "Total ops     : " << totalop << "(" << lock_cnt.read << "r, " << lock_cnt.write << "w, " << lock_cnt.epoch << "e)\n";
 	std::cout << "Ops/sec       : " << ops_sec << "\n";
 	std::cout << "Ops/sec/thread: " << ops_thread << "\n";
@@ -121,4 +132,5 @@
 	unsigned nthreads = 2;
 	unsigned writes   = 0;
+	unsigned epochs   = 0;
 
 	std::cout.imbue(std::locale(""));
@@ -126,8 +138,11 @@
 	switch (argc)
 	{
+	case 5:
+		epochs = std::stoul(argv[4]);
+		[[fallthrough]];
 	case 4:
 		writes = std::stoul(argv[3]);
-		if( writes >= 100 ) {
-			std::cerr << "Writes must be valid percentage, was " << argv[3] << "(" << writes << ")" << std::endl;
+		if( (writes + epochs) > 100 ) {
+			std::cerr << "Writes + Epochs must be valid percentage, was " << writes << " + " << epochs << std::endl;
 			usage(argv);
 		}
@@ -152,6 +167,6 @@
 	check_cache_line_size();
 
-	std::cout << "Running " << nthreads << " threads for " << duration << " seconds with " << writes << "% writes" << std::endl;
-	run(nthreads, duration, writes);
+	std::cout << "Running " << nthreads << " threads for " << duration << " seconds with " << writes << "% writes and " << epochs << "% epochs" << std::endl;
+	run(nthreads, duration, writes, epochs + writes);
 
 	return 0;
Index: doc/theses/thierry_delisle_PhD/code/relaxed_list.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/relaxed_list.cpp	(revision b2a37b045b22a2a00392f054434cf6298517bec0)
+++ doc/theses/thierry_delisle_PhD/code/relaxed_list.cpp	(revision 50aeb6ffc5d975fd3a5272488078f72cb24797e3)
@@ -13,5 +13,5 @@
 #include "utils.hpp"
 
-struct Node {
+struct __attribute__((aligned(64))) Node {
 	static std::atomic_size_t creates;
 	static std::atomic_size_t destroys;
@@ -33,12 +33,48 @@
 
 static const constexpr int nodes_per_threads = 128;
+struct NodeArray {
+	__attribute__((aligned(64))) Node * array[nodes_per_threads];
+	__attribute__((aligned(64))) char pad;
+};
 
 bool enable_stats = false;
 
-__attribute__((aligned(64))) thread_local pick_stat local_pick;
-
-void run(unsigned nthread, double duration) {
+struct local_stat_t {
+	size_t in  = 0;
+	size_t out = 0;
+	size_t empty = 0;
+	size_t crc_in  = 0;
+	size_t crc_out = 0;
+};
+
+__attribute__((noinline)) void run_body(
+	std::atomic<bool>& done,
+	Random & rand,
+	Node * (&my_nodes)[128],
+	local_stat_t & local,
+	relaxed_list<Node> & list
+) {
+	while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) {
+		int idx = rand.next() % nodes_per_threads;
+		if (auto node = my_nodes[idx]) {
+			local.crc_in += node->value;
+			list.push(node);
+			my_nodes[idx] = nullptr;
+			local.in++;
+		}
+		else if(auto node = list.pop()) {
+			local.crc_out += node->value;
+			my_nodes[idx] = node;
+			local.out++;
+		}
+		else {
+			local.empty++;
+		}
+	}
+}
+
+void run(unsigned nthread, unsigned nqueues, unsigned fill, double duration) {
 	// List being tested
-	relaxed_list<Node> list = { nthread * 2 };
+	relaxed_list<Node> list = { nthread * nqueues };
 
 	// Barrier for synchronization
@@ -52,6 +88,14 @@
 		std::atomic_size_t crc_in  = { 0 };
 		std::atomic_size_t crc_out = { 0 };
-		std::atomic_size_t pick_at = { 0 };
-		std::atomic_size_t pick_su = { 0 };
+		struct {
+			struct {
+				std::atomic_size_t attempt = { 0 };
+				std::atomic_size_t success = { 0 };
+			} push;
+			struct {
+				std::atomic_size_t attempt = { 0 };
+				std::atomic_size_t success = { 0 };
+			} pop;
+		} pick;
 	} global;
 
@@ -61,18 +105,19 @@
 	// Prep nodes
 	std::cout << "Initializing" << std::endl;
-	std::vector<Node *> all_nodes[nthread];
+	NodeArray all_nodes[nthread];
 	for(auto & nodes : all_nodes) {
 		Random rand(rdtscl());
-		nodes.resize(nodes_per_threads);
-		for(auto & node : nodes) {
-			node = new Node(rand.next() % 100);
+		for(auto & node : nodes.array) {
+			auto r = rand.next() % 100;
+			if(r < fill)
+				node = new Node(rand.next() % 100);
 		}
 
 		for(int i = 0; i < 10; i++) {
 			int idx = rand.next() % nodes_per_threads;
-			if (auto node = nodes[idx]) {
+			if (auto node = nodes.array[idx]) {
 				global.crc_in += node->value;
 				list.push(node);
-				nodes[idx] = nullptr;
+				nodes.array[idx] = nullptr;
 			}
 		}
@@ -84,15 +129,11 @@
 	unsigned i = 1;
 	for(auto & t : threads) {
-		auto & my_nodes = all_nodes[i - 1];
+		auto & my_nodes = all_nodes[i - 1].array;
 		t = new std::thread([&done, &list, &barrier, &global, &my_nodes](unsigned tid) {
 			Random rand(tid + rdtscl());
 
-			size_t local_in  = 0;
-			size_t local_out = 0;
-			size_t local_empty = 0;
-			size_t local_crc_in  = 0;
-			size_t local_crc_out = 0;
-
-			affinity(tid);
+			local_stat_t local;
+
+			// affinity(tid);
 
 			barrier.wait(tid);
@@ -100,21 +141,5 @@
 			// EXPERIMENT START
 
-			while(__builtin_expect(!done, true)) {
-				int idx = rand.next() % nodes_per_threads;
-				if (auto node = my_nodes[idx]) {
-					local_crc_in += node->value;
-					list.push(node);
-					my_nodes[idx] = nullptr;
-					local_in++;
-				}
-				else if(auto node = list.pop2()) {
-					local_crc_out += node->value;
-					my_nodes[idx] = node;
-					local_out++;
-				}
-				else {
-					local_empty++;
-				}
-			}
+			run_body(done, rand, my_nodes, local, list);
 
 			// EXPERIMENT END
@@ -122,7 +147,7 @@
 			barrier.wait(tid);
 
-			global.in    += local_in;
-			global.out   += local_out;
-			global.empty += local_empty;
+			global.in    += local.in;
+			global.out   += local.out;
+			global.empty += local.empty;
 
 			for(auto node : my_nodes) {
@@ -130,9 +155,11 @@
 			}
 
-			global.crc_in  += local_crc_in;
-			global.crc_out += local_crc_out;
-
-			global.pick_at += local_pick.attempt;
-			global.pick_su += local_pick.success;
+			global.crc_in  += local.crc_in;
+			global.crc_out += local.crc_out;
+
+			global.pick.push.attempt += relaxed_list<Node>::tls.pick.push.attempt;
+			global.pick.push.success += relaxed_list<Node>::tls.pick.push.success;
+			global.pick.pop .attempt += relaxed_list<Node>::tls.pick.pop.attempt;
+			global.pick.pop .success += relaxed_list<Node>::tls.pick.pop.success;
 		}, i++);
 	}
@@ -143,5 +170,5 @@
 
 	while(true) {
-		usleep(1000);
+		usleep(100000);
 		auto now = Clock::now();
 		duration_t durr = now - before;
@@ -150,4 +177,6 @@
 			break;
 		}
+		std::cout << "\r" << durr.count();
+		std::cout.flush();
 	}
 
@@ -156,5 +185,5 @@
 	duration_t durr = after - before;
 	duration = durr.count();
-	std::cout << "Closing down" << std::endl;
+	std::cout << "\nClosing down" << std::endl;
 
 	for(auto t : threads) {
@@ -181,13 +210,18 @@
 
 	std::cout << "Duration      : " << duration << "s\n";
+	std::cout << "ns/Op         : " << ( dur_nano / ops_thread )<< "\n";
+	std::cout << "Ops/sec/thread: " << ops_thread << "\n";
+	std::cout << "Ops/sec       : " << ops_sec << "\n";
 	std::cout << "Total ops     : " << ops << "(" << global.in << "i, " << global.out << "o, " << global.empty << "e)\n";
-	std::cout << "Ops/sec       : " << ops_sec << "\n";
-	std::cout << "Ops/sec/thread: " << ops_thread << "\n";
-	std::cout << "ns/Op         : " << ( dur_nano / ops_thread )<< "\n";
-	std::cout << "Pick %        : " << (100.0 * double(global.pick_su) / global.pick_at) << "(" << global.pick_su << " / " << global.pick_at << ")\n";
+	#ifndef NO_STATS
+		double push_sur = (100.0 * double(global.pick.push.success) / global.pick.push.attempt);
+		double pop_sur  = (100.0 * double(global.pick.pop .success) / global.pick.pop .attempt);
+		std::cout << "Push Pick %   : " << push_sur << "(" << global.pick.push.success << " / " << global.pick.push.attempt << ")\n";
+		std::cout << "Pop  Pick %   : " << pop_sur  << "(" << global.pick.pop .success << " / " << global.pick.pop .attempt << ")\n";
+	#endif
 }
 
 void usage(char * argv[]) {
-	std::cerr << argv[0] << ": [DURATION (FLOAT:SEC)] [NTHREADS]" << std::endl;;
+	std::cerr << argv[0] << ": [DURATION (FLOAT:SEC)] [NTHREADS] [NQUEUES] [FILL]" << std::endl;;
 	std::exit(1);
 }
@@ -197,4 +231,6 @@
 	double duration   = 5.0;
 	unsigned nthreads = 2;
+	unsigned nqueues  = 2;
+	unsigned fill     = 100;
 
 	std::cout.imbue(std::locale(""));
@@ -202,4 +238,10 @@
 	switch (argc)
 	{
+	case 5:
+		fill = std::stoul(argv[4]);
+		[[fallthrough]];
+	case 4:
+		nqueues = std::stoul(argv[3]);
+		[[fallthrough]];
 	case 3:
 		nthreads = std::stoul(argv[2]);
@@ -221,6 +263,6 @@
 	check_cache_line_size();
 
-	std::cout << "Running " << nthreads << " threads for " << duration << " seconds" << std::endl;
-	run(nthreads, duration);
+	std::cout << "Running " << nthreads << " threads (" << (nthreads * nqueues) << " queues) for " << duration << " seconds" << std::endl;
+	run(nthreads, nqueues, fill, duration);
 
 	return 0;
@@ -228,3 +270,8 @@
 
 template<>
-thread_local Random relaxed_list<Node>::rng_g = { int(rdtscl()) };
+thread_local relaxed_list<Node>::TLS relaxed_list<Node>::tls = {};
+
+template<>
+relaxed_list<Node>::intrusive_queue_t::stat::Dif relaxed_list<Node>::intrusive_queue_t::stat::dif = {};
+
+const char * __my_progname = "Relaxed List";
Index: doc/theses/thierry_delisle_PhD/code/relaxed_list.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/relaxed_list.hpp	(revision b2a37b045b22a2a00392f054434cf6298517bec0)
+++ doc/theses/thierry_delisle_PhD/code/relaxed_list.hpp	(revision 50aeb6ffc5d975fd3a5272488078f72cb24797e3)
@@ -1,5 +1,8 @@
 #pragma once
 
+#ifndef NO_STATS
 #include <iostream>
+#endif
+
 #include <memory>
 #include <mutex>
@@ -11,13 +14,40 @@
 using namespace std;
 
+struct spinlock_t {
+	std::atomic_bool ll = { false };
+
+	inline void lock() {
+		while( __builtin_expect(ll.exchange(true),false) ) {
+			while(ll.load(std::memory_order_relaxed))
+				asm volatile("pause");
+		}
+	}
+
+	inline bool try_lock() {
+		return false == ll.exchange(true);
+	}
+
+	inline void unlock() {
+		ll.store(false, std::memory_order_release);
+	}
+
+	inline explicit operator bool() {
+		return ll.load(std::memory_order_relaxed);
+	}
+};
+
+
 extern bool enable_stats;
 
-
 struct pick_stat {
-	size_t attempt = 0;
-	size_t success = 0;
-};
-
-extern __attribute__((aligned(64))) thread_local pick_stat local_pick;
+	struct {
+		size_t attempt = 0;
+		size_t success = 0;
+	} push;
+	struct {
+		size_t attempt = 0;
+		size_t success = 0;
+	} pop;
+};
 
 template<typename node_t>
@@ -28,21 +58,6 @@
 };
 
-struct spinlock_t {
-	std::atomic_bool ll = { false };
-
-	inline void lock() {
-		while( __builtin_expect(ll.exchange(true),false) ) {
-			while(ll.load(std::memory_order_relaxed))
-				asm volatile("pause");
-		}
-	}
-
-	inline void unlock() {
-		ll.store(false, std::memory_order_release);
-	}
-};
-
 template<typename node_t>
-class relaxed_list {
+class __attribute__((aligned(128))) relaxed_list {
 	static_assert(std::is_same<decltype(node_t::_links), _LinksFields_t<node_t>>::value, "Node must have a links field");
 
@@ -50,46 +65,108 @@
 public:
 	relaxed_list(unsigned numLists)
-		: numLists(numLists)
+	  	: numNonEmpty{0}
 		, lists(new intrusive_queue_t[numLists])
-	  	, numNonEmpty(0)
+		, numLists(numLists)
 	{}
 
-    	void push(node_t * node) {
-		int i = rng_g.next() % numLists;
-		lists[i].push(node, numNonEmpty);
+	~relaxed_list() {
+		lists.reset();
+		#ifndef NO_STATS
+			std::cout << "Difference   : "
+				<< size_t(double(intrusive_queue_t::stat::dif.value) / intrusive_queue_t::stat::dif.num  ) << " avg\t"
+				<< intrusive_queue_t::stat::dif.max << "max" << std::endl;
+		#endif
+	}
+
+    	__attribute__((noinline, hot)) void push(node_t * node) {
+		node->_links.ts = rdtscl();
+
+		while(true) {
+			// Pick a random list
+			int i = tls.rng.next() % numLists;
+
+			#ifndef NO_STATS
+				tls.pick.push.attempt++;
+			#endif
+
+			// If we can't lock it retry
+			if( !lists[i].lock.try_lock() ) continue;
+
+			// Actually push it
+			lists[i].push(node, numNonEmpty);
+			assert(numNonEmpty <= (int)numLists);
+
+			// Unlock and return
+			lists[i].lock.unlock();
+
+			#ifndef NO_STATS
+				tls.pick.push.success++;
+			#endif
+			return;
+		}
     	}
 
-	node_t * pop() {
-		int i = pickRandomly(-1);
-		int j = pickRandomly(i);
-
-		if(i == -1) {
-			return nullptr;
-		}
-
-		auto guard = lock(i, j);
-		auto & list = best(i, j);
-		return list.pop(numNonEmpty);
+	__attribute__((noinline, hot)) node_t * pop() {
+		while(numNonEmpty != 0) {
+			// Pick two lists at random
+			int i = tls.rng.next() % numLists;
+			int j = tls.rng.next() % numLists;
+
+			#ifndef NO_STATS
+				tls.pick.pop.attempt++;
+			#endif
+
+			// Pick the best list
+			int w = i;
+			if( __builtin_expect(lists[j].ts() != 0, true) ) {
+				w = (lists[i].ts() < lists[j].ts()) ? i : j;
+			}
+
+			auto & list = lists[w];
+			// If list looks empty retry
+			if( list.ts() == 0 ) continue;
+
+			// If we can't get the lock retry
+			if( !list.lock.try_lock() ) continue;
+
+			// If list is empty, unlock and retry
+			if( list.ts() == 0 ) {
+				list.lock.unlock();
+				continue;
+			}
+
+			// Actually pop the list
+			auto node = list.pop(numNonEmpty);
+			assert(node);
+
+			// Unlock and return
+			list.lock.unlock();
+			assert(numNonEmpty >= 0);
+			#ifndef NO_STATS
+				tls.pick.pop.success++;
+			#endif
+			return node;
+		}
+
+		return nullptr;
     	}
 
-	node_t * pop2() {
-		int i = pickRandomly(-1);
-		int j = pickRandomly(i);
-
-		if(i == -1) {
-			return nullptr;
-		}
-
-		auto & list = best2(i, j);
-		return list.pop2(numNonEmpty);
-    	}
-
 private:
 
-	class intrusive_queue_t {
+	class __attribute__((aligned(128))) intrusive_queue_t {
 	public:
 		typedef spinlock_t lock_t;
 
 		friend class relaxed_list<node_t>;
+
+		struct stat {
+			ssize_t diff = 0;
+
+			static struct Dif {
+				ssize_t value = 0;
+				size_t  num   = 0;
+				ssize_t max   = 0;
+			} dif;
+		};
 
 	private:
@@ -98,20 +175,13 @@
 		};
 
-		struct stat {
-			size_t push = 0;
-			size_t pop  = 0;
-		};
-
-		__attribute__((aligned(64))) lock_t lock;
-		__attribute__((aligned(64))) bool empty;
-		stat s;
+		lock_t lock;
 		sentinel_t before;
 		sentinel_t after;
+		stat s;
 
 		static constexpr auto fields_offset = offsetof( node_t, _links );
 	public:
 		intrusive_queue_t()
-			: empty(true)
-			, before{{ nullptr, tail() }}
+			: before{{ nullptr, tail() }}
 			, after {{ head(), nullptr }}
 		{
@@ -122,42 +192,55 @@
 			assert(tail()->_links.next == nullptr);
 			assert(tail()->_links.prev == head() );
+			assert(sizeof(*this) == 128);
+			assert((intptr_t(this) % 128) == 0);
 		}
 
 		~intrusive_queue_t() {
-			std::cout << " Push: " << s.push << "\tPop: " << s.pop << "\t(this: " << this << ")" << std::endl;
-		}
-
-		node_t * head() const {
-			return reinterpret_cast<node_t *>(
+			#ifndef NO_STATS
+				stat::dif.value+= s.diff;
+				stat::dif.num  ++;
+				stat::dif.max  = std::max(stat::dif.max, s.diff);
+			#endif
+		}
+
+		inline node_t * head() const {
+			node_t * rhead = reinterpret_cast<node_t *>(
 				reinterpret_cast<uintptr_t>( &before ) - fields_offset
 			);
-		}
-
-		node_t * tail() const {
-			return reinterpret_cast<node_t *>(
+			assert(rhead);
+			return rhead;
+		}
+
+		inline node_t * tail() const {
+			node_t * rtail = reinterpret_cast<node_t *>(
 				reinterpret_cast<uintptr_t>( &after ) - fields_offset
 			);
-		}
-
-		void push(node_t * node, volatile int & nonEmpty) {
+			assert(rtail);
+			return rtail;
+		}
+
+		inline void push(node_t * node, std::atomic_int & nonEmpty) {
+			assert(lock);
+			assert(node->_links.ts != 0);
 			node_t * tail = this->tail();
-			std::lock_guard<lock_t> guard(lock);
-			node->_links.ts = rdtscl();
 
 			node_t * prev = tail->_links.prev;
 			// assertf(node->_links.ts >= prev->_links.ts,
-			// "New node has smaller timestamp: %llu < %llu", node->_links.ts, prev->_links.ts);
+			// 	"New node has smaller timestamp: %llu < %llu", node->_links.ts, prev->_links.ts);
 			node->_links.next = tail;
 			node->_links.prev = prev;
 			prev->_links.next = node;
 			tail->_links.prev = node;
-			if(empty) {
-				__atomic_fetch_add(&nonEmpty, 1, __ATOMIC_SEQ_CST);
-				empty = false;
-			}
-			if(enable_stats) s.push++;
-		}
-
-		node_t * pop(volatile int & nonEmpty) {
+			if(before._links.ts == 0l) {
+				nonEmpty += 1;
+				before._links.ts = node->_links.ts;
+			}
+			#ifndef NO_STATS
+				if(enable_stats) s.diff++;
+			#endif
+		}
+
+		inline node_t * pop(std::atomic_int & nonEmpty) {
+			assert(lock);
 			node_t * head = this->head();
 			node_t * tail = this->tail();
@@ -171,133 +254,37 @@
 
 			if(next == tail) {
-				empty = true;
-				__atomic_fetch_sub(&nonEmpty, 1, __ATOMIC_SEQ_CST);
-			}
-			if(enable_stats) s.pop++;
+				before._links.ts = 0l;
+				nonEmpty -= 1;
+			}
+			else {
+				assert(next->_links.ts != 0);
+				before._links.ts = next->_links.ts;
+				assert(before._links.ts != 0);
+			}
+			#ifndef NO_STATS
+				if(enable_stats) s.diff--;
+			#endif
 			return node;
 		}
 
-		node_t * pop2(volatile int & nonEmpty) {
-			node_t * head = this->head();
-			node_t * tail = this->tail();
-
-			std::lock_guard<lock_t> guard(lock);
-			node_t * node = head->_links.next;
-			node_t * next = node->_links.next;
-			if(node == tail) return nullptr;
-
-			head->_links.next = next;
-			next->_links.prev = head;
-
-			if(next == tail) {
-				empty = true;
-				__atomic_fetch_sub(&nonEmpty, 1, __ATOMIC_SEQ_CST);
-			}
-			if(enable_stats) s.pop++;
-			return node;
-		}
-
-		static intrusive_queue_t & best(intrusive_queue_t & lhs, intrusive_queue_t & rhs) {
-			bool lhs_empty = lhs.empty;
-			bool rhs_empty = rhs.empty;
-
-			if(lhs_empty && rhs_empty) return lhs;
-			if(!lhs_empty && rhs_empty) return lhs;
-			if(lhs_empty && !rhs_empty) return rhs;
-			node_t * lhs_head = lhs.head()->_links.next;
-			node_t * rhs_head = rhs.head()->_links.next;
-
-			assert(lhs_head != lhs.tail());
-			assert(rhs_head != rhs.tail());
-
-			if(lhs_head->_links.ts < lhs_head->_links.ts) {
-				return lhs;
-			} else {
-				return rhs;
-			}
-		}
-
-		static intrusive_queue_t & best2(intrusive_queue_t & lhs, intrusive_queue_t & rhs) {
-			node_t * lhs_head = lhs.head()->_links.next;
-			node_t * rhs_head = rhs.head()->_links.next;
-
-			bool lhs_empty = lhs_head != lhs.tail();
-			bool rhs_empty = rhs_head != rhs.tail();
-			if(lhs_empty && rhs_empty) return lhs;
-			if(!lhs_empty && rhs_empty) return lhs;
-			if(lhs_empty && !rhs_empty) return rhs;
-
-			if(lhs_head->_links.ts < lhs_head->_links.ts) {
-				return lhs;
-			} else {
-				return rhs;
-			}
+		long long ts() const {
+			return before._links.ts;
 		}
 	};
 
 
+public:
+
+	static __attribute__((aligned(128))) thread_local struct TLS {
+		Random    rng = { int(rdtscl()) };
+		pick_stat pick;
+	} tls;
+
 private:
-
-	static thread_local Random rng_g;
-    	__attribute__((aligned(64))) const unsigned numLists;
-	std::unique_ptr<intrusive_queue_t []> lists;
-	__attribute__((aligned(64))) volatile int numNonEmpty; // number of non-empty lists
-
-
-private:
-
-
-
-private:
-	int pickRandomly(const int avoid) {
-		int j;
-		do {
-			local_pick.attempt++;
-			j = rng_g.next() % numLists;
-			if (numNonEmpty < 1 + (avoid != -1)) return -1;
-		} while (j == avoid || lists[j].empty);
-		local_pick.success++;
-		return j;
-	}
-
-private:
-
-	struct queue_guard {
-		intrusive_queue_t * lists;
-		int i, j;
-
-		queue_guard(intrusive_queue_t * lists, int i, int j)
-			: lists(lists), i(i), j(j)
-		{
-			if(i >= 0) lists[i].lock.lock();
-			if(j >= 0) lists[j].lock.lock();
-		}
-
-		queue_guard(const queue_guard &) = delete;
-		queue_guard(queue_guard &&) = default;
-
-		~queue_guard() {
-			if(i >= 0) lists[i].lock.unlock();
-			if(j >= 0) lists[j].lock.unlock();
-		}
-	};
-
-	auto lock(int i, int j) {
-		assert(i >= 0);
-		assert(i != j);
-		if(j < i) return queue_guard(lists.get(), j, i);
-		return queue_guard(lists.get(), i, j);
-	}
-
-	intrusive_queue_t & best(int i, int j) {
-		assert(i != -1);
-		if(j == -1) return lists[i];
-		return intrusive_queue_t::best(lists[i], lists[j]);
-	}
-
-	intrusive_queue_t & best2(int i, int j) {
-		assert(i != -1);
-		if(j == -1) return lists[i];
-		return intrusive_queue_t::best2(lists[i], lists[j]);
-	}
-};
+	std::atomic_int numNonEmpty; // number of non-empty lists
+    	__attribute__((aligned(64))) std::unique_ptr<intrusive_queue_t []> lists;
+	const unsigned numLists;
+
+public:
+	static const constexpr size_t sizeof_queue = sizeof(intrusive_queue_t);
+};
