Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/dynamic_entropy.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/dynamic_entropy.hpp	(revision 47e000c054a8dc58293a93fbc643e10735fd30f9)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/dynamic_entropy.hpp	(revision 47e000c054a8dc58293a93fbc643e10735fd30f9)
@@ -0,0 +1,201 @@
+#pragma once
+
+#define LIST_VARIANT dyn_ent_list
+
+#include <memory>
+#include <mutex>
+#include <thread>
+
+#include "assert.hpp"
+#include "utils.hpp"
+#include "links.hpp"
+
+std::mutex mtx_;
+
+template<typename node_t>
+class __attribute__((aligned(128))) dyn_ent_list {
+	const unsigned numLists;
+    	__attribute__((aligned(64))) std::unique_ptr<intrusive_queue_t<node_t> []> lists;
+
+public:
+	dyn_ent_list(unsigned numThreads, unsigned)
+		: numLists(numThreads * 4)
+		, lists(new intrusive_queue_t<node_t>[numLists])
+	{
+		std::cout << "Constructing Relaxed List with " << numLists << std::endl;
+	}
+
+	__attribute__((noinline, hot)) void push(node_t * node) {
+		node->_links.ts = rdtscl();
+
+		// Try to pick a lane and lock it
+		unsigned i;
+		do {
+			// Pick the index of a lane
+			i = idx_from_r(tls.rng1.next(), tls.my_queue).first;
+			// i = ret.first; //local = ret.second;
+			tls.stats.push.attempt++;
+		} while( !lists[i].lock.try_lock() );
+
+		lists[i].push(node);
+		lists[i].lock.unlock();
+		tls.rng2.set_raw_state( tls.rng1.get_raw_state());
+		tls.stats.push.success++;
+	}
+
+	__attribute__((noinline, hot)) node_t * pop() {
+		for(int n = 0; n < 25; n++) {
+			// Pick two lists at random
+			unsigned i, j;
+			bool locali, localj;
+			auto reti = idx_from_r(tls.rng2.prev(), tls.my_queue);
+			auto retj = idx_from_r(tls.rng2.prev(), tls.my_queue);
+
+			i = reti.first; locali = reti.second;
+			j = retj.first; localj = retj.second;
+			tls.stats.pop.attempt++;
+
+			// try popping from the 2 picked lists
+			node_t * thrd = try_pop(i, j);
+			if(thrd) {
+				tls.stats.pop.success++;
+				return thrd;
+			}
+		}
+
+		unsigned offset = tls.rng2.next();
+		for(unsigned i = 0; i < numLists; i++) {
+			unsigned idx = (offset + i) % numLists;
+			node_t * thrd = try_pop(idx);
+			if(thrd) {
+				return thrd;
+			}
+		}
+
+		// All lanes where empty return 0p
+		return nullptr;
+	}
+
+private:
+	inline node_t * try_pop(unsigned i, unsigned j) {
+		// Pick the bet list
+		int w = i;
+		if( __builtin_expect(!(lists[j].ts() > 0), true) ) {
+			w = (lists[i].ts() < lists[j].ts()) ? i : j;
+		}
+
+		return try_pop(w);
+	}
+
+	inline node_t * try_pop(unsigned w) {
+		// If list looks empty retry
+		if( lists[w].ts() == 0 ) { tls.stats.pop.empty++; return nullptr; }
+
+		// If we can't get the lock retry
+		if( !lists[w].lock.try_lock() ) { tls.stats.pop.locked++; return nullptr; }
+
+
+		// If list is empty, unlock and retry
+		if( lists[w].ts() == 0 ) {
+			tls.stats.pop.both++;
+			lists[w].lock.unlock();
+			return nullptr;
+		}
+
+		auto node = lists[w].pop();
+		lists[w].lock.unlock();
+		return node.first;
+	}
+
+	inline std::pair<unsigned, bool> idx_from_r(unsigned r, unsigned preferred) {
+		unsigned i;
+		bool local;
+		unsigned rlow  = r % 4;
+		unsigned rhigh = r / 4;
+		if((0 != rlow) && preferred != outside) {
+			// (BIAS - 1) out of BIAS chances
+			// Use perferred queues
+			i = preferred + (rhigh % 4);
+			local = true;
+		}
+		else {
+			// 1 out of BIAS chances
+			// Use all queues
+			i = rhigh;
+			local = false;
+		}
+		return {i % numLists, local};
+	}
+private:
+	static std::atomic_uint32_t ticket;
+	static const unsigned outside = 0xFFFFFFFF;
+
+	static inline unsigned calc_preferred() {
+		unsigned t = ticket++;
+		if(t == 0) return outside;
+		unsigned i = 4 * (t - 1);
+		return i;
+	}
+
+	static __attribute__((aligned(128))) thread_local struct TLS {
+		Random     rng1 = { unsigned(std::hash<std::thread::id>{}(std::this_thread::get_id()) ^ rdtscl()) };
+		Random     rng2 = { unsigned(std::hash<std::thread::id>{}(std::this_thread::get_id()) ^ rdtscl()) };
+		Random     rng3 = { unsigned(std::hash<std::thread::id>{}(std::this_thread::get_id()) ^ rdtscl()) };
+		unsigned   my_queue = calc_preferred();
+		struct {
+			struct {
+				size_t attempt = 0;
+				size_t success = 0;
+			} push;
+			struct {
+				size_t attempt = 0;
+				size_t success = 0;
+				size_t empty   = 0;
+				size_t locked  = 0;
+				size_t both    = 0;
+			} pop;
+		} stats;
+	} tls;
+public:
+	static const char * name() {
+		return "Dynamic Entropy List";
+	}
+
+public:
+	static struct GlobalStats {
+		struct {
+			std::atomic_size_t attempt = 0;
+			std::atomic_size_t success = 0;
+		} push;
+		struct {
+			std::atomic_size_t attempt = 0;
+			std::atomic_size_t success = 0;
+			std::atomic_size_t empty   = 0;
+			std::atomic_size_t locked  = 0;
+			std::atomic_size_t both    = 0;
+		} pop;
+	} global_stats;
+	static void stats_tls_tally() {
+		global_stats.push.attempt += tls.stats.push.attempt;
+		global_stats.push.success += tls.stats.push.success;
+		global_stats.pop .attempt += tls.stats.pop.attempt;
+		global_stats.pop .success += tls.stats.pop.success;
+		global_stats.pop .empty   += tls.stats.pop.empty;
+		global_stats.pop .locked  += tls.stats.pop.locked;
+		global_stats.pop .both    += tls.stats.pop.both;
+	}
+
+	static void stats_print(std::ostream & os, double) {
+			const auto & global = global_stats;
+
+		double push_sur = (100.0 * double(global.push.success) / global.push.attempt);
+		double pop_sur  = (100.0 * double(global.pop .success) / global.pop .attempt);
+
+		double push_len = double(global.push.attempt     ) / global.push.success;
+		double pop_len  = double(global.pop .attempt     ) / global.pop .success;
+
+		os << "Push   Pick   : " << push_sur << " %, len " << push_len << " (" << global.push.attempt      << " / " << global.push.success << ")\n";
+		os << "Pop    Pick   : " << pop_sur  << " %, len " << pop_len  << " (" << global.pop .attempt      << " / " << global.pop .success << ")\n";
+		os << "Pop    Fails  : " << global_stats.pop .empty << "e, " << global_stats.pop .locked << "l, " << global_stats.pop .both << "\n";
+	}
+};
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/links2.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/links2.hpp	(revision 47e000c054a8dc58293a93fbc643e10735fd30f9)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/links2.hpp	(revision 47e000c054a8dc58293a93fbc643e10735fd30f9)
@@ -0,0 +1,113 @@
+#pragma once
+
+#include <assert.h>
+
+#include "utils.hpp"
+
+//------------------------------------------------------------
+// Queue based on the MCS lock
+// It is a Multi-Producer/Single-Consumer queue threads pushing
+// elements must hold on to the elements they push
+// Not appropriate for an async message queue for example,
+template<typename node_t>
+class mcs_queue {
+	node_t * volatile tail;
+
+public:
+	mcs_queue(): tail(nullptr) {}
+
+	inline bool empty() const { return !tail; }
+
+	node_t * push( node_t * elem ) {
+		/* paranoid */ assert(!elem->_links.next);
+		// Race to add to the tail
+		node_t * prev = __atomic_exchange_n(&tail, elem, __ATOMIC_SEQ_CST);
+		// If we aren't the first, we need to tell the person before us
+		// No need to
+		if (prev) prev->_links.next = elem;
+		return prev;
+	}
+
+	// Advances the head of the list, dropping the element given.
+	// Passing an element that is not the head is undefined behavior
+	// NOT Multi-Thread Safe, concurrent pushes are safe
+	node_t * advance(node_t * elem) {
+		node_t * expected = elem;
+		// Check if this is already the last item
+		if (__atomic_compare_exchange_n(&tail, &expected, nullptr, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) return nullptr;
+
+		// If not wait for next item to show-up, filled by push
+		while (!elem->_links.next) Pause();
+
+		// we need to return if the next link was empty
+		node_t * ret = elem->_links.next;
+
+		// invalidate link to reset to initial state
+		elem->_links.next = nullptr;
+		return ret;
+	}
+};
+
+//------------------------------------------------------------
+// Queue based on the MCS lock
+// Extension of the above lock which supports 'blind' pops.
+// i.e., popping a value from the head without knowing what the head is
+// has no extra guarantees beyond the mcs_queue
+template<typename node_t>
+class mpsc_queue : private mcs_queue<node_t> {
+	node_t * volatile head;
+public:
+	mpsc_queue(): mcs_queue<node_t>(), head(nullptr) {}
+
+	inline bool empty() const { return mcs_queue<node_t>::empty(); }
+
+	// Added a new element to the queue
+	// Multi-Thread Safe, Lock-Free
+	inline node_t * push(node_t * elem) {
+		node_t * prev = mcs_queue<node_t>::push(elem);
+		if (!prev) head = elem;
+		return prev;
+	}
+
+	// Pop an element from the queue
+	// return the element that was removed
+	// next is set to the new head of the queue
+	// NOT Multi-Thread Safe
+	inline node_t * pop(node_t *& next) {
+		node_t * elem = head;
+		// If head is empty just return
+		if (!elem) return nullptr;
+
+		// If there is already someone in the list, then it's easy
+		if (elem->_links.next) {
+			head = next = elem->_links.next;
+			// force memory sync
+			__atomic_thread_fence(__ATOMIC_SEQ_CST);
+
+			// invalidate link to reset to initial state
+			elem->_links.next = nullptr;
+		}
+		// Otherwise, there might be a race where it only looks but someone is enqueuing
+		else {
+			// null out head here, because we linearize with push
+			// at the CAS in advance and therefore can write to head
+			// after that point, it could overwrite the write in push
+			head = nullptr;
+			next = mcs_queue<node_t>::advance(elem);
+
+			// Only write to the head if there is a next element
+			// it is the only way we can guarantee we are not overwriting
+			// a write made in push
+			if (next) head = next;
+		}
+
+		// return removed element
+		return elem;
+	}
+
+	// Same as previous function
+	inline node_t * pop() {
+		node_t * _ = nullptr;
+		return pop(_);
+	}
+};
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/ntmove.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/ntmove.cpp	(revision 47e000c054a8dc58293a93fbc643e10735fd30f9)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/ntmove.cpp	(revision 47e000c054a8dc58293a93fbc643e10735fd30f9)
@@ -0,0 +1,87 @@
+#include <atomic>
+#include <iostream>
+#include <locale>
+#include <thread>
+
+#include <x86intrin.h>
+
+struct __attribute__((aligned(128))) Global_t {
+	volatile size_t value;
+} global;
+
+static const size_t iterations = 1'000'000'000;
+
+size_t read() {
+	// size_t r = __atomic_load_n(&global.value, __ATOMIC_RELAXED);
+	// _mm_stream_si64((long long int*)&global.value, r);
+	// // _mm_clflush( (void*)&global.value );
+	// // __builtin_prefetch((void*)&global.value);
+	// asm volatile(
+	// 	"PREFETCHNTA %[target]"
+	// 	:
+	// 	: [target] "m" (global.value)
+	// );
+	// return r;
+	return __atomic_load_n(&global.value, __ATOMIC_SEQ_CST);
+
+	// __m128i r = _mm_stream_load_si128((__m128i*)&global.value);
+	// asm volatile(
+	// 	"PREFETCHNTA %[target]"
+	// 	:
+	// 	: [target] "m" (global.value)
+	// );
+	// return ((Global_t*)&r)->value;
+	// size_t r;
+	// asm volatile(
+	// 	"MOVNTI %[target], %[r]\n\t"
+	// 	: [r] "=r" (r)
+	// 	: [target] "m" (global.value)
+	// );
+	// return r;
+}
+
+void write(size_t v) {
+	// __atomic_store_n(&global.value, v, __ATOMIC_SEQ_CST);
+	// __atomic_store_n(&global.value, v, __ATOMIC_RELAXED);
+	// asm volatile(
+	// 	"MOVNTI %[v], %[target]\n\t"
+	// 	:
+	// 	: [target] "m" (global.value), [v] "r" (v)
+	// );
+	_mm_stream_si64((long long int*)&global.value, v);
+}
+
+void reader(size_t * reads, size_t * diffs, size_t * m) {
+	size_t last = read();
+	for(size_t i = 0; i < iterations; i++) {
+		size_t val = read();
+		if(last != val) (*diffs)++;
+		last = val;
+		if(last > *m) *m = last;
+		(*reads)++;
+	}
+}
+
+std::atomic<bool> done = { false };
+
+void writer() {
+	size_t v = 0;
+	while(!done) {
+		v++;
+		write(v);
+		__atomic_thread_fence(__ATOMIC_SEQ_CST);
+	}
+}
+
+int main() {
+	std::cout.imbue(std::locale(""));
+	size_t reads = 0;
+	size_t diffs = 0;
+	size_t max   = 0;
+	auto w = std::thread(writer);
+	auto r = std::thread(reader, &reads, &diffs, &max);
+	r.join();
+	done = true;
+	w.join();
+	std::cout << reads << " " << diffs << " " << max << std::endl;
+}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor_list.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor_list.hpp	(revision 3ada8ae8221148e410bf581a2a73ad6b15798d78)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor_list.hpp	(revision 47e000c054a8dc58293a93fbc643e10735fd30f9)
@@ -39,5 +39,5 @@
 		while( __builtin_expect(ll.exchange(true),false) ) {
 			while(ll.load(std::memory_order_relaxed))
-				asm volatile("pause");
+				Pause();
 		}
 		/* paranoid */ assert(ll);
@@ -93,5 +93,5 @@
 			 && ready.compare_exchange_weak(copy, n + 1) )
 			 	break;
-			asm volatile("pause");
+			Pause();
 		}
 
@@ -133,5 +133,5 @@
 		// Step 1 : make sure no writer are in the middle of the critical section
 		while(lock.load(std::memory_order_relaxed))
-			asm volatile("pause");
+			Pause();
 
 		// Fence needed because we don't want to start trying to acquire the lock
@@ -195,5 +195,5 @@
 		//   to simply lock their own lock and enter.
 		while(lock.load(std::memory_order_relaxed))
-			asm volatile("pause");
+			Pause();
 
 		// Step 2 : lock per-proc lock
@@ -204,5 +204,5 @@
 		for(uint_fast32_t i = 0; i < s; i++) {
 			while(data[i].lock.load(std::memory_order_relaxed))
-				asm volatile("pause");
+				Pause();
 		}
 
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor_list_good.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor_list_good.cpp	(revision 3ada8ae8221148e410bf581a2a73ad6b15798d78)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/processor_list_good.cpp	(revision 47e000c054a8dc58293a93fbc643e10735fd30f9)
@@ -21,5 +21,5 @@
 		target = (target - (target % total)) + total;
 		while(waiting < target)
-			asm volatile("pause");
+			Pause();
 
 		assert(waiting < (1ul << 60));
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/randbit.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/randbit.cpp	(revision 3ada8ae8221148e410bf581a2a73ad6b15798d78)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/randbit.cpp	(revision 47e000c054a8dc58293a93fbc643e10735fd30f9)
@@ -123,5 +123,5 @@
 		target = (target - (target % total)) + total;
 		while(waiting < target)
-			asm volatile("pause");
+			Pause();
 
 		assert(waiting < (1ul << 60));
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/relaxed_list.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/relaxed_list.cpp	(revision 3ada8ae8221148e410bf581a2a73ad6b15798d78)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/relaxed_list.cpp	(revision 47e000c054a8dc58293a93fbc643e10735fd30f9)
@@ -206,5 +206,5 @@
 	std::cout << "Total ops     : " << ops << "(" << global.in << "i, " << global.out << "o, " << global.empty << "e)\n";
 	#ifndef NO_STATS
-		LIST_VARIANT<Node>::stats_print(std::cout);
+		LIST_VARIANT<Node>::stats_print(std::cout, duration);
 	#endif
 }
@@ -368,6 +368,8 @@
 
 		for(Node * & node : nodes) {
-			node = list.pop();
-			assert(node);
+			node = nullptr;
+			while(!node) {
+				node = list.pop();
+			}
 			local.crc_out += node->value;
 			local.out++;
@@ -691,6 +693,6 @@
 
 				for(const auto & n : nodes) {
-					local.valmax = max(local.valmax, size_t(n.value));
-					local.valmin = min(local.valmin, size_t(n.value));
+					local.valmax = std::max(local.valmax, size_t(n.value));
+					local.valmin = std::min(local.valmin, size_t(n.value));
 				}
 
@@ -773,5 +775,5 @@
 						try {
 							arg = optarg = argv[optind];
-							nnodes = stoul(optarg, &len);
+							nnodes = std::stoul(optarg, &len);
 							if(len != arg.size()) { throw std::invalid_argument(""); }
 						} catch(std::invalid_argument &) {
@@ -792,5 +794,5 @@
 						try {
 							arg = optarg = argv[optind];
-							nnodes = stoul(optarg, &len);
+							nnodes = std::stoul(optarg, &len);
 							if(len != arg.size()) { throw std::invalid_argument(""); }
 						} catch(std::invalid_argument &) {
@@ -812,5 +814,5 @@
 						try {
 							arg = optarg = argv[optind];
-							nnodes = stoul(optarg, &len);
+							nnodes = std::stoul(optarg, &len);
 							if(len != arg.size()) { throw std::invalid_argument(""); }
 							nslots = nnodes;
@@ -823,5 +825,5 @@
 						try {
 							arg = optarg = argv[optind];
-							nnodes = stoul(optarg, &len);
+							nnodes = std::stoul(optarg, &len);
 							if(len != arg.size()) { throw std::invalid_argument(""); }
 						} catch(std::invalid_argument &) {
@@ -831,5 +833,5 @@
 						try {
 							arg = optarg = argv[optind + 1];
-							nslots = stoul(optarg, &len);
+							nslots = std::stoul(optarg, &len);
 							if(len != arg.size()) { throw std::invalid_argument(""); }
 						} catch(std::invalid_argument &) {
@@ -884,5 +886,5 @@
 			case 'd':
 				try {
-					duration = stod(optarg, &len);
+					duration = std::stod(optarg, &len);
 					if(len != arg.size()) { throw std::invalid_argument(""); }
 				} catch(std::invalid_argument &) {
@@ -893,5 +895,5 @@
 			case 't':
 				try {
-					nthreads = stoul(optarg, &len);
+					nthreads = std::stoul(optarg, &len);
 					if(len != arg.size()) { throw std::invalid_argument(""); }
 				} catch(std::invalid_argument &) {
@@ -902,5 +904,5 @@
 			case 'q':
 				try {
-					nqueues = stoul(optarg, &len);
+					nqueues = std::stoul(optarg, &len);
 					if(len != arg.size()) { throw std::invalid_argument(""); }
 				} catch(std::invalid_argument &) {
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/snzi-packed.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/snzi-packed.hpp	(revision 3ada8ae8221148e410bf581a2a73ad6b15798d78)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/snzi-packed.hpp	(revision 47e000c054a8dc58293a93fbc643e10735fd30f9)
@@ -168,5 +168,4 @@
 	for(int i = 0; i < width; i++) {
 		int idx = i % hwdith;
-		std::cout << i << " -> " << idx + width << std::endl;
 		leafs[i].parent = &nodes[ idx ];
 	}
@@ -174,5 +173,4 @@
 	for(int i = 0; i < root; i++) {
 		int idx = (i / 2) + hwdith;
-		std::cout << i + width << " -> " << idx + width << std::endl;
 		nodes[i].parent = &nodes[ idx ];
 	}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/snzi.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/snzi.hpp	(revision 3ada8ae8221148e410bf581a2a73ad6b15798d78)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/snzi.hpp	(revision 47e000c054a8dc58293a93fbc643e10735fd30f9)
@@ -159,5 +159,4 @@
 	std::cout << "SNZI: " << depth << "x" << width << "(" << mask - 1 << ") " << (sizeof(snzi_t::node) * (root + 1)) << " bytes" << std::endl;
 	for(int i = 0; i < root; i++) {
-		std::cout << i << " -> " << (i / base) + width << std::endl;
 		nodes[i].parent = &nodes[(i / base) + width];
 	}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/utils.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/utils.hpp	(revision 3ada8ae8221148e410bf581a2a73ad6b15798d78)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/utils.hpp	(revision 47e000c054a8dc58293a93fbc643e10735fd30f9)
@@ -12,26 +12,4 @@
 
 #include <x86intrin.h>
-
-// Barrier from
-class barrier_t {
-public:
-	barrier_t(size_t total)
-		: waiting(0)
-		, total(total)
-	{}
-
-	void wait(unsigned) {
-		size_t target = waiting++;
-		target = (target - (target % total)) + total;
-		while(waiting < target)
-			asm volatile("pause");
-
-		assert(waiting < (1ul << 60));
-    	}
-
-private:
-	std::atomic<size_t> waiting;
-	size_t total;
-};
 
 // class Random {
@@ -102,9 +80,26 @@
 };
 
-static inline long long rdtscl(void) {
-    unsigned int lo, hi;
-    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
-    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
-}
+static inline long long int rdtscl(void) {
+	#if defined( __i386 ) || defined( __x86_64 )
+		unsigned int lo, hi;
+		__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
+		return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
+	#elif defined( __aarch64__ ) || defined( __arm__ )
+		// https://github.com/google/benchmark/blob/v1.1.0/src/cycleclock.h#L116
+		long long int virtual_timer_value;
+		asm volatile("mrs %0, cntvct_el0" : "=r"(virtual_timer_value));
+		return virtual_timer_value;
+	#else
+		#error unsupported hardware architecture
+	#endif
+}
+
+#if defined( __i386 ) || defined( __x86_64 )
+	#define Pause() __asm__ __volatile__ ( "pause" : : : )
+#elif defined( __ARM_ARCH )
+	#define Pause() __asm__ __volatile__ ( "YIELD" : : : )
+#else
+	#error unsupported architecture
+#endif
 
 static inline void affinity(int tid) {
@@ -195,4 +190,26 @@
 }
 
+// Barrier from
+class barrier_t {
+public:
+	barrier_t(size_t total)
+		: waiting(0)
+		, total(total)
+	{}
+
+	void wait(unsigned) {
+		size_t target = waiting++;
+		target = (target - (target % total)) + total;
+		while(waiting < target)
+			Pause();
+
+		assert(waiting < (1ul << 60));
+    	}
+
+private:
+	std::atomic<size_t> waiting;
+	size_t total;
+};
+
 struct spinlock_t {
 	std::atomic_bool ll = { false };
@@ -201,5 +218,5 @@
 		while( __builtin_expect(ll.exchange(true),false) ) {
 			while(ll.load(std::memory_order_relaxed))
-				asm volatile("pause");
+				Pause();
 		}
 	}
Index: doc/theses/thierry_delisle_PhD/code/readyQ_proto/work_stealing.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readyQ_proto/work_stealing.hpp	(revision 3ada8ae8221148e410bf581a2a73ad6b15798d78)
+++ doc/theses/thierry_delisle_PhD/code/readyQ_proto/work_stealing.hpp	(revision 47e000c054a8dc58293a93fbc643e10735fd30f9)
@@ -6,4 +6,5 @@
 #include <memory>
 #include <mutex>
+#include <thread>
 #include <type_traits>
 
@@ -11,7 +12,24 @@
 #include "utils.hpp"
 #include "links.hpp"
+#include "links2.hpp"
 #include "snzi.hpp"
 
+#include <x86intrin.h>
+
 using namespace std;
+
+static const long long lim = 2000;
+static const unsigned nqueues = 2;
+
+struct __attribute__((aligned(128))) timestamp_t {
+	volatile unsigned long long val = 0;
+};
+
+template<typename node_t>
+struct __attribute__((aligned(128))) localQ_t {
+	mpsc_queue<node_t> queue = {};
+	spinlock_t lock = {};
+	bool needs_help = true;
+};
 
 template<typename node_t>
@@ -25,7 +43,8 @@
 
 	work_stealing(unsigned _numThreads, unsigned)
-		: numThreads(_numThreads)
+		: numThreads(_numThreads * nqueues)
 		, lists(new intrusive_queue_t<node_t>[numThreads])
-		, snzi( std::log2( numThreads / 2 ), 2 )
+		, times(new timestamp_t[numThreads])
+		// , snzi( std::log2( numThreads / 2 ), 2 )
 
 	{
@@ -39,103 +58,116 @@
 
 	__attribute__((noinline, hot)) void push(node_t * node) {
-		node->_links.ts = rdtscl();
-		if( node->_links.hint > numThreads ) {
-			node->_links.hint = tls.rng.next() % numThreads;
-			tls.stat.push.nhint++;
-		}
-
-		unsigned i = node->_links.hint;
+		// node->_links.ts = rdtscl();
+		node->_links.ts = 1;
+
+		auto & list = *({
+			unsigned i;
+			do {
+				tls.stats.push.attempt++;
+				// unsigned r = tls.rng1.next();
+				unsigned r = tls.it++;
+				if(tls.my_queue == outside) {
+					i = r % numThreads;
+				} else {
+					i = tls.my_queue + (r % nqueues);
+				}
+			} while(!lists[i].lock.try_lock());
+		 	&lists[i];
+		});
+
+		list.push( node );
+		list.lock.unlock();
+		// tls.rng2.set_raw_state( tls.rng1.get_raw_state());
+		// count++;
+		tls.stats.push.success++;
+	}
+
+	__attribute__((noinline, hot)) node_t * pop() {
+		if( tls.myfriend == outside ) {
+			auto r  = tls.rng1.next();
+			tls.myfriend = r % numThreads;
+			times[tls.myfriend].val = 0;
+		}
+		else if(times[tls.myfriend].val == 0) {
+			node_t * n = try_pop(tls.myfriend, tls.stats.pop.help);
+			tls.stats.help++;
+			tls.myfriend = outside;
+			if(n) return n;
+		}
+
+		if(tls.my_queue != outside) {
+			node_t * n = local();
+			if(n) return n;
+		}
+
+		// try steal
+		for(int i = 0; i < 25; i++) {
+			node_t * n = steal();
+			if(n) return n;
+		}
+
+		return search();
+	}
+
+private:
+	inline node_t * local() {
+		// unsigned i = (tls.rng2.prev() % 4) + tls.my_queue;
+		unsigned i = (--tls.it % nqueues) + tls.my_queue;
+		return try_pop(i, tls.stats.pop.local);
+	}
+
+	inline node_t * steal() {
+		unsigned i = tls.rng2.prev() % numThreads;
+		return try_pop(i, tls.stats.pop.steal);
+	}
+
+	inline node_t * search() {
+		unsigned offset = tls.rng2.prev();
+		for(unsigned i = 0; i < numThreads; i++) {
+			unsigned idx = (offset + i) % numThreads;
+			node_t * thrd = try_pop(idx, tls.stats.pop.search);
+			if(thrd) {
+				return thrd;
+			}
+		}
+
+		return nullptr;
+	}
+
+private:
+	struct attempt_stat_t {
+		std::size_t attempt = { 0 };
+		std::size_t elock   = { 0 };
+		std::size_t eempty  = { 0 };
+		std::size_t espec   = { 0 };
+		std::size_t success = { 0 };
+	};
+
+	node_t * try_pop(unsigned i, attempt_stat_t & stat) {
+		assert(i < numThreads);
 		auto & list = lists[i];
-		list.lock.lock();
-
-		if(list.push( node )) {
-			snzi.arrive(i);
-		}
-
-		list.lock.unlock();
-	}
-
-	__attribute__((noinline, hot)) node_t * pop() {
-		node_t * node;
-		while(true) {
-			if(!snzi.query()) {
-				return nullptr;
-			}
-
-			{
-				unsigned i = tls.my_queue;
-				auto & list = lists[i];
-				if( list.ts() != 0 ) {
-					list.lock.lock();
-					if((node = try_pop(i))) {
-						tls.stat.pop.local.success++;
-						break;
-					}
-					else {
-						tls.stat.pop.local.elock++;
-					}
-				}
-				else {
-					tls.stat.pop.local.espec++;
-				}
-			}
-
-			tls.stat.pop.steal.tried++;
-
-			int i = tls.rng.next() % numThreads;
-			auto & list = lists[i];
-			if( list.ts() == 0 ) {
-				tls.stat.pop.steal.empty++;
-				continue;
-			}
-
-			if( !list.lock.try_lock() ) {
-				tls.stat.pop.steal.locked++;
-				continue;
-			}
-
-			if((node = try_pop(i))) {
-				tls.stat.pop.steal.success++;
-				break;
-			}
-		}
-
-		#if defined(READ)
-			const unsigned f = READ;
-			if(0 == (tls.it % f)) {
-				unsigned i = tls.it / f;
-				lists[i % numThreads].ts();
-			}
-			// lists[tls.it].ts();
-			tls.it++;
-		#endif
-
-
-		return node;
-	}
-
-private:
-	node_t * try_pop(unsigned i) {
-		auto & list = lists[i];
+		stat.attempt++;
+
+		// If the list is empty, don't try
+		if(list.ts() == 0) { stat.espec++; return nullptr; }
+
+		// If we can't get the lock, move on
+		if( !list.lock.try_lock() ) { stat.elock++; return nullptr; }
+
 
 		// If list is empty, unlock and retry
 		if( list.ts() == 0 ) {
 			list.lock.unlock();
+			stat.eempty++;
 			return nullptr;
 		}
 
-			// Actually pop the list
-		node_t * node;
-		bool emptied;
-		std::tie(node, emptied) = list.pop();
-		assert(node);
-
-		if(emptied) {
-			snzi.depart(i);
-		}
-
-		// Unlock and return
+		auto node = list.pop();
 		list.lock.unlock();
-		return node;
+		stat.success++;
+		times[i].val = 1; //node.first->_links.ts;
+		// count--;
+		// _mm_stream_si64((long long int*)&times[i].val, node.first->_links.ts);
+		return node.first;
 	}
 
@@ -144,7 +176,19 @@
 
 	static std::atomic_uint32_t ticket;
+	static const unsigned outside = 0xFFFFFFFF;
+
+	static inline unsigned calc_preferred() {
+		unsigned t = ticket++;
+		if(t == 0) return outside;
+		unsigned i = (t - 1) * nqueues;
+		return i;
+	}
+
 	static __attribute__((aligned(128))) thread_local struct TLS {
-		Random     rng = { int(rdtscl()) };
-		unsigned   my_queue = ticket++;
+		Random     rng1 = { unsigned(std::hash<std::thread::id>{}(std::this_thread::get_id()) ^ rdtscl()) };
+		Random     rng2 = { unsigned(std::hash<std::thread::id>{}(std::this_thread::get_id()) ^ rdtscl()) };
+		unsigned   it   = 0;
+		unsigned   my_queue = calc_preferred();
+		unsigned   myfriend = outside;
 		#if defined(READ)
 			unsigned it = 0;
@@ -152,20 +196,15 @@
 		struct {
 			struct {
-				std::size_t nhint = { 0 };
+				std::size_t attempt = { 0 };
+				std::size_t success = { 0 };
 			} push;
 			struct {
-				struct {
-					std::size_t success = { 0 };
-					std::size_t espec = { 0 };
-					std::size_t elock = { 0 };
-				} local;
-				struct {
-					std::size_t tried   = { 0 };
-					std::size_t locked  = { 0 };
-					std::size_t empty   = { 0 };
-					std::size_t success = { 0 };
-				} steal;
+				attempt_stat_t help;
+				attempt_stat_t local;
+				attempt_stat_t steal;
+				attempt_stat_t search;
 			} pop;
-		} stat;
+			std::size_t help = { 0 };
+		} stats;
 	} tls;
 
@@ -173,5 +212,6 @@
 	const unsigned numThreads;
     	std::unique_ptr<intrusive_queue_t<node_t> []> lists;
-	__attribute__((aligned(64))) snzi_t snzi;
+    	std::unique_ptr<timestamp_t []> times;
+	__attribute__((aligned(128))) std::atomic_size_t count;
 
 #ifndef NO_STATS
@@ -179,42 +219,94 @@
 	static struct GlobalStats {
 		struct {
-			std::atomic_size_t nhint = { 0 };
+			std::atomic_size_t attempt = { 0 };
+			std::atomic_size_t success = { 0 };
 		} push;
 		struct {
 			struct {
+				std::atomic_size_t attempt = { 0 };
+				std::atomic_size_t elock   = { 0 };
+				std::atomic_size_t eempty  = { 0 };
+				std::atomic_size_t espec   = { 0 };
 				std::atomic_size_t success = { 0 };
-				std::atomic_size_t espec = { 0 };
-				std::atomic_size_t elock = { 0 };
+			} help;
+			struct {
+				std::atomic_size_t attempt = { 0 };
+				std::atomic_size_t elock   = { 0 };
+				std::atomic_size_t eempty  = { 0 };
+				std::atomic_size_t espec   = { 0 };
+				std::atomic_size_t success = { 0 };
 			} local;
 			struct {
-				std::atomic_size_t tried   = { 0 };
-				std::atomic_size_t locked  = { 0 };
-				std::atomic_size_t empty   = { 0 };
+				std::atomic_size_t attempt = { 0 };
+				std::atomic_size_t elock   = { 0 };
+				std::atomic_size_t eempty  = { 0 };
+				std::atomic_size_t espec   = { 0 };
 				std::atomic_size_t success = { 0 };
 			} steal;
+			struct {
+				std::atomic_size_t attempt = { 0 };
+				std::atomic_size_t elock   = { 0 };
+				std::atomic_size_t eempty  = { 0 };
+				std::atomic_size_t espec   = { 0 };
+				std::atomic_size_t success = { 0 };
+			} search;
 		} pop;
+		std::atomic_size_t help = { 0 };
 	} global_stats;
 
 public:
 	static void stats_tls_tally() {
-		global_stats.push.nhint += tls.stat.push.nhint;
-		global_stats.pop.local.success += tls.stat.pop.local.success;
-		global_stats.pop.local.espec   += tls.stat.pop.local.espec  ;
-		global_stats.pop.local.elock   += tls.stat.pop.local.elock  ;
-		global_stats.pop.steal.tried   += tls.stat.pop.steal.tried  ;
-		global_stats.pop.steal.locked  += tls.stat.pop.steal.locked ;
-		global_stats.pop.steal.empty   += tls.stat.pop.steal.empty  ;
-		global_stats.pop.steal.success += tls.stat.pop.steal.success;
-	}
-
-	static void stats_print(std::ostream & os ) {
+		global_stats.push.attempt += tls.stats.push.attempt;
+		global_stats.push.success += tls.stats.push.success;
+		global_stats.pop.help  .attempt += tls.stats.pop.help  .attempt;
+		global_stats.pop.help  .elock   += tls.stats.pop.help  .elock  ;
+		global_stats.pop.help  .eempty  += tls.stats.pop.help  .eempty ;
+		global_stats.pop.help  .espec   += tls.stats.pop.help  .espec  ;
+		global_stats.pop.help  .success += tls.stats.pop.help  .success;
+		global_stats.pop.local .attempt += tls.stats.pop.local .attempt;
+		global_stats.pop.local .elock   += tls.stats.pop.local .elock  ;
+		global_stats.pop.local .eempty  += tls.stats.pop.local .eempty ;
+		global_stats.pop.local .espec   += tls.stats.pop.local .espec  ;
+		global_stats.pop.local .success += tls.stats.pop.local .success;
+		global_stats.pop.steal .attempt += tls.stats.pop.steal .attempt;
+		global_stats.pop.steal .elock   += tls.stats.pop.steal .elock  ;
+		global_stats.pop.steal .eempty  += tls.stats.pop.steal .eempty ;
+		global_stats.pop.steal .espec   += tls.stats.pop.steal .espec  ;
+		global_stats.pop.steal .success += tls.stats.pop.steal .success;
+		global_stats.pop.search.attempt += tls.stats.pop.search.attempt;
+		global_stats.pop.search.elock   += tls.stats.pop.search.elock  ;
+		global_stats.pop.search.eempty  += tls.stats.pop.search.eempty ;
+		global_stats.pop.search.espec   += tls.stats.pop.search.espec  ;
+		global_stats.pop.search.success += tls.stats.pop.search.success;
+		global_stats.help += tls.stats.help;
+	}
+
+	static void stats_print(std::ostream & os, double duration ) {
 		std::cout << "----- Work Stealing Stats -----" << std::endl;
 
-		double stealSucc = double(global_stats.pop.steal.success) / global_stats.pop.steal.tried;
-		os << "Push to new Q : " << std::setw(15) << global_stats.push.nhint << "\n";
-		os << "Local Pop     : " << std::setw(15) << global_stats.pop.local.success << "\n";
-		os << "Steal Pop     : " << std::setw(15) << global_stats.pop.steal.success << "(" << global_stats.pop.local.espec << "s, " << global_stats.pop.local.elock << "l)\n";
-		os << "Steal Success : " << std::setw(15) << stealSucc << "(" << global_stats.pop.steal.tried << " tries)\n";
-		os << "Steal Fails   : " << std::setw(15) << global_stats.pop.steal.empty << "e, " << global_stats.pop.steal.locked << "l\n";
+		double push_suc = (100.0 * double(global_stats.push.success) / global_stats.push.attempt);
+		double push_len = double(global_stats.push.attempt     ) / global_stats.push.success;
+		os << "Push   Pick : " << push_suc << " %, len " << push_len << " (" << global_stats.push.attempt      << " / " << global_stats.push.success << ")\n";
+
+		double hlp_suc = (100.0 * double(global_stats.pop.help.success) / global_stats.pop.help.attempt);
+		double hlp_len = double(global_stats.pop.help.attempt     ) / global_stats.pop.help.success;
+		os << "Help        : " << hlp_suc << " %, len " << hlp_len << " (" << global_stats.pop.help.attempt      << " / " << global_stats.pop.help.success << ")\n";
+		os << "Help Fail   : " << global_stats.pop.help.espec << "s, " << global_stats.pop.help.eempty << "e, " << global_stats.pop.help.elock << "l\n";
+
+		double pop_suc = (100.0 * double(global_stats.pop.local.success) / global_stats.pop.local.attempt);
+		double pop_len = double(global_stats.pop.local.attempt     ) / global_stats.pop.local.success;
+		os << "Local       : " << pop_suc << " %, len " << pop_len << " (" << global_stats.pop.local.attempt      << " / " << global_stats.pop.local.success << ")\n";
+		os << "Local Fail  : " << global_stats.pop.local.espec << "s, " << global_stats.pop.local.eempty << "e, " << global_stats.pop.local.elock << "l\n";
+
+		double stl_suc = (100.0 * double(global_stats.pop.steal.success) / global_stats.pop.steal.attempt);
+		double stl_len = double(global_stats.pop.steal.attempt     ) / global_stats.pop.steal.success;
+		os << "Steal       : " << stl_suc << " %, len " << stl_len << " (" << global_stats.pop.steal.attempt      << " / " << global_stats.pop.steal.success << ")\n";
+		os << "Steal Fail  : " << global_stats.pop.steal.espec << "s, " << global_stats.pop.steal.eempty << "e, " << global_stats.pop.steal.elock << "l\n";
+
+		double srh_suc = (100.0 * double(global_stats.pop.search.success) / global_stats.pop.search.attempt);
+		double srh_len = double(global_stats.pop.search.attempt     ) / global_stats.pop.search.success;
+		os << "Search      : " << srh_suc << " %, len " << srh_len << " (" << global_stats.pop.search.attempt      << " / " << global_stats.pop.search.success << ")\n";
+		os << "Search Fail : " << global_stats.pop.search.espec << "s, " << global_stats.pop.search.eempty << "e, " << global_stats.pop.search.elock << "l\n";
+		os << "Helps       : " << std::setw(15) << std::scientific << global_stats.help / duration << "/sec (" << global_stats.help  << ")\n";
 	}
 private:
