Index: doc/theses/thierry_delisle_PhD/code/assert.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/assert.hpp	(revision b2a37b045b22a2a00392f054434cf6298517bec0)
+++ doc/theses/thierry_delisle_PhD/code/assert.hpp	(revision b2a37b045b22a2a00392f054434cf6298517bec0)
@@ -0,0 +1,23 @@
+#pragma once
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+
+#define sstr(s) #s
+#define xstr(s) sstr(s)
+
+extern const char * __my_progname;
+
+#define assertf(cond, ...) ({             \
+	if(!(cond)) {                       \
+		fprintf(stderr, "%s: " __FILE__ ":" xstr(__LINE__) ": %s: Assertion '" xstr(cond) "' failed.\n", __my_progname, __PRETTY_FUNCTION__); \
+		fprintf(stderr, __VA_ARGS__); \
+		fprintf(stderr, "\n"); \
+		std::abort();                 \
+	}                                   \
+})
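+
+// Usage sketch (illustrative; in these tests assertf is only referenced from
+// commented-out checks in relaxed_list.hpp):
+//   assertf(ptr != nullptr, "expected non-null, got %p", (void*)ptr);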
Index: doc/theses/thierry_delisle_PhD/code/prefetch.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/prefetch.cpp	(revision b2a37b045b22a2a00392f054434cf6298517bec0)
+++ doc/theses/thierry_delisle_PhD/code/prefetch.cpp	(revision b2a37b045b22a2a00392f054434cf6298517bec0)
@@ -0,0 +1,113 @@
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <iostream>
+#include <locale>
+#include <memory>
+#include <random>
+#include <string>
+#include <vector>
+
+#include <cassert>
+#include <cstdlib>
+
+struct __attribute__((aligned(64))) element {
+	size_t value;
+};
+
+using block = std::array<element, 100>;
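+
+// Each 64-byte-aligned element occupies its own cache line. A block holds
+// random values with a sentinel (its size, 100) in the last slot; find()
+// scans until it reaches a value equal to the size, so each lookup touches
+// up to 100 distinct cache lines.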
+
+block * create() {
+	block * b = new block();
+	for(auto & e : *b) {
+		e.value = rand();
+	}
+	b->back().value = b->size();
+
+	return b;
+}
+
+static inline size_t find(const block & b) {
+	size_t r = 0;
+	for(; r < b.size(); r++) {
+		if(__builtin_expect(b[r].value == b.size(), false)) break;
+	}
+
+	return r;
+}
+
+void usage(char * argv[]) {
+	std::cerr << argv[0] << ": [DURATION (FLOAT:SEC)] [NBLOCKS]" << std::endl;
+	std::exit(1);
+}
+
+int main(int argc, char * argv[]) {
+	size_t nblocks = 1000;
+	double duration = 5;
+
+	std::cout.imbue(std::locale(""));
+
+	switch (argc)
+	{
+	case 3:
+		nblocks = std::stoul(argv[2]);
+		[[fallthrough]];
+	case 2:
+		duration = std::stod(argv[1]);
+		if( duration <= 0.0 ) {
+			std::cerr << "Duration must be positive, was " << argv[1] << "(" << duration << ")" << std::endl;
+			usage(argv);
+		}
+		[[fallthrough]];
+	case 1:
+		break;
+	default:
+		usage(argv);
+		break;
+	}
+
+	std::vector<std::unique_ptr<block>> blocks;
+	for(size_t i = 0; i < nblocks; i++) {
+		blocks.emplace_back( create() );
+	}
+	std::shuffle(blocks.begin(), blocks.end(), std::mt19937(rand())); // std::random_shuffle was removed in C++17
+
+	size_t CRC = 0;
+	size_t count = 0;
+
+	using clock = std::chrono::high_resolution_clock;
+	auto before = clock::now();
+
+	while(true) {
+		for(const auto & b : blocks) {
+			CRC += find(*b);
+			count++;
+		}
+		auto now = clock::now();
+		std::chrono::duration<double> durr = now - before;
+		if( durr.count() > duration ) {
+			break;
+		}
+	}
+
+	auto after = clock::now();
+	std::chrono::duration<double> durr = after - before;
+	duration = durr.count();
+
+	using std::chrono::duration_cast;
+	using std::chrono::nanoseconds;
+
+	size_t ops_sec = size_t(double(count) / duration);
+	auto dur_nano = duration_cast<nanoseconds>(std::chrono::duration<double>(1.0)).count();
+
+	std::cout << "CRC           : " << CRC << "\n";
+	std::cout << "Duration      : " << duration << "s\n";
+	std::cout << "Total ops     : " << count << "\n";
+	std::cout << "Ops/sec       : " << ops_sec << "\n";
+	std::cout << "ns/Op         : " << ( dur_nano / ops_sec )<< "\n";
+}
Index: doc/theses/thierry_delisle_PhD/code/processor.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/processor.hpp	(revision b2a37b045b22a2a00392f054434cf6298517bec0)
+++ doc/theses/thierry_delisle_PhD/code/processor.hpp	(revision b2a37b045b22a2a00392f054434cf6298517bec0)
@@ -0,0 +1,68 @@
+#include <atomic>
+
+struct thread {};
+
+struct cluster {
+	void add();
+	void remove();
+	thread * next();
+};
+
+struct processor {
+
+	cluster cluster;
+	std::atomic<bool> stop;
+	volatile bool idle;
+};
+
+
+void halt(); // left abstract: blocks this processor until new work arrives
+
+void run(thread *) {
+	// verify preemption
+
+	// run Thread
+
+	// verify preemption
+
+	// finish Running
+}
+
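+// Processor main loop: illustrative sketch of the idle-sleep protocol.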
+void main(processor & self) {
+
+	self.cluster.add();
+
+	while(!self.stop) {
+		if(thread * t = self.cluster.next()) {
+			run(t);
+			continue;
+		}
+
+		// Advertise idleness, then re-check for work: the fence orders the
+		// idle store before the second check so a thread submitted
+		// concurrently cannot be missed by both checks.
+		self.idle = true;
+		std::atomic_thread_fence(std::memory_order_seq_cst);
+
+		if(thread * t = self.cluster.next()) {
+			self.idle = false;
+			run(t);
+			continue;
+		}
+
+		halt();
+	}
+
+	self.cluster.remove();
+
+}
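+
+// Sketch of the matching wake-up path a submitter would run (assumed here,
+// not defined in this header): publish the work, fence, then check idle.
+//
+//   void submit(cluster & c, thread * t) {      // hypothetical helper
+//       c.add_thread(t);                        // hypothetical
+//       std::atomic_thread_fence(std::memory_order_seq_cst);
+//       if(proc.idle) wake(proc);               // hypothetical
+//   }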
Index: doc/theses/thierry_delisle_PhD/code/processor_list.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/processor_list.hpp	(revision b2a37b045b22a2a00392f054434cf6298517bec0)
+++ doc/theses/thierry_delisle_PhD/code/processor_list.hpp	(revision b2a37b045b22a2a00392f054434cf6298517bec0)
@@ -0,0 +1,200 @@
+#include <cassert>
+
+#include <atomic>
+#include <new>
+#include <type_traits>
+
+struct processor;
+
+struct __attribute__((aligned(64))) processor_id {
+	std::atomic<processor *> handle;
+	std::atomic<bool> lock;
+
+	processor_id() = default;
+	processor_id(processor * proc) : handle(proc), lock() {
+		/*paranoid*/ assert(std::atomic_is_lock_free(&lock));
+	}
+};
+
+extern unsigned num();
+
+#define ERROR throw 1
+
+class processor_list {
+private:
+
+	static constexpr std::size_t cache_line_size = 64;
+
+	static_assert(sizeof (processor_id) <= cache_line_size, "ERROR: Instances must fit in one cache line" );
+	static_assert(alignof(processor_id) == cache_line_size, "ERROR: Instances must be aligned to one cache line" );
+
+	const unsigned max;     // total cachelines allocated
+	std::atomic_uint alloc; // cachelines currently in use
+	std::atomic_uint ready; // cachelines ready to iterate over (differs from alloc while a thread is in the second half of doregister)
+	std::atomic<bool> lock; // writerlock
+	processor_id * data;    // data pointer
+
+private:
+	inline void acquire(std::atomic<bool> & ll) {
+		while( __builtin_expect(ll.exchange(true),false) ) {
+			while(ll.load(std::memory_order_relaxed))
+				asm volatile("pause");
+		}
+		/* paranoid */ assert(ll);
+	}
+
+public:
+	processor_list()
+		: max(num())
+		, alloc(0)
+		, ready(0)
+		, lock{false}
+		, data( new processor_id[max] )
+	{
+		/*paranoid*/ assert(num() == max);
+		/*paranoid*/ assert(std::atomic_is_lock_free(&alloc));
+		/*paranoid*/ assert(std::atomic_is_lock_free(&ready));
+	}
+
+	~processor_list() {
+		delete[] data;
+	}
+
+	//=======================================================================
+	// Lock-Free registering/unregistering of threads
+	unsigned doregister(processor * proc) {
+		// Step 1 : check if there is already space in the data
+		uint_fast32_t s = ready;
+
+		// Check among all the ready
+		for(uint_fast32_t i = 0; i < s; i++) {
+			processor * null = nullptr; // reset each iteration: compare_exchange overwrites it on failure
+			if( data[i].handle.load(std::memory_order_relaxed) == null
+			 && data[i].handle.compare_exchange_strong(null, proc)) {
+				/*paranoid*/ assert(i < ready);
+				/*paranoid*/ assert(alignof(decltype(data[i])) == cache_line_size);
+				/*paranoid*/ assert((uintptr_t(&data[i]) % cache_line_size) == 0);
+				return i;
+			}
+		}
+
+		if(max <= alloc) ERROR;
+
+		// Step 2 : F&A to get a new spot in the array.
+		uint_fast32_t n = alloc++;
+		if(max <= n) ERROR;
+
+		// Step 3 : Mark space as used and then publish it.
+		void * storage = &data[n];
+		new (storage) processor_id( proc );
+		while(true) {
+			unsigned copy = n;
+			if( ready.load(std::memory_order_relaxed) == n
+			 && ready.compare_exchange_weak(copy, n + 1) )
+			 	break;
+			asm volatile("pause");
+		}
+
+		// Return new spot.
+		/*paranoid*/ assert(n < ready);
+		/*paranoid*/ assert(alignof(decltype(data[n])) == cache_line_size);
+		/*paranoid*/ assert((uintptr_t(&data[n]) % cache_line_size) == 0);
+		return n;
+	}
+
+	processor * unregister(unsigned iproc) {
+		/*paranoid*/ assert(iproc < ready);
+		auto ret = data[iproc].handle.load(std::memory_order_relaxed);
+		data[iproc].handle = nullptr;
+		return ret;
+	}
+
+	// Reset all registration
+	// Unsafe in most cases, use for testing only.
+	void reset() {
+		alloc = 0;
+		ready = 0;
+	}
+
+	processor * get(unsigned iproc) {
+		return data[iproc].handle.load(std::memory_order_relaxed);
+	}
+
+	//=======================================================================
+	// Reader-writer lock implementation
+	// Concurrent with doregister/unregister,
+	//    i.e., threads can be added at any point during or between the entry/exit
+
+	//-----------------------------------------------------------------------
+	// Reader side
+	void read_lock(unsigned iproc) {
+		/*paranoid*/ assert(iproc < ready);
+
+		// Step 1 : make sure no writer are in the middle of the critical section
+		while(lock.load(std::memory_order_relaxed))
+			asm volatile("pause");
+
+		// Fence needed because we don't want to start trying to acquire the lock
+		// before we read a false.
+		// Not needed on x86
+		// std::atomic_thread_fence(std::memory_order_seq_cst);
+
+		// Step 2 : acquire our local lock
+		acquire( data[iproc].lock );
+		/*paranoid*/ assert(data[iproc].lock);
+	}
+
+	void read_unlock(unsigned iproc) {
+		/*paranoid*/ assert(iproc < ready);
+		/*paranoid*/ assert(data[iproc].lock);
+		data[iproc].lock.store(false, std::memory_order_release);
+	}
+
+	//-----------------------------------------------------------------------
+	// Writer side
+	uint_fast32_t write_lock() {
+		// Step 1 : lock the global lock
+		// Needed so that processors registering mid-critical-section
+		//   cannot simply lock their own lock and enter.
+		acquire(lock);
+
+		// Step 2 : lock per-proc lock
+		// Processors that are currently being registered aren't counted
+		//   but can't be in read_lock or in the critical section.
+		// All other processors are counted
+		uint_fast32_t s = ready;
+		for(uint_fast32_t i = 0; i < s; i++) {
+			acquire( data[i].lock );
+		}
+
+		return s;
+	}
+
+	void write_unlock(uint_fast32_t last_s) {
+		// Step 1 : release the per-processor locks
+		// Only the first last_s locks were acquired in write_lock;
+		//   processors registered during the critical section never had
+		//   their lock taken, so only these must be released.
+		for(uint_fast32_t i = 0; i < last_s; i++) {
+			assert(data[i].lock);
+			data[i].lock.store(false, std::memory_order_release);
+		}
+
+		// Step 2 : release global lock
+		/*paranoid*/ assert(true == lock);
+		lock.store(false, std::memory_order_release);
+	}
+
+};
+
+#undef ERROR
Index: doc/theses/thierry_delisle_PhD/code/processor_list_fast.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/processor_list_fast.cpp	(revision b2a37b045b22a2a00392f054434cf6298517bec0)
+++ doc/theses/thierry_delisle_PhD/code/processor_list_fast.cpp	(revision b2a37b045b22a2a00392f054434cf6298517bec0)
@@ -0,0 +1,165 @@
+#include "processor_list.hpp"
+
+#include <array>
+#include <iomanip>
+#include <iostream>
+#include <locale>
+#include <string>
+#include <thread>
+
+#include "utils.hpp"
+
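+// Maximum number of processor_id slots the list may allocate: 16M cache
+// lines, i.e. about 1 GiB of (mostly untouched) address space.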
+unsigned num() {
+	return 0x1000000;
+}
+
+//-------------------
+
+struct processor {
+	unsigned id;
+};
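+
+// Benchmark: each thread registers one processor, then repeatedly takes the
+// write lock with probability `writes`% (incrementing a shared counter) and
+// the read lock otherwise. The counters are cross-checked afterwards to make
+// sure every write-locked increment was observed.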
+void run(unsigned nthread, double duration, unsigned writes) {
+	assert(writes < 100);
+
+	// List being tested
+	processor_list list = {};
+
+	// Barrier for synchronization
+	barrier_t barrier(nthread + 1);
+
+	// Data to check everything is OK
+	size_t write_committed = 0ul;
+	std::atomic_size_t lock_cnt_write = { 0ul };
+	std::atomic_size_t lock_cnt_read  = { 0ul };
+
+	// Flag to signal termination
+	std::atomic_bool done = { false };
+
+	std::thread * threads[nthread];
+	unsigned i = 1;
+	for(auto & t : threads) {
+		t = new std::thread([&done, &list, &barrier, &write_committed, &lock_cnt_write, &lock_cnt_read, writes](unsigned tid) {
+			Random rand(tid + rdtscl());
+			processor proc;
+			proc.id = list.doregister(&proc);
+			size_t writes_cnt = 0;
+			size_t reads_cnt = 0;
+
+			affinity(tid);
+
+			barrier.wait(tid);
+
+			while(__builtin_expect(!done, true)) {
+				if ((rand.next() % 100) < writes) {
+					auto n = list.write_lock();
+					write_committed++;
+					writes_cnt++;
+					assert(writes_cnt < -2ul);
+					list.write_unlock(n);
+				}
+				else {
+					list.read_lock(proc.id);
+					reads_cnt++;
+					assert(reads_cnt < -2ul);
+					list.read_unlock(proc.id);
+				}
+			}
+
+			barrier.wait(tid);
+
+			auto p = list.unregister(proc.id);
+			assert(&proc == p);
+			lock_cnt_write += writes_cnt;
+			lock_cnt_read  += reads_cnt;
+		}, i++);
+	}
+
+	auto before = Clock::now();
+	barrier.wait(0);
+
+	while(true) {
+		usleep(1000);
+		auto now = Clock::now();
+		duration_t durr = now - before;
+		if( durr.count() > duration ) {
+			done = true;
+			break;
+		}
+	}
+
+	barrier.wait(0);
+	auto after = Clock::now();
+	duration_t durr = after - before;
+	duration = durr.count();
+
+	for(auto t : threads) {
+		t->join();
+		delete t;
+	}
+
+	assert(write_committed == lock_cnt_write);
+
+	size_t ops_sec = size_t(double(lock_cnt_read + lock_cnt_write) / duration);
+	size_t ops_thread = ops_sec / nthread;
+	double dur_nano = duration_cast<std::nano>(1.0);
+
+	std::cout << "Duration      : " << duration << "s\n";
+	std::cout << "Total ops     : " << (lock_cnt_read + lock_cnt_write) << "(" << lock_cnt_read << "r, " << lock_cnt_write << "w)\n";
+	std::cout << "Ops/sec       : " << ops_sec << "\n";
+	std::cout << "Ops/sec/thread: " << ops_thread << "\n";
+	std::cout << "ns/Op         : " << ( dur_nano / ops_thread )<< "\n";
+}
+
+void usage(char * argv[]) {
+	std::cerr << argv[0] << ": [DURATION (FLOAT:SEC)] [NTHREADS] [%WRITES]" << std::endl;
+	std::exit(1);
+}
+
+int main(int argc, char * argv[]) {
+
+	double duration   = 5.0;
+	unsigned nthreads = 2;
+	unsigned writes   = 0;
+
+	std::cout.imbue(std::locale(""));
+
+	switch (argc)
+	{
+	case 4:
+		writes = std::stoul(argv[3]);
+		if( writes >= 100 ) {
+			std::cerr << "Writes must be a percentage below 100, was " << argv[3] << "(" << writes << ")" << std::endl;
+			usage(argv);
+		}
+		[[fallthrough]];
+	case 3:
+		nthreads = std::stoul(argv[2]);
+		[[fallthrough]];
+	case 2:
+		duration = std::stod(argv[1]);
+		if( duration <= 0.0 ) {
+			std::cerr << "Duration must be positive, was " << argv[1] << "(" << duration << ")" << std::endl;
+			usage(argv);
+		}
+		[[fallthrough]];
+	case 1:
+		break;
+	default:
+		usage(argv);
+		break;
+	}
+
+	check_cache_line_size();
+
+	std::cout << "Running " << nthreads << " threads for " << duration << " seconds with " << writes << "% writes" << std::endl;
+	run(nthreads, duration, writes);
+
+	return 0;
+}
Index: doc/theses/thierry_delisle_PhD/code/processor_list_good.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/processor_list_good.cpp	(revision b2a37b045b22a2a00392f054434cf6298517bec0)
+++ doc/theses/thierry_delisle_PhD/code/processor_list_good.cpp	(revision b2a37b045b22a2a00392f054434cf6298517bec0)
@@ -0,0 +1,228 @@
+#include "processor_list.hpp"
+
+#include <iostream>
+#include <string>
+#include <thread>
+
+#include "utils.hpp"
+
+unsigned num() {
+	return 0x1000000;
+}
+
+//-------------------
+
+struct processor {
+	unsigned id;
+};
+
+// Stage 1
+// Make sure that early registration works correctly:
+// registration takes a different code path when registering a processor
+// pushes the processor count past the highest value seen so far.
+void stage1(unsigned nthread, unsigned repeats) {
+	const int n = repeats;
+	const int nproc = 10;
+
+	// List being tested
+	processor_list list;
+
+	// Barrier for synchronization
+	barrier_t barrier(nthread + 1);
+
+	// Seen values to detect duplication
+	std::atomic<processor *> ids[nthread * nproc];
+	for(auto & i : ids) {
+		i = nullptr;
+	}
+
+	// Can't pass VLA to lambda
+	std::atomic<processor *> * idsp = ids;
+
+	// Threads which will run the code
+	std::thread * threads[nthread];
+	unsigned i = 1;
+	for(auto & t : threads) {
+		// Each thread will try to register a processor then add it to the
+		// list of registered processors
+		t = new std::thread([&list, &barrier, idsp, n](unsigned tid){
+			processor proc[nproc];
+			for(int i = 0; i < n; i++) {
+				for(auto & p : proc) {
+					// Register the thread
+					p.id = list.doregister(&p);
+				}
+
+				for(auto & p : proc) {
+					// Make sure no one got this id before
+					processor * prev = idsp[p.id].exchange(&p);
+					assert(nullptr == prev);
+
+					// Make sure the id is still consistent
+					assert(&p == list.get(p.id));
+				}
+
+				// wait for round to finish
+				barrier.wait(tid);
+
+				// wait for reset
+				barrier.wait(tid);
+			}
+		}, i++);
+	}
+
+	for(int i = 0; i < n; i++) {
+		//Wait for round to finish
+		barrier.wait(0);
+
+		// Reset list
+		list.reset();
+
+		std::cout << i << "\r";
+
+		// Reset seen values
+		for(auto & i : ids) {
+			i = nullptr;
+		}
+
+		// Start next round
+		barrier.wait(0);
+	}
+
+	for(auto t : threads) {
+		t->join();
+		delete t;
+	}
+}
+
+// Stage 2
+// Check that once churning starts, registration is still consistent.
+void stage2(unsigned nthread, unsigned repeats) {
+	// List being tested
+	processor_list list;
+
+	// Threads which will run the code
+	std::thread * threads[nthread];
+	unsigned i = 1;
+	for(auto & t : threads) {
+		// Each thread will try to register a few processors and
+		// unregister them, making sure that the registration is
+		// consistent
+		t = new std::thread([&list, repeats](unsigned tid){
+			processor procs[10];
+			for(unsigned i = 0; i < repeats; i++) {
+				// register the procs and note the id
+				for(auto & p : procs) {
+					p.id = list.doregister(&p);
+				}
+
+				if(1 == tid) std::cout << i << "\r";
+
+				// check the id is still consistent
+				for(const auto & p : procs) {
+					assert(&p == list.get(p.id));
+				}
+
+				// unregister and check the id is consistent
+				for(const auto & p : procs) {
+					assert(&p == list.unregister(p.id));
+				}
+			}
+		}, i++);
+	}
+
+	for(auto t : threads) {
+		t->join();
+		delete t;
+	}
+}
+
+// Stage 3
+// Check that the reader writer lock works.
+void stage3(unsigned nthread, unsigned repeats) {
+	// List being tested
+	processor_list list;
+
+	size_t before = 0;
+
+	std::unique_ptr<size_t> after( new size_t(0) );
+
+	std::atomic<bool> done ( false );
+
+	// Threads which will run the code
+	std::thread * threads[nthread];
+	unsigned i = 1;
+	for(auto & t : threads) {
+		// Each thread will try to register a few processors and
+		// unregister them, making sure that the registration is
+		// consistent
+		t = new std::thread([&list, repeats, &before, &after, &done](unsigned tid){
+			Random rng(tid);
+			processor proc;
+			proc.id = list.doregister(&proc);
+			while(!done) {
+
+				if( (rng.next() % 100) == 0 ) {
+					auto r = list.write_lock();
+
+					auto b = before++;
+
+					std::cout << b << "\r";
+
+					(*after)++;
+
+					if(b >= repeats) done = true;
+
+					list.write_unlock(r);
+				}
+				else {
+					list.read_lock(proc.id);
+					assert(before == *after);
+					list.read_unlock(proc.id);
+				}
+
+			}
+
+			list.unregister(proc.id);
+		}, i++);
+	}
+
+	for(auto t : threads) {
+		t->join();
+		delete t;
+	}
+}
+
+int main(int argc, char * argv[]) {
+
+	unsigned nthreads = 1;
+	if( argc >= 3 ) {
+		size_t idx;
+		nthreads = std::stoul(argv[2], &idx);
+		assert('\0' == argv[2][idx]);
+	}
+
+	unsigned repeats = 100;
+	if( argc >= 2 ) {
+		size_t idx;
+		repeats = std::stoul(argv[1], &idx);
+		assert('\0' == argv[1][idx]);
+	}
+
+	check_cache_line_size();
+
+	std::cout << "Running " << repeats << " repetitions on " << nthreads << " threads" << std::endl;
+	std::cout << "Checking registration - early" << std::endl;
+	stage1(nthreads, repeats);
+	std::cout << "Done                         " << std::endl;
+
+	std::cout << "Checking registration - churn" << std::endl;
+	stage2(nthreads, repeats);
+	std::cout << "Done                         " << std::endl;
+
+	std::cout << "Checking RW lock             " << std::endl;
+	stage3(nthreads, repeats);
+	std::cout << "Done                         " << std::endl;
+
+	return 0;
+}
Index: doc/theses/thierry_delisle_PhD/code/relaxed_list.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/relaxed_list.cpp	(revision b2a37b045b22a2a00392f054434cf6298517bec0)
+++ doc/theses/thierry_delisle_PhD/code/relaxed_list.cpp	(revision b2a37b045b22a2a00392f054434cf6298517bec0)
@@ -0,0 +1,234 @@
+#include "relaxed_list.hpp"
+
+#include <array>
+#include <iostream>
+#include <locale>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include <unistd.h>
+#include <sys/sysinfo.h>
+
+#include "utils.hpp"
+
+struct Node {
+	static std::atomic_size_t creates;
+	static std::atomic_size_t destroys;
+
+	_LinksFields_t<Node> _links;
+
+	int value;
+	Node(int value): value(value) {
+		creates++;
+	}
+
+	~Node() {
+		destroys++;
+	}
+};
+
+std::atomic_size_t Node::creates  = { 0 };
+std::atomic_size_t Node::destroys = { 0 };
+
+static const constexpr int nodes_per_threads = 128;
+
+bool enable_stats = false;
+
+__attribute__((aligned(64))) thread_local pick_stat local_pick;
+
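+// Churn benchmark: each thread owns 128 node slots and, per iteration, picks
+// one at random; a full slot is pushed onto the shared list, an empty one is
+// refilled by a pop. CRCs of pushed and popped values are compared at the end
+// to check that no node was lost or duplicated.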
+void run(unsigned nthread, double duration) {
+	// List being tested
+	relaxed_list<Node> list = { nthread * 2 };
+
+	// Barrier for synchronization
+	barrier_t barrier(nthread + 1);
+
+	// Data to check everything is OK
+	struct {
+		std::atomic_size_t in  = { 0 };
+		std::atomic_size_t out = { 0 };
+		std::atomic_size_t empty = { 0 };
+		std::atomic_size_t crc_in  = { 0 };
+		std::atomic_size_t crc_out = { 0 };
+		std::atomic_size_t pick_at = { 0 };
+		std::atomic_size_t pick_su = { 0 };
+	} global;
+
+	// Flag to signal termination
+	std::atomic_bool done  = { false };
+
+	// Prep nodes
+	std::cout << "Initializing" << std::endl;
+	std::vector<Node *> all_nodes[nthread];
+	for(auto & nodes : all_nodes) {
+		Random rand(rdtscl());
+		nodes.resize(nodes_per_threads);
+		for(auto & node : nodes) {
+			node = new Node(rand.next() % 100);
+		}
+
+		for(int i = 0; i < 10; i++) {
+			int idx = rand.next() % nodes_per_threads;
+			if (auto node = nodes[idx]) {
+				global.crc_in += node->value;
+				list.push(node);
+				nodes[idx] = nullptr;
+			}
+		}
+	}
+
+	enable_stats = true;
+
+	std::thread * threads[nthread];
+	unsigned i = 1;
+	for(auto & t : threads) {
+		auto & my_nodes = all_nodes[i - 1];
+		t = new std::thread([&done, &list, &barrier, &global, &my_nodes](unsigned tid) {
+			Random rand(tid + rdtscl());
+
+			size_t local_in  = 0;
+			size_t local_out = 0;
+			size_t local_empty = 0;
+			size_t local_crc_in  = 0;
+			size_t local_crc_out = 0;
+
+			affinity(tid);
+
+			barrier.wait(tid);
+
+			// EXPERIMENT START
+
+			while(__builtin_expect(!done, true)) {
+				int idx = rand.next() % nodes_per_threads;
+				if (auto node = my_nodes[idx]) {
+					local_crc_in += node->value;
+					list.push(node);
+					my_nodes[idx] = nullptr;
+					local_in++;
+				}
+				else if(auto node = list.pop2()) {
+					local_crc_out += node->value;
+					my_nodes[idx] = node;
+					local_out++;
+				}
+				else {
+					local_empty++;
+				}
+			}
+
+			// EXPERIMENT END
+
+			barrier.wait(tid);
+
+			global.in    += local_in;
+			global.out   += local_out;
+			global.empty += local_empty;
+
+			for(auto node : my_nodes) {
+				delete node;
+			}
+
+			global.crc_in  += local_crc_in;
+			global.crc_out += local_crc_out;
+
+			global.pick_at += local_pick.attempt;
+			global.pick_su += local_pick.success;
+		}, i++);
+	}
+
+	std::cout << "Starting" << std::endl;
+	auto before = Clock::now();
+	barrier.wait(0);
+
+	while(true) {
+		usleep(1000);
+		auto now = Clock::now();
+		duration_t durr = now - before;
+		if( durr.count() > duration ) {
+			done = true;
+			break;
+		}
+	}
+
+	barrier.wait(0);
+	auto after = Clock::now();
+	duration_t durr = after - before;
+	duration = durr.count();
+	std::cout << "Closing down" << std::endl;
+
+	for(auto t : threads) {
+		t->join();
+		delete t;
+	}
+
+	enable_stats = false;
+
+	while(auto node = list.pop()) {
+		global.crc_out += node->value;
+		delete node;
+	}
+
+	assert(Node::creates == Node::destroys);
+	assert(global.crc_in == global.crc_out);
+
+	std::cout << "Done" << std::endl;
+
+	size_t ops = global.in + global.out;
+	size_t ops_sec = size_t(double(ops) / duration);
+	size_t ops_thread = ops_sec / nthread;
+	auto dur_nano = duration_cast<std::nano>(1.0);
+
+	std::cout << "Duration      : " << duration << "s\n";
+	std::cout << "Total ops     : " << ops << "(" << global.in << "i, " << global.out << "o, " << global.empty << "e)\n";
+	std::cout << "Ops/sec       : " << ops_sec << "\n";
+	std::cout << "Ops/sec/thread: " << ops_thread << "\n";
+	std::cout << "ns/Op         : " << ( dur_nano / ops_thread )<< "\n";
+	std::cout << "Pick %        : " << (100.0 * double(global.pick_su) / global.pick_at) << "(" << global.pick_su << " / " << global.pick_at << ")\n";
+}
+
+void usage(char * argv[]) {
+	std::cerr << argv[0] << ": [DURATION (FLOAT:SEC)] [NTHREADS]" << std::endl;
+	std::exit(1);
+}
+
+int main(int argc, char * argv[]) {
+
+	double duration   = 5.0;
+	unsigned nthreads = 2;
+
+	std::cout.imbue(std::locale(""));
+
+	switch (argc)
+	{
+	case 3:
+		nthreads = std::stoul(argv[2]);
+		[[fallthrough]];
+	case 2:
+		duration = std::stod(argv[1]);
+		if( duration <= 0.0 ) {
+			std::cerr << "Duration must be positive, was " << argv[1] << "(" << duration << ")" << std::endl;
+			usage(argv);
+		}
+		[[fallthrough]];
+	case 1:
+		break;
+	default:
+		usage(argv);
+		break;
+	}
+
+	check_cache_line_size();
+
+	std::cout << "Running " << nthreads << " threads for " << duration << " seconds" << std::endl;
+	run(nthreads, duration);
+
+	return 0;
+}
+
+template<>
+thread_local Random relaxed_list<Node>::rng_g = { int(rdtscl()) };
Index: doc/theses/thierry_delisle_PhD/code/relaxed_list.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/relaxed_list.hpp	(revision b2a37b045b22a2a00392f054434cf6298517bec0)
+++ doc/theses/thierry_delisle_PhD/code/relaxed_list.hpp	(revision b2a37b045b22a2a00392f054434cf6298517bec0)
@@ -0,0 +1,313 @@
+#pragma once
+
+#include <cstdint>
+#include <iostream>
+#include <memory>
+#include <mutex>
+#include <type_traits>
+
+#include "assert.hpp"
+#include "utils.hpp"
+
+extern bool enable_stats;
+
+
+struct pick_stat {
+	size_t attempt = 0;
+	size_t success = 0;
+};
+
+extern __attribute__((aligned(64))) thread_local pick_stat local_pick;
+
+template<typename node_t>
+struct _LinksFields_t {
+	node_t * prev = nullptr;
+	node_t * next = nullptr;
+	unsigned long long ts = 0;
+};
+
+struct spinlock_t {
+	std::atomic_bool ll = { false };
+
+	inline void lock() {
+		while( __builtin_expect(ll.exchange(true),false) ) {
+			while(ll.load(std::memory_order_relaxed))
+				asm volatile("pause");
+		}
+	}
+
+	inline void unlock() {
+		ll.store(false, std::memory_order_release);
+	}
+};
+
+template<typename node_t>
+class relaxed_list {
+	static_assert(std::is_same<decltype(node_t::_links), _LinksFields_t<node_t>>::value, "Node must have a links field");
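+
+	// Push picks a list uniformly at random and appends under that list's
+	// lock. Pop picks two random non-empty lists ("power of two choices")
+	// and dequeues from the one whose head carries the oldest rdtscl()
+	// timestamp, approximating global FIFO order without a central lock.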
+
+
+public:
+	relaxed_list(unsigned numLists)
+		: numLists(numLists)
+		, lists(new intrusive_queue_t[numLists])
+	  	, numNonEmpty(0)
+	{}
+
+    	void push(node_t * node) {
+		int i = rng_g.next() % numLists;
+		lists[i].push(node, numNonEmpty);
+    	}
+
+	node_t * pop() {
+		int i = pickRandomly(-1);
+		int j = pickRandomly(i);
+
+		if(i == -1) {
+			return nullptr;
+		}
+
+		auto guard = lock(i, j);
+		auto & list = best(i, j);
+		return list.pop(numNonEmpty);
+    	}
+
+	node_t * pop2() {
+		int i = pickRandomly(-1);
+		int j = pickRandomly(i);
+
+		if(i == -1) {
+			return nullptr;
+		}
+
+		auto & list = best2(i, j);
+		return list.pop2(numNonEmpty);
+    	}
+
+private:
+
+	class intrusive_queue_t {
+	public:
+		typedef spinlock_t lock_t;
+
+		friend class relaxed_list<node_t>;
+
+	private:
+		struct sentinel_t {
+			_LinksFields_t<node_t> _links;
+		};
+
+		struct stat {
+			size_t push = 0;
+			size_t pop  = 0;
+		};
+
+		__attribute__((aligned(64))) lock_t lock;
+		__attribute__((aligned(64))) bool empty;
+		stat s;
+		sentinel_t before;
+		sentinel_t after;
+
+		static constexpr auto fields_offset = offsetof( node_t, _links );
+	public:
+		intrusive_queue_t()
+			: empty(true)
+			, before{{ nullptr, tail() }}
+			, after {{ head(), nullptr }}
+		{
+			assert((reinterpret_cast<uintptr_t>( head() ) + fields_offset) == reinterpret_cast<uintptr_t>(&before));
+			assert((reinterpret_cast<uintptr_t>( tail() ) + fields_offset) == reinterpret_cast<uintptr_t>(&after ));
+			assert(head()->_links.prev == nullptr);
+			assert(head()->_links.next == tail() );
+			assert(tail()->_links.next == nullptr);
+			assert(tail()->_links.prev == head() );
+		}
+
+		~intrusive_queue_t() {
+			std::cout << " Push: " << s.push << "\tPop: " << s.pop << "\t(this: " << this << ")" << std::endl;
+		}
+
+		node_t * head() const {
+			return reinterpret_cast<node_t *>(
+				reinterpret_cast<uintptr_t>( &before ) - fields_offset
+			);
+		}
+
+		node_t * tail() const {
+			return reinterpret_cast<node_t *>(
+				reinterpret_cast<uintptr_t>( &after ) - fields_offset
+			);
+		}
+
+		void push(node_t * node, volatile int & nonEmpty) {
+			node_t * tail = this->tail();
+			std::lock_guard<lock_t> guard(lock);
+			node->_links.ts = rdtscl();
+
+			node_t * prev = tail->_links.prev;
+			// assertf(node->_links.ts >= prev->_links.ts,
+			// "New node has smaller timestamp: %llu < %llu", node->_links.ts, prev->_links.ts);
+			node->_links.next = tail;
+			node->_links.prev = prev;
+			prev->_links.next = node;
+			tail->_links.prev = node;
+			if(empty) {
+				__atomic_fetch_add(&nonEmpty, 1, __ATOMIC_SEQ_CST);
+				empty = false;
+			}
+			if(enable_stats) s.push++;
+		}
+
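+		// Note: the caller must already hold this queue's lock;
+		// relaxed_list::pop acquires it through queue_guard.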
+		node_t * pop(volatile int & nonEmpty) {
+			node_t * head = this->head();
+			node_t * tail = this->tail();
+
+			node_t * node = head->_links.next;
+			node_t * next = node->_links.next;
+			if(node == tail) return nullptr;
+
+			head->_links.next = next;
+			next->_links.prev = head;
+
+			if(next == tail) {
+				empty = true;
+				__atomic_fetch_sub(&nonEmpty, 1, __ATOMIC_SEQ_CST);
+			}
+			if(enable_stats) s.pop++;
+			return node;
+		}
+
+		node_t * pop2(volatile int & nonEmpty) {
+			node_t * head = this->head();
+			node_t * tail = this->tail();
+
+			std::lock_guard<lock_t> guard(lock);
+			node_t * node = head->_links.next;
+			node_t * next = node->_links.next;
+			if(node == tail) return nullptr;
+
+			head->_links.next = next;
+			next->_links.prev = head;
+
+			if(next == tail) {
+				empty = true;
+				__atomic_fetch_sub(&nonEmpty, 1, __ATOMIC_SEQ_CST);
+			}
+			if(enable_stats) s.pop++;
+			return node;
+		}
+
+		static intrusive_queue_t & best(intrusive_queue_t & lhs, intrusive_queue_t & rhs) {
+			bool lhs_empty = lhs.empty;
+			bool rhs_empty = rhs.empty;
+
+			if(lhs_empty && rhs_empty) return lhs;
+			if(!lhs_empty && rhs_empty) return lhs;
+			if(lhs_empty && !rhs_empty) return rhs;
+			node_t * lhs_head = lhs.head()->_links.next;
+			node_t * rhs_head = rhs.head()->_links.next;
+
+			assert(lhs_head != lhs.tail());
+			assert(rhs_head != rhs.tail());
+
+			// Prefer the list whose head was pushed earliest (oldest timestamp).
+			if(lhs_head->_links.ts < rhs_head->_links.ts) {
+				return lhs;
+			} else {
+				return rhs;
+			}
+		}
+
+		static intrusive_queue_t & best2(intrusive_queue_t & lhs, intrusive_queue_t & rhs) {
+			node_t * lhs_head = lhs.head()->_links.next;
+			node_t * rhs_head = rhs.head()->_links.next;
+
+			bool lhs_empty = lhs_head == lhs.tail();
+			bool rhs_empty = rhs_head == rhs.tail();
+			if(lhs_empty && rhs_empty) return lhs;
+			if(!lhs_empty && rhs_empty) return lhs;
+			if(lhs_empty && !rhs_empty) return rhs;
+
+			// Prefer the list whose head was pushed earliest (oldest timestamp).
+			if(lhs_head->_links.ts < rhs_head->_links.ts) {
+				return lhs;
+			} else {
+				return rhs;
+			}
+		}
+	};
+
+
+private:
+
+	static thread_local Random rng_g;
+    	__attribute__((aligned(64))) const unsigned numLists;
+	std::unique_ptr<intrusive_queue_t []> lists;
+	__attribute__((aligned(64))) volatile int numNonEmpty; // number of non-empty lists
+
+private:
+	int pickRandomly(const int avoid) {
+		int j;
+		do {
+			local_pick.attempt++;
+			j = rng_g.next() % numLists;
+			if (numNonEmpty < 1 + (avoid != -1)) return -1;
+		} while (j == avoid || lists[j].empty);
+		local_pick.success++;
+		return j;
+	}
+
+private:
+
+	struct queue_guard {
+		intrusive_queue_t * lists;
+		int i, j;
+
+		queue_guard(intrusive_queue_t * lists, int i, int j)
+			: lists(lists), i(i), j(j)
+		{
+			if(i >= 0) lists[i].lock.lock();
+			if(j >= 0) lists[j].lock.lock();
+		}
+
+		queue_guard(const queue_guard &) = delete;
+
+		// Transfer ownership on move so the locks are released exactly once.
+		queue_guard(queue_guard && other)
+			: lists(other.lists), i(other.i), j(other.j)
+		{
+			other.lists = nullptr;
+		}
+
+		~queue_guard() {
+			if(!lists) return;
+			if(i >= 0) lists[i].lock.unlock();
+			if(j >= 0) lists[j].lock.unlock();
+		}
+	};
+
+	auto lock(int i, int j) {
+		assert(i >= 0);
+		assert(i != j);
+		if(j < i) return queue_guard(lists.get(), j, i);
+		return queue_guard(lists.get(), i, j);
+	}
+
+	intrusive_queue_t & best(int i, int j) {
+		assert(i != -1);
+		if(j == -1) return lists[i];
+		return intrusive_queue_t::best(lists[i], lists[j]);
+	}
+
+	intrusive_queue_t & best2(int i, int j) {
+		assert(i != -1);
+		if(j == -1) return lists[i];
+		return intrusive_queue_t::best2(lists[i], lists[j]);
+	}
+};
Index: doc/theses/thierry_delisle_PhD/code/utils.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/utils.hpp	(revision b2a37b045b22a2a00392f054434cf6298517bec0)
+++ doc/theses/thierry_delisle_PhD/code/utils.hpp	(revision b2a37b045b22a2a00392f054434cf6298517bec0)
@@ -0,0 +1,111 @@
+#pragma once
+
+#include <cassert>
+#include <cstddef>
+#include <atomic>
+#include <chrono>
+#include <fstream>
+#include <iostream>
+
+#include <sched.h>
+#include <unistd.h>
+#include <sys/sysinfo.h>
+
+// Barrier from
+class barrier_t {
+public:
+	barrier_t(size_t total)
+		: waiting(0)
+		, total(total)
+	{}
+
+	// Spin until all `total` participants of the current round have arrived.
+	void wait(unsigned) {
+		size_t target = waiting++;
+		// Round up to the next multiple of total: the end of this round.
+		target = (target - (target % total)) + total;
+		while(waiting < target)
+			asm volatile("pause");
+
+		assert(waiting < (1ul << 60)); // guard against counter wrap-around
+	}
+
+private:
+	std::atomic<size_t> waiting;
+	size_t total;
+};
+
+class Random {
+private:
+	unsigned int seed;
+public:
+	Random(int seed) {
+		this->seed = seed;
+	}
+
+	/** xorshift step; returns the next pseudorandom unsigned int.
+	    The seed must be non-zero, otherwise the sequence is stuck at 0. **/
+	unsigned int next() {
+		seed ^= seed << 6;
+		seed ^= seed >> 21;
+		seed ^= seed << 7;
+		return seed;
+	}
+};
+
+static inline unsigned long long rdtscl(void) {
+    unsigned int lo, hi;
+    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
+    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
+}
+
+inline void affinity(int tid) {
+	static int cpus = get_nprocs();
+
+	cpu_set_t  mask;
+	CPU_ZERO(&mask);
+	int cpu = cpus - tid;  // Set CPU affinity to tid, starting from the end
+	CPU_SET(cpu, &mask);
+	auto result = sched_setaffinity(0, sizeof(mask), &mask);
+	if(result != 0) {
+		std::cerr << "Affinity set failed with " << result<< ", wanted " << cpu << std::endl;
+	}
+}
+
+static constexpr std::size_t cache_line_size = 64;
+inline void check_cache_line_size() {
+	std::cout << "Checking cache line size" << std::endl;
+	const std::string cache_file = "/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size";
+
+	std::ifstream ifs (cache_file, std::ifstream::in);
+
+	if(!ifs.good()) {
+		std::cerr << "Could not open file to check cache line size" << std::endl;
+		std::cerr << "Looking for: " << cache_file << std::endl;
+		std::exit(2);
+	}
+
+	size_t got;
+	ifs >> got;
+
+	ifs.close();
+
+	if(cache_line_size != got) {
+		std::cerr << "Cache line has incorrect size : " << got << std::endl;
+		std::exit(1);
+	}
+
+	std::cout << "Done" << std::endl;
+}
+
+using Clock = std::chrono::high_resolution_clock;
+using duration_t = std::chrono::duration<double>;
+using std::chrono::nanoseconds;
+
+template<typename Ratio, typename T>
+T duration_cast(T seconds) {
+	return std::chrono::duration_cast<std::chrono::duration<T, Ratio>>(std::chrono::duration<T>(seconds)).count();
+}
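+
+// Example: duration_cast<std::nano>(1.0) yields 1.0e9, the number of
+// nanoseconds in one second (used by the benchmarks to report ns per op).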
