Index: benchmark/Cargo.toml.in
===================================================================
--- benchmark/Cargo.toml.in	(revision 7b1f6d46fc5a0667747e03864d0195d693c13f3b)
+++ benchmark/Cargo.toml.in	(revision 7b1f6d46fc5a0667747e03864d0195d693c13f3b)
@@ -0,0 +1,25 @@
+[package]
+name = "cforall-rust-bench"
+version = "@VERSION@"
+authors = ["Cforall"]
+edition = "2018"
+
+[[bin]]
+name = "cycle-tokio"
+path = "@abs_srcdir@/readyQ/cycle.rs"
+
+[[bin]]
+name = "locality-tokio"
+path = "@abs_srcdir@/readyQ/locality.rs"
+
+[features]
+sync = ["tokio/sync"]
+time = ["tokio/time"]
+
+[dependencies]
+clap = "2.33"
+isatty = "0.1"
+num-format = "0.4.0"
+rand = "0.8" # pinned: a "*" requirement accepts any future major release and makes builds unreproducible
+tokio = { version = "0.3.0", features = ["full"] }
+
Index: benchmark/Makefile.am
===================================================================
--- benchmark/Makefile.am	(revision 276a94d733611091c32960a63117feee9a0aae97)
+++ benchmark/Makefile.am	(revision 7b1f6d46fc5a0667747e03864d0195d693c13f3b)
@@ -522,2 +522,8 @@
 size-cfa$(EXEEXT):
 	$(BENCH_V_CFA)$(CFACOMPILE) $(srcdir)/size/size.cfa
+
+## =========================================================================================================
+
+%-tokio$(EXEEXT): $(srcdir)/readyQ/%.rs $(srcdir)/bench.rs
+	cd $(builddir) && cargo build --release
+	cp $(builddir)/target/release/$(basename $@) $@
Index: benchmark/bench.rs
===================================================================
--- benchmark/bench.rs	(revision 7b1f6d46fc5a0667747e03864d0195d693c13f3b)
+++ benchmark/bench.rs	(revision 7b1f6d46fc5a0667747e03864d0195d693c13f3b)
@@ -0,0 +1,70 @@
+use std::io::{self, Write};
+use std::sync::atomic::{AtomicU64, AtomicBool, Ordering};
+use std::time::{Instant,Duration};
+
+use clap::{Arg, ArgMatches};
+use isatty::stdout_isatty;
+
+use tokio::time;
+
+
+// ==================================================
+pub fn args<'a, 'b>() -> [Arg<'a, 'b>; 4] {[ // The four command-line options every benchmark takes; meant to be passed to clap's `App::args`
+	Arg::with_name("duration")  .short("d").long("duration")  .takes_value(true).default_value("5").help("Duration of the experiments in seconds"),
+	Arg::with_name("iterations").short("i").long("iterations").takes_value(true).conflicts_with("duration").help("Number of iterations of the experiments"), // no default: its presence is what selects iteration mode
+	Arg::with_name("nthreads")  .short("t").long("nthreads")  .takes_value(true).default_value("1").help("Number of threads to use"),
+	Arg::with_name("nprocs")    .short("p").long("nprocs")    .takes_value(true).default_value("1").help("Number of processors to use")
+]}
+
+pub struct BenchData { // Run-control state for one benchmark execution
+	pub clock_mode: bool, // true: run for `duration` seconds; false: run for `stop_count` iterations
+	pub stop: AtomicBool, // set to true by `wait()` once the experiment is over
+	pub stop_count: u64, // iteration target (u64::MAX when `clock_mode` is set)
+	pub duration: f64, // experiment length in seconds (-1.0 when `clock_mode` is unset)
+	pub threads_left: AtomicU64, // starts at the `nthreads` given to `new`; in iteration mode `wait()` returns once it reads 0
+	is_tty: bool, // result of `stdout_isatty()`; enables the live elapsed-time display in `wait()`
+}
+
+impl BenchData {
+	pub fn new(options: ArgMatches, nthreads: usize) -> BenchData { // builds the run state from parsed options; `nthreads` seeds `threads_left`
+		let (clock_mode, stop_count, duration) = if options.is_present("iterations") {
+			(false, // iteration mode: `duration` is unused and set to -1.0
+			options.value_of("iterations").unwrap().parse::<u64>().unwrap(),
+			-1.0)
+		} else {
+			(true, // clock mode: `stop_count` is unused and set to u64::MAX
+			std::u64::MAX,
+			options.value_of("duration").unwrap().parse::<f64>().ok().filter(|d| d.is_finite() && *d >= 0.0).expect("duration must be a finite, non-negative number of seconds"))
+		};
+
+		BenchData{
+			clock_mode,
+			stop: AtomicBool::new(false),
+			stop_count,
+			duration,
+			threads_left: AtomicU64::new(nthreads as u64),
+			is_tty: stdout_isatty(),
+		}
+	}
+	/// Polls every 100ms until the run is over, then sets `stop` and returns the time elapsed since `start`.
+	pub async fn wait(&self, start: &Instant) -> Duration{
+		loop {
+			time::sleep(Duration::from_micros(100000)).await;
+			let delta = start.elapsed();
+			if self.is_tty {
+				print!(" {:.1}\r", delta.as_secs_f32());
+				io::stdout().flush().unwrap();
+			}
+			if self.clock_mode && delta.as_secs_f64() >= self.duration { // compare as f64: `Duration::from_secs_f64` panics on out-of-range values
+				break;
+			}
+			else if !self.clock_mode && self.threads_left.load(Ordering::Relaxed) == 0 {
+				break;
+			}
+		}
+
+		self.stop.store(true, Ordering::SeqCst);
+		start.elapsed()
+	}
+}
+
Index: benchmark/readyQ/bench.go
===================================================================
--- benchmark/readyQ/bench.go	(revision 276a94d733611091c32960a63117feee9a0aae97)
+++ benchmark/readyQ/bench.go	(revision 7b1f6d46fc5a0667747e03864d0195d693c13f3b)
@@ -5,6 +5,8 @@
 	"flag"
 	"fmt"
+	"log"
 	"os"
 	"runtime"
+	"runtime/pprof"
 	"sync/atomic"
 	"time"
@@ -43,9 +45,10 @@
 }
 
-func bench_init() {
+func bench_init() func() {
 	nprocsOpt := flag.Int("p", 1, "The number of processors")
 	nthreadsOpt := flag.Int("t", 1, "The number of threads")
 	durationOpt := flag.Float64("d", 0, "Duration of the experiment in seconds")
 	stopOpt := flag.Uint64("i", 0, "Duration of the experiment in iterations")
+	cpuprofile := flag.String("cpuprofile", "", "write cpu profile to file")
 
 	flag.Parse()
@@ -72,3 +75,17 @@
 
 	runtime.GOMAXPROCS(nprocs)
+
+	if (*cpuprofile) != "" {
+		f, err := os.Create(*cpuprofile)
+		if err != nil {
+		    log.Fatal(err)
+		}
+		pprof.StartCPUProfile(f)
+	}
+
+	return func() {
+		if (*cpuprofile) != "" {
+			pprof.StopCPUProfile()
+		}
+	}
 }
Index: benchmark/readyQ/cycle.rs
===================================================================
--- benchmark/readyQ/cycle.rs	(revision 276a94d733611091c32960a63117feee9a0aae97)
+++ benchmark/readyQ/cycle.rs	(revision 7b1f6d46fc5a0667747e03864d0195d693c13f3b)
@@ -1,67 +1,16 @@
-#[cfg(any(
-	feature = "sync time rt-threaded",
-  ))]
-
-extern crate tokio;
-
-use std::io::{self, Write};
 use std::sync::Arc;
-use std::sync::atomic::{AtomicU64, AtomicBool,Ordering};
-use std::time::{Instant,Duration};
+use std::sync::atomic::Ordering;
+use std::time::Instant;
 
 use tokio::runtime::Builder;
 use tokio::sync;
-use tokio::time;
 
-extern crate isatty;
-use isatty::stdout_isatty;
-
-extern crate num_format;
+use clap::{Arg, App};
 use num_format::{Locale, ToFormattedString};
 
-extern crate clap;
-use clap::{Arg, App};
+#[path = "../bench.rs"]
+mod bench;
 
-use std::cell::UnsafeCell;
-use std::mem::MaybeUninit;
-use std::ops;
-
-pub struct InitializeCell<T> {
-    inner: UnsafeCell<MaybeUninit<T>>,
-}
-
-unsafe impl<T> Sync for InitializeCell<T> {}
-
-impl<T> InitializeCell<T> {
-    pub const unsafe fn new_uninitialized() -> InitializeCell<T> {
-	  InitializeCell {
-		inner: UnsafeCell::new(MaybeUninit::uninit()),
-	  }
-    }
-    pub const fn new(init: T) -> InitializeCell<T> {
-	  InitializeCell {
-		inner: UnsafeCell::new(MaybeUninit::new(init)),
-	  }
-    }
-    pub unsafe fn init(&self, init: T) {
-	  (*self.inner.get()) = MaybeUninit::new(init);
-    }
-}
-
-impl<T> ops::Deref for InitializeCell<T> {
-    type Target = T;
-    fn deref(&self) -> &T {
-	  unsafe {
-		&*(*self.inner.get()).as_ptr()
-	  }
-    }
-}
-
-static CLOCK_MODE: InitializeCell<bool> = unsafe { InitializeCell::new_uninitialized() };
-static STOP_COUNT: InitializeCell<u64>  = unsafe { InitializeCell::new_uninitialized() };
-static DURATION: InitializeCell<f64>    = unsafe { InitializeCell::new_uninitialized() };
-static STOP         : AtomicBool = AtomicBool::new(false);
-static THREADS_LEFT : AtomicU64  = AtomicU64 ::new(10);
-
+// ==================================================
 struct Partner {
 	sem: sync::Semaphore,
@@ -69,5 +18,5 @@
 }
 
-async fn partner_main(result: sync::oneshot::Sender<u64>, idx: usize, others: Arc<Vec<Arc<Partner>>> ) {
+async fn partner_main(idx: usize, others: Arc<Vec<Arc<Partner>>>, exp: Arc<bench::BenchData> ) -> u64 {
 	let this = &others[idx];
 	let mut count:u64 = 0;
@@ -77,47 +26,16 @@
 		count += 1;
 
-		if  *CLOCK_MODE && STOP.load(Ordering::Relaxed) { break; }
-		if !*CLOCK_MODE && count >= *STOP_COUNT { break; }
+		if  exp.clock_mode && exp.stop.load(Ordering::Relaxed) { break; }
+		if !exp.clock_mode && count >= exp.stop_count { break; }
 	}
 
-	THREADS_LEFT.fetch_sub(1, Ordering::SeqCst);
-	result.send( count ).unwrap();
+	exp.threads_left.fetch_sub(1, Ordering::SeqCst);
+	count
 }
 
-fn prep(nthreads: usize, tthreads: usize) -> Vec<Arc<Partner>> {
-	let mut thddata = Vec::with_capacity(tthreads);
-	for i in 0..tthreads {
-		let pi = (i + nthreads) % tthreads;
-		thddata.push(Arc::new(Partner{
-			sem: sync::Semaphore::new(0),
-			next: pi,
-		}));
-	}
-	return thddata;
-}
-
-async fn wait(start: &Instant, is_tty: bool) {
-	loop {
-		time::sleep(Duration::from_micros(100000)).await;
-		let delta = start.elapsed();
-		if is_tty {
-			print!(" {:.1}\r", delta.as_secs_f32());
-			io::stdout().flush().unwrap();
-		}
-		if *CLOCK_MODE && delta >= Duration::from_secs_f64(*DURATION)  {
-			break;
-		}
-		else if !*CLOCK_MODE && THREADS_LEFT.load(Ordering::Relaxed) == 0 {
-			break;
-		}
-	}
-}
-
+// ==================================================
 fn main() {
 	let options = App::new("Cycle Tokio")
-		.arg(Arg::with_name("duration")  .short("d").long("duration")  .takes_value(true).default_value("5").help("Duration of the experiments in seconds"))
-		.arg(Arg::with_name("iterations").short("i").long("iterations").takes_value(true).conflicts_with("duration").help("Number of iterations of the experiments"))
-		.arg(Arg::with_name("nthreads")  .short("t").long("nthreads")  .takes_value(true).default_value("1").help("Number of threads to use"))
-		.arg(Arg::with_name("nprocs")    .short("p").long("nprocs")    .takes_value(true).default_value("1").help("Number of processors to use"))
+		.args(&bench::args())
 		.arg(Arg::with_name("ringsize")  .short("r").long("ringsize")  .takes_value(true).default_value("1").help("Number of threads in a cycle"))
 		.get_matches();
@@ -127,24 +45,19 @@
 	let nprocs    = options.value_of("nprocs").unwrap().parse::<usize>().unwrap();
 
-	if options.is_present("iterations") {
-		unsafe{
-			CLOCK_MODE.init( false );
-			STOP_COUNT.init( options.value_of("iterations").unwrap().parse::<u64>().unwrap() );
-		}
-	}
-	else {
-		unsafe{
-			CLOCK_MODE.init(true);
-			DURATION  .init(options.value_of("duration").unwrap().parse::<f64>().unwrap());
-		}
-	}
+	let tthreads = nthreads * ring_size;
+	let exp = Arc::new(bench::BenchData::new(options, tthreads));
 
 	let s = (1000000 as u64).to_formatted_string(&Locale::en);
 	assert_eq!(&s, "1,000,000");
 
-
-	let tthreads = nthreads * ring_size;
-	THREADS_LEFT.store(tthreads as u64, Ordering::SeqCst);
-	let thddata = Arc::new(prep(nthreads, tthreads));
+	let thddata : Arc<Vec<Arc<Partner>>> = Arc::new(
+		(0..tthreads).map(|i| {
+			let pi = (i + nthreads) % tthreads;
+			Arc::new(Partner{
+				sem: sync::Semaphore::new(0),
+				next: pi,
+			})
+		}).collect()
+	);
 
 	let mut global_counter :u64 = 0;
@@ -157,35 +70,25 @@
 
 	runtime.block_on(async {
-		let mut result  : Vec<sync::oneshot::Receiver::<u64>> = Vec::with_capacity(tthreads);
-		{
-			let mut threads = Vec::with_capacity(tthreads);
-			for i in 0..tthreads {
-				let (s, r) = sync::oneshot::channel::<u64>();
-				result.push(r);
-				threads.push(tokio::spawn(partner_main(s, i, thddata.clone())));
-			}
-			println!("Starting");
+		let threads: Vec<_> = (0..tthreads).map(|i| {
+			tokio::spawn(partner_main(i, thddata.clone(), exp.clone()))
+		}).collect();
+		println!("Starting");
 
-			let is_tty = stdout_isatty();
-			let start = Instant::now();
+		let start = Instant::now();
 
-			for i in 0..nthreads {
-				thddata[i].sem.add_permits(1);
-			}
+		for i in 0..nthreads {
+			thddata[i].sem.add_permits(1);
+		}
 
-			wait(&start, is_tty).await;
+		duration = exp.wait(&start).await;
 
-			STOP.store(true, Ordering::SeqCst);
-			duration = start.elapsed();
+		println!("\nDone");
 
-			println!("\nDone");
+		for i in 0..tthreads {
+			thddata[i].sem.add_permits(1);
+		}
 
-			for i in 0..tthreads {
-				thddata[i].sem.add_permits(1);
-			}
-
-			for _ in 0..tthreads {
-				global_counter += result.pop().unwrap().await.unwrap();
-			}
+		for t in threads {
+			global_counter += t.await.unwrap();
 		}
 	});
Index: benchmark/readyQ/locality.go
===================================================================
--- benchmark/readyQ/locality.go	(revision 276a94d733611091c32960a63117feee9a0aae97)
+++ benchmark/readyQ/locality.go	(revision 7b1f6d46fc5a0667747e03864d0195d693c13f3b)
@@ -215,6 +215,6 @@
 func main() {
 	// Benchmark specific command line arguments
-	work_sizeOpt := flag.Uint64("w", 2    , "Number of words (uint64) per threads")
-	countOpt     := flag.Uint64("c", 2    , "Number of words (uint64) to touch")
+	work_sizeOpt := flag.Uint64("w", 2    , "Size of the array for each threads, in words (64bit)")
+	countOpt     := flag.Uint64("c", 2    , "Number of words to touch when working (random pick, cells can be picked more than once)")
 	shareOpt     := flag.Bool  ("s", false, "Pass the work data to the next thread when blocking")
 
@@ -266,10 +266,10 @@
 
 	// Join and accumulate results
-	global_result := NewResult()
+	results := NewResult()
 	for i := 0; i < nthreads; i++ {
 		r := <- result
-		global_result.count += r.count
-		global_result.gmigs += r.gmigs
-		global_result.dmigs += r.dmigs
+		results.count += r.count
+		results.gmigs += r.gmigs
+		results.dmigs += r.dmigs
 	}
 
@@ -280,12 +280,12 @@
 	p.Printf("Number of threads      : %d\n", nthreads);
 	p.Printf("Work size (64bit words): %d\n", size);
-	p.Printf("Total Operations(ops)  : %15d\n", global_result.count)
-	p.Printf("Total G Migrations     : %15d\n", global_result.gmigs)
-	p.Printf("Total D Migrations     : %15d\n", global_result.dmigs)
-	p.Printf("Ops per second         : %18.2f\n", float64(global_result.count) / delta.Seconds())
-	p.Printf("ns per ops             : %18.2f\n", float64(delta.Nanoseconds()) / float64(global_result.count))
-	p.Printf("Ops per threads        : %15d\n", global_result.count / uint64(nthreads))
-	p.Printf("Ops per procs          : %15d\n", global_result.count / uint64(nprocs))
-	p.Printf("Ops/sec/procs          : %18.2f\n", (float64(global_result.count) / float64(nprocs)) / delta.Seconds())
-	p.Printf("ns per ops/procs       : %18.2f\n", float64(delta.Nanoseconds()) / (float64(global_result.count) / float64(nprocs)))
-}
+	p.Printf("Total Operations(ops)  : %15d\n", results.count)
+	p.Printf("Total G Migrations     : %15d\n", results.gmigs)
+	p.Printf("Total D Migrations     : %15d\n", results.dmigs)
+	p.Printf("Ops per second         : %18.2f\n", float64(results.count) / delta.Seconds())
+	p.Printf("ns per ops             : %18.2f\n", float64(delta.Nanoseconds()) / float64(results.count))
+	p.Printf("Ops per threads        : %15d\n", results.count / uint64(nthreads))
+	p.Printf("Ops per procs          : %15d\n", results.count / uint64(nprocs))
+	p.Printf("Ops/sec/procs          : %18.2f\n", (float64(results.count) / float64(nprocs)) / delta.Seconds())
+	p.Printf("ns per ops/procs       : %18.2f\n", float64(delta.Nanoseconds()) / (float64(results.count) / float64(nprocs)))
+}
Index: benchmark/readyQ/locality.rs
===================================================================
--- benchmark/readyQ/locality.rs	(revision 7b1f6d46fc5a0667747e03864d0195d693c13f3b)
+++ benchmark/readyQ/locality.rs	(revision 7b1f6d46fc5a0667747e03864d0195d693c13f3b)
@@ -0,0 +1,336 @@
+use std::ptr;
+use std::sync::Arc;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::time::Instant;
+use std::thread::{self, ThreadId};
+
+use tokio::runtime::Builder;
+use tokio::sync;
+
+use clap::{App, Arg};
+use num_format::{Locale, ToFormattedString};
+use rand::Rng;
+
+#[path = "../bench.rs"]
+mod bench;
+
+// ==================================================
+struct MyData { // Work array plus the id of the thread that last touched it
+	data: Vec<u64>, // cells incremented by `access`
+	ttid: ThreadId, // thread id recorded by `new`, then updated by `moved`
+	_id: usize,
+}
+
+impl MyData {
+	fn new(id: usize, size: usize) -> MyData { // zero-filled array of `size` words, tagged with the calling thread's id
+		MyData {
+			data: vec![0; size],
+			ttid: thread::current().id(),
+			_id: id,
+		}
+	}
+
+	fn moved(&mut self, ttid: ThreadId) -> u64 { // returns 1 and records `ttid` if it differs from the stored id, otherwise 0
+		if self.ttid == ttid {
+			return 0;
+		}
+		self.ttid = ttid;
+		return 1;
+	}
+
+	fn access(&mut self, idx: usize) { // increments cell `idx` wrapped into range; panics if `data` is empty (remainder by zero)
+		let l = self.data.len();
+		self.data[idx % l] += 1;
+	}
+}
+
+struct MyDataPtr { // Wrapper around a raw `*mut MyData` so the pointer can be given a `Send` impl
+	ptr: *mut MyData,
+}
+
+unsafe impl std::marker::Send for MyDataPtr{} // SAFETY(review): nothing here enforces exclusive access to the pointee; presumably each pointer is used by one task at a time — TODO confirm
+
+// ==================================================
+struct MyCtx { // Per-task context; its address is what gets stored in a `MySpot` while the task waits
+	s: sync::Semaphore, // the task blocks on this in `MySpot::put`; another task wakes it with `add_permits(1)`
+	d: MyDataPtr, // data pointer returned by `put` after wake-up; the waker overwrites it when sharing
+	ttid: ThreadId, // thread id recorded by `new`, then updated by `moved`
+	_id: usize,
+}
+
+impl MyCtx {
+	fn new(d: *mut MyData, id: usize) -> MyCtx { // semaphore starts with 0 permits, so the first `acquire` blocks until a wake-up
+		MyCtx {
+			s: sync::Semaphore::new(0),
+			d: MyDataPtr{ ptr: d },
+			ttid: thread::current().id(),
+			_id: id
+		}
+	}
+
+	fn moved(&mut self, ttid: ThreadId) -> u64 { // returns 1 and records `ttid` if it differs from the stored id, otherwise 0
+		if self.ttid == ttid {
+			return 0;
+		}
+		self.ttid = ttid;
+		return 1;
+	}
+}
+// ==================================================
+// Atomic object where a single thread can wait
+// May exchange data
+struct MySpot {
+	ptr: AtomicU64, // 0 = empty seat, 1 = closed, any other value = address of the waiting task's `MyCtx`
+	_id: usize,
+}
+
+impl MySpot {
+	fn new(id: usize) -> MySpot { // starts as an empty seat (ptr == 0)
+		let r = MySpot{
+			ptr: AtomicU64::new(0),
+			_id: id,
+		};
+		r
+	}
+
+	fn one() -> u64 { // sentinel stored in `ptr` once the seat is closed
+		1
+	}
+
+	// Main handshake of the code: single seat, the first thread arriving waits;
+	// the next thread unblocks the current one and blocks in its place.
+	// If share == true, the arriving thread hands its data to the one it wakes.
+	// Returns (data to continue with, false), or (null, true) if the seat is closed.
+	async fn put( &self, ctx: &mut MyCtx, data: MyDataPtr, share: bool) -> (*mut MyData, bool) {
+		{
+			// Attempt to CAS our context into the seat
+			let raw = {
+				loop {
+					let expected = self.ptr.load(Ordering::Relaxed);
+					if expected == MySpot::one() { // Seat is closed, return
+						let r: *const MyData = ptr::null();
+						return (r as *mut MyData, true);
+					}
+					let swapped = self.ptr.compare_exchange(expected, ctx as *mut MyCtx as u64, Ordering::SeqCst, Ordering::SeqCst); // replaces the deprecated compare_and_swap
+					if swapped.is_ok() {
+						break expected;// We got the seat
+					}
+				}
+			};
+
+			// If we aren't the first in, wake someone
+			if raw != 0 {
+				let val: &mut MyCtx = unsafe{ &mut *(raw as *mut MyCtx) };
+				// If we are sharing, give them our data
+				if share {
+					val.d.ptr = data.ptr;
+				}
+
+				// Wake them up
+				val.s.add_permits(1);
+			}
+		}
+
+		// Block once on the seat
+		ctx.s.acquire().await.forget();
+
+		// Someone woke us up, get the new data
+		let ret = ctx.d.ptr;
+		return (ret, false);
+	}
+
+	// Shutdown the spot
+	// Wake current thread and mark seat as closed
+	fn release(&self) {
+		let val = self.ptr.swap(MySpot::one(), Ordering::SeqCst);
+		if val == 0 {
+			return
+		}
+
+		// Someone was there, release them
+		unsafe{ &mut *(val as *mut MyCtx) }.s.add_permits(1)
+	}
+}
+
+// ==================================================
+// Result counters returned by each task and summed in main
+struct Result {
+	count: u64, // completed loop iterations (printed as "Total Operations")
+	gmigs: u64, // sum of `MyCtx::moved` results (printed as "Total G Migrations")
+	dmigs: u64, // sum of `MyData::moved` results (printed as "Total D Migrations")
+}
+
+impl Result {
+	fn new() -> Result { // all counters start at zero
+		Result{ count: 0, gmigs: 0, dmigs: 0}
+	}
+
+	fn add(&mut self, o: Result) { // field-wise accumulation of `o` into `self`
+		self.count += o.count;
+		self.gmigs += o.gmigs;
+		self.dmigs += o.dmigs;
+	}
+}
+
+// ==================================================
+// Random number generator: one xorshift step (shifts 13, 7, 17) on caller-owned state; a zero state stays zero
+fn __xorshift64( state: &mut u64 ) -> usize {
+	let mut x = *state;
+	x ^= x << 13;
+	x ^= x >> 7;
+	x ^= x << 17;
+	*state = x; // the new state is also the returned value
+	x as usize
+}
+
+// ==================================================
+// Do some work by accessing 'cnt' cells in the array (indices come from `state`, so a cell can be hit more than once)
+fn work(data: &mut MyData, cnt: u64, state : &mut u64) {
+	for _ in 0..cnt {
+		data.access(__xorshift64(state))
+	}
+}
+
+async fn local(start: Arc<sync::Barrier>, idata: MyDataPtr, spots : Arc<Vec<MySpot>>, cnt: u64, share: bool, id: usize, exp: Arc<bench::BenchData>) -> Result{ // body of one benchmark task; returns its counters
+	let mut state = rand::thread_rng().gen::<u64>(); // seed for this task's private xorshift generator
+	let mut data = idata;
+	let mut ctx = MyCtx::new(data.ptr, id);
+	let _size = unsafe{ &mut *data.ptr }.data.len(); // only used by the debug assertion below
+
+	// Prepare results
+	let mut r = Result::new();
+
+	// Wait for start
+	start.wait().await;
+
+	// Main loop
+	loop {
+		// Touch our current data, write to invalidate remote cache lines
+		work(unsafe{ &mut *data.ptr }, cnt, &mut state);
+
+		// Wait on a random spot
+		let i = (__xorshift64(&mut state) as usize) % spots.len();
+		let closed = {
+			let (d, c) = spots[i].put(&mut ctx, data, share).await;
+			data = MyDataPtr{ ptr: d }; // null when the spot was closed; checked right below before any use
+			c
+		};
+
+		// Check if the experiment is over
+		if closed { break }                                                   // yes, spot was closed
+		if  exp.clock_mode && exp.stop.load(Ordering::Relaxed) { break }  // yes, time's up
+		if !exp.clock_mode && r.count >= exp.stop_count { break }         // yes, iterations reached
+
+		assert_ne!(data.ptr as *const MyData, ptr::null());
+
+		let d = unsafe{ &mut *data.ptr };
+
+		// Check everything is consistent
+		debug_assert_eq!(d.data.len(), _size);
+
+		// write down progress and check migrations
+		let ttid = thread::current().id();
+		r.count += 1;
+		r.gmigs += ctx .moved(ttid);
+		r.dmigs += d.moved(ttid);
+	}
+
+	exp.threads_left.fetch_sub(1, Ordering::SeqCst); // every task decrements once on exit
+	r
+}
+
+
+// ==================================================
+fn main() { // parses options, runs `nthreads` tasks on `nprocs` tokio workers, then prints the statistics
+	let options = App::new("Locality Tokio")
+		.args(&bench::args())
+		.arg(Arg::with_name("size") .short("w").long("worksize").takes_value(true).default_value("2").help("Size of the array for each threads, in words (64bit)"))
+		.arg(Arg::with_name("work") .short("c").long("workcnt") .takes_value(true).default_value("2").help("Number of words to touch when working (random pick, cells can be picked more than once)"))
+		.arg(Arg::with_name("share").short("s").long("share")   .takes_value(true).default_value("true").help("Pass the work data to the next thread when blocking"))
+		.get_matches();
+
+	let nthreads   = options.value_of("nthreads").unwrap().parse::<usize>().unwrap();
+	let nprocs     = options.value_of("nprocs").unwrap().parse::<usize>().unwrap();
+	let wsize      = options.value_of("size").unwrap().parse::<usize>().unwrap();
+	let wcnt       = options.value_of("work").unwrap().parse::<u64>().unwrap();
+	let share      = options.value_of("share").unwrap().parse::<bool>().unwrap();
+
+	// Check params
+	if ! (nthreads > nprocs) {
+		panic!("Must have more threads than procs");
+	}
+
+	let s = (1000000 as u64).to_formatted_string(&Locale::en);
+	assert_eq!(&s, "1,000,000");
+
+	let exp = Arc::new(bench::BenchData::new(options, nprocs)); // NOTE(review): threads_left is seeded with nprocs although nthreads tasks decrement it — confirm this is intended
+	let mut results = Result::new();
+
+	let mut elapsed : std::time::Duration = std::time::Duration::from_secs(0);
+
+	let mut data_arrays : Vec<MyData> = (0..nthreads).map(|i| MyData::new(i, wsize)).rev().collect();
+	let spots : Arc<Vec<MySpot>> = Arc::new((0..nthreads - nprocs).map(|i| MySpot::new(i)).rev().collect());
+	let barr = Arc::new(sync::Barrier::new(nthreads + 1)); // +1: the coordinating future below also waits on it
+
+	let runtime = Builder::new_multi_thread()
+		.worker_threads(nprocs)
+		.enable_all()
+		.build()
+		.unwrap();
+
+	runtime.block_on(async
+		{
+			let thrds: Vec<_> = (0..nthreads).map(|i| {
+				debug_assert!(i < data_arrays.len());
+
+				runtime.spawn(local(
+					barr.clone(),
+					MyDataPtr{ ptr: &mut data_arrays[i] },
+					spots.clone(),
+					wcnt,
+					share,
+					i,
+					exp.clone(),
+				))
+			}).collect();
+
+
+			println!("Starting");
+
+			let start = Instant::now();
+			barr.wait().await;
+
+			elapsed = exp.wait(&start).await;
+
+			println!("\nDone");
+
+			// release all the blocked threads
+			for s in &* spots {
+				s.release();
+			}
+
+			println!("Threads released");
+
+			// Join and accumulate results
+			for t in thrds {
+				results.add( t.await.unwrap() );
+			}
+
+			println!("Threads joined");
+		}
+	);
+
+	println!("Duration (ms)          : {}", (elapsed.as_millis()).to_formatted_string(&Locale::en));
+	println!("Number of processors   : {}", (nprocs).to_formatted_string(&Locale::en));
+	println!("Number of threads      : {}", (nthreads).to_formatted_string(&Locale::en));
+	println!("Work size (64bit words): {}", (wsize).to_formatted_string(&Locale::en));
+	println!("Total Operations(ops)  : {:>15}", (results.count).to_formatted_string(&Locale::en));
+	println!("Total G Migrations     : {:>15}", (results.gmigs).to_formatted_string(&Locale::en));
+	println!("Total D Migrations     : {:>15}", (results.dmigs).to_formatted_string(&Locale::en));
+	println!("Ops per second         : {:>15}", (((results.count as f64) / elapsed.as_secs_f64()) as u64).to_formatted_string(&Locale::en)); // as_secs_f64: as_secs() drops the sub-second part
+	println!("ns per ops             : {:>15}", ((elapsed.as_nanos() as f64 / results.count as f64) as u64).to_formatted_string(&Locale::en));
+	println!("Ops per threads        : {:>15}", (results.count / nthreads as u64).to_formatted_string(&Locale::en));
+	println!("Ops per procs          : {:>15}", (results.count / nprocs as u64).to_formatted_string(&Locale::en));
+	println!("Ops/sec/procs          : {:>15}", ((((results.count as f64) / nprocs as f64) / elapsed.as_secs_f64()) as u64).to_formatted_string(&Locale::en)); // as_secs_f64: as_secs() drops the sub-second part
+	println!("ns per ops/procs       : {:>15}", ((elapsed.as_nanos() as f64 / (results.count as f64 / nprocs as f64)) as u64).to_formatted_string(&Locale::en));
+}
Index: configure.ac
===================================================================
--- configure.ac	(revision 276a94d733611091c32960a63117feee9a0aae97)
+++ configure.ac	(revision 7b1f6d46fc5a0667747e03864d0195d693c13f3b)
@@ -295,6 +295,6 @@
 # Some of our makefile don't need to be distributed
 AM_CONDITIONAL([CFORALL_DISTRIBUTE], [test -e $TOP_SRCDIR/autogen.sh])
-AM_COND_IF([CFORALL_DISTRIBUTE],
-	[AC_CONFIG_FILES([
+AM_COND_IF([CFORALL_DISTRIBUTE], [
+	AC_CONFIG_FILES([
 		longrun_tests/Makefile
 		benchmark/Makefile
@@ -302,5 +302,8 @@
 		tools/Makefile
 		tools/prettyprinter/Makefile
-		])])
+	])
+
+	AC_CONFIG_FILES([benchmark/Cargo.toml])
+])
 
 AC_CONFIG_LINKS([tests/test.py:tests/test.py])
