Index: benchmark/Cargo.toml.in
===================================================================
--- benchmark/Cargo.toml.in	(revision ce9f9d49a2f8e613169e8a0385563024beea05b3)
+++ benchmark/Cargo.toml.in	(revision 237df76db82f504ef572c9139c965eb7aa1bfcfb)
@@ -6,10 +6,18 @@
 
 [[bin]]
-name = "cycle-tokio"
+name = "rdq-cycle-tokio"
 path = "@abs_srcdir@/readyQ/cycle.rs"
 
 [[bin]]
-name = "locality-tokio"
+name = "rdq-locality-tokio"
 path = "@abs_srcdir@/readyQ/locality.rs"
+
+[[bin]]
+name = "rdq-transfer-tokio"
+path = "@abs_srcdir@/readyQ/transfer.rs"
+
+[[bin]]
+name = "rdq-yield-tokio"
+path = "@abs_srcdir@/readyQ/yield.rs"
 
 [features]
Index: benchmark/Makefile.am
===================================================================
--- benchmark/Makefile.am	(revision ce9f9d49a2f8e613169e8a0385563024beea05b3)
+++ benchmark/Makefile.am	(revision 237df76db82f504ef572c9139c965eb7aa1bfcfb)
@@ -21,5 +21,5 @@
 include $(top_srcdir)/tools/build/cfa.make
 
-AM_CFLAGS = -O2 -Wall -Wextra -I$(srcdir) -lrt -pthread # -Werror
+AM_CFLAGS = -O3 -Wall -Wextra -I$(srcdir) -lrt -pthread # -Werror
 AM_CFAFLAGS = -quiet -nodebug
 AM_UPPFLAGS = -quiet -nodebug -multi -std=c++14
@@ -612,5 +612,54 @@
 ## =========================================================================================================
 
-%-tokio$(EXEEXT): $(srcdir)/readyQ/%.rs $(srcdir)/bench.rs
-	cd $(builddir) && cargo build --release
-	cp $(builddir)/target/release/$(basename $@) $@
+RDQBENCHES = \
+	rdq-cycle-cfa \
+	rdq-cycle-tokio \
+	rdq-cycle-go \
+	rdq-cycle-fibre \
+	rdq-yield-cfa \
+	rdq-yield-tokio \
+	rdq-yield-go \
+	rdq-yield-fibre \
+	rdq-locality-cfa \
+	rdq-locality-tokio \
+	rdq-locality-go \
+	rdq-locality-fibre \
+	rdq-transfer-cfa \
+	rdq-transfer-tokio \
+	rdq-transfer-go \
+	rdq-transfer-fibre
+
+rdq-benches:
+	+make $(RDQBENCHES)
+
+clean-rdq-benches:
+	rm -rf $(RDQBENCHES) $(builddir)/target go.mod
+
+rdq-%-tokio$(EXEEXT): $(builddir)/target/release/rdq-%-tokio$(EXEEXT)
+	$(BENCH_V_RUSTC)cp $(builddir)/target/release/$(basename $@) $@
+
+$(builddir)/target/release/rdq-%-tokio$(EXEEXT): $(srcdir)/readyQ/%.rs $(srcdir)/bench.rs
+	$(BENCH_V_RUSTC)cd $(builddir) && cargo build --release
+
+rdq-%-cfa$(EXEEXT): $(srcdir)/readyQ/%.cfa $(srcdir)/readyQ/rq_bench.hfa
+	$(BENCH_V_CFA)$(CFACOMPILE) $< -o $@
+
+go.mod:
+	touch $@
+	go mod edit -module=rdq.bench
+	go get golang.org/x/sync/semaphore
+	go get golang.org/x/text/language
+	go get golang.org/x/text/message
+
+rdq-%-go$(EXEEXT): $(srcdir)/readyQ/%.go $(srcdir)/readyQ/bench.go go.mod
+	$(BENCH_V_GOC)go build -o $@ $< $(srcdir)/readyQ/bench.go
+
+rdq-%-fibre$(EXEEXT): $(srcdir)/readyQ/%.cpp
+	$(BENCH_V_CXX)$(CXXCOMPILE) $< -o $@ -lfibre -std=c++17 $(AM_CFLAGS)
+
+# ## =========================================================================================================
+
+CLEANFILES = $(RDQBENCHES) go.mod go.sum
+
+clean-local:
+	-rm -rf target
Index: benchmark/bench.rs
===================================================================
--- benchmark/bench.rs	(revision ce9f9d49a2f8e613169e8a0385563024beea05b3)
+++ benchmark/bench.rs	(revision 237df76db82f504ef572c9139c965eb7aa1bfcfb)
@@ -1,5 +1,7 @@
 use std::io::{self, Write};
+use std::option;
 use std::sync::atomic::{AtomicU64, AtomicBool, Ordering};
 use std::time::{Instant,Duration};
+use std::u128;
 
 use clap::{Arg, ArgMatches};
@@ -27,8 +29,12 @@
 
 impl BenchData {
-	pub fn new(options: ArgMatches, nthreads: usize) -> BenchData {
+	pub fn new(options: ArgMatches, nthreads: usize, default_it: option::Option<u64>) -> BenchData {
 		let (clock_mode, stop_count, duration) = if options.is_present("iterations") {
 			(false,
 			options.value_of("iterations").unwrap().parse::<u64>().unwrap(),
+			-1.0)
+		} else if !default_it.is_none() {
+			(false,
+			default_it.unwrap(),
 			-1.0)
 		} else {
@@ -48,4 +54,5 @@
 	}
 
+	#[allow(dead_code)]
 	pub async fn wait(&self, start: &Instant) -> Duration{
 		loop {
@@ -69,2 +76,7 @@
 }
 
+// ==================================================
+pub fn _lehmer64( state: &mut u128 ) -> u64 {
+	*state = state.wrapping_mul(0xda942042e4dd58b5);
+	return (*state >> 64) as u64;
+}
Index: benchmark/readyQ/cycle.cpp
===================================================================
--- benchmark/readyQ/cycle.cpp	(revision ce9f9d49a2f8e613169e8a0385563024beea05b3)
+++ benchmark/readyQ/cycle.cpp	(revision 237df76db82f504ef572c9139c965eb7aa1bfcfb)
@@ -41,9 +41,9 @@
 			Fibre * threads[tthreads];
 			Partner thddata[tthreads];
-			for(int i = 0; i < tthreads; i++) {
+			for(unsigned i = 0; i < tthreads; i++) {
 				unsigned pi = (i + nthreads) % tthreads;
 				thddata[i].next = &thddata[pi].self;
 			}
-			for(int i = 0; i < tthreads; i++) {
+			for(unsigned i = 0; i < tthreads; i++) {
 				threads[i] = new Fibre( reinterpret_cast<void (*)(void *)>(partner_main), &thddata[i] );
 			}
@@ -53,5 +53,5 @@
 			start = timeHiRes();
 
-			for(int i = 0; i < nthreads; i++) {
+			for(unsigned i = 0; i < nthreads; i++) {
 				thddata[i].self.post();
 			}
@@ -62,5 +62,5 @@
 			printf("\nDone\n");
 
-			for(int i = 0; i < tthreads; i++) {
+			for(unsigned i = 0; i < tthreads; i++) {
 				thddata[i].self.post();
 				fibre_join( threads[i], nullptr );
Index: benchmark/readyQ/cycle.go
===================================================================
--- benchmark/readyQ/cycle.go	(revision ce9f9d49a2f8e613169e8a0385563024beea05b3)
+++ benchmark/readyQ/cycle.go	(revision 237df76db82f504ef572c9139c965eb7aa1bfcfb)
@@ -60,5 +60,5 @@
 	atomic.StoreInt32(&stop, 1)
 	end := time.Now()
-	delta := end.Sub(start)
+	duration := end.Sub(start)
 
 	fmt.Printf("\nDone\n")
@@ -74,15 +74,15 @@
 
 	p := message.NewPrinter(language.English)
-	p.Printf("Duration (ms)        : %f\n", delta.Seconds());
+	p.Printf("Duration (ms)        : %d\n", duration.Milliseconds())
 	p.Printf("Number of processors : %d\n", nprocs);
 	p.Printf("Number of threads    : %d\n", tthreads);
 	p.Printf("Cycle size (# thrds) : %d\n", ring_size);
 	p.Printf("Total Operations(ops): %15d\n", global_counter)
-	p.Printf("Ops per second       : %18.2f\n", float64(global_counter) / delta.Seconds())
-	p.Printf("ns per ops           : %18.2f\n", float64(delta.Nanoseconds()) / float64(global_counter))
+	p.Printf("Ops per second       : %18.2f\n", float64(global_counter) / duration.Seconds())
+	p.Printf("ns per ops           : %18.2f\n", float64(duration.Nanoseconds()) / float64(global_counter))
 	p.Printf("Ops per threads      : %15d\n", global_counter / uint64(tthreads))
 	p.Printf("Ops per procs        : %15d\n", global_counter / uint64(nprocs))
-	p.Printf("Ops/sec/procs        : %18.2f\n", (float64(global_counter) / float64(nprocs)) / delta.Seconds())
-	p.Printf("ns per ops/procs     : %18.2f\n", float64(delta.Nanoseconds()) / (float64(global_counter) / float64(nprocs)))
+	p.Printf("Ops/sec/procs        : %18.2f\n", (float64(global_counter) / float64(nprocs)) / duration.Seconds())
+	p.Printf("ns per ops/procs     : %18.2f\n", float64(duration.Nanoseconds()) / (float64(global_counter) / float64(nprocs)))
 
 }
Index: benchmark/readyQ/cycle.rs
===================================================================
--- benchmark/readyQ/cycle.rs	(revision ce9f9d49a2f8e613169e8a0385563024beea05b3)
+++ benchmark/readyQ/cycle.rs	(revision 237df76db82f504ef572c9139c965eb7aa1bfcfb)
@@ -46,5 +46,5 @@
 
 	let tthreads = nthreads * ring_size;
-	let exp = Arc::new(bench::BenchData::new(options, tthreads));
+	let exp = Arc::new(bench::BenchData::new(options, tthreads, None));
 
 	let s = (1000000 as u64).to_formatted_string(&Locale::en);
Index: benchmark/readyQ/locality.go
===================================================================
--- benchmark/readyQ/locality.go	(revision ce9f9d49a2f8e613169e8a0385563024beea05b3)
+++ benchmark/readyQ/locality.go	(revision 237df76db82f504ef572c9139c965eb7aa1bfcfb)
@@ -286,5 +286,5 @@
 	// Print with nice 's, i.e. 1'000'000 instead of 1000000
 	p := message.NewPrinter(language.English)
-	p.Printf("Duration (s)           : %f\n", delta.Seconds());
+	p.Printf("Duration (ms)          : %d\n", delta.Milliseconds());
 	p.Printf("Number of processors   : %d\n", nprocs);
 	p.Printf("Number of threads      : %d\n", nthreads);
Index: benchmark/readyQ/locality.rs
===================================================================
--- benchmark/readyQ/locality.rs	(revision ce9f9d49a2f8e613169e8a0385563024beea05b3)
+++ benchmark/readyQ/locality.rs	(revision 237df76db82f504ef572c9139c965eb7aa1bfcfb)
@@ -124,6 +124,6 @@
 						return (r as *mut MyData, true);
 					}
-					let got = self.ptr.compare_and_swap(expected, ctx as *mut MyCtx as u64, Ordering::SeqCst);
-					if got == expected {
+					let got = self.ptr.compare_exchange_weak(expected, ctx as *mut MyCtx as u64, Ordering::SeqCst, Ordering::SeqCst);
+					if got == Ok(expected) {
 						break expected;// We got the seat
 					}
@@ -285,5 +285,5 @@
 	assert_eq!(&s, "1,000,000");
 
-	let exp = Arc::new(bench::BenchData::new(options, nprocs));
+	let exp = Arc::new(bench::BenchData::new(options, nprocs, None));
 	let mut results = Result::new();
 
Index: benchmark/readyQ/transfer.cfa
===================================================================
--- benchmark/readyQ/transfer.cfa	(revision ce9f9d49a2f8e613169e8a0385563024beea05b3)
+++ benchmark/readyQ/transfer.cfa	(revision 237df76db82f504ef572c9139c965eb7aa1bfcfb)
@@ -39,4 +39,5 @@
 			Pause();
 			if( (timeHiRes() - start) > 5`s ) {
+				print_stats_now( bench_cluster, CFA_STATS_READY_Q | CFA_STATS_IO );
 				serr | "Programs has been blocked for more than 5 secs";
 				exit(1);
@@ -110,5 +111,5 @@
 	cfa_option opt[] = {
 		BENCH_OPT,
-		{ 'e', "exhaust", "Whether or not threads that have seen the new epoch should yield or park.", exhaust, parse_yesno}
+		{ 'e', "exhaust", "Whether or not threads that have seen the new epoch should park instead of yielding.", exhaust, parse_yesno}
 	};
 	BENCH_OPT_PARSE("cforall transition benchmark");
@@ -166,5 +167,5 @@
 	}
 
-	sout | "Duration                : " | ws(3, 3, unit(eng((end - start)`ds))) | 's';
+	sout | "Duration (ms)           : " | ws(3, 3, unit(eng((end - start)`dms)));
 	sout | "Number of processors    : " | nprocs;
 	sout | "Number of threads       : " | nthreads;
Index: benchmark/readyQ/transfer.cpp
===================================================================
--- benchmark/readyQ/transfer.cpp	(revision ce9f9d49a2f8e613169e8a0385563024beea05b3)
+++ benchmark/readyQ/transfer.cpp	(revision 237df76db82f504ef572c9139c965eb7aa1bfcfb)
@@ -173,5 +173,5 @@
 	}
 
-	std::cout << "Duration                : " << to_miliseconds(end - start) << "ms" << std::endl;
+	std::cout << "Duration (ms)           : " << to_miliseconds(end - start) << std::endl;
 	std::cout << "Number of processors    : " << nprocs << std::endl;
 	std::cout << "Number of threads       : " << nthreads << std::endl;
Index: benchmark/readyQ/transfer.go
===================================================================
--- benchmark/readyQ/transfer.go	(revision 237df76db82f504ef572c9139c965eb7aa1bfcfb)
+++ benchmark/readyQ/transfer.go	(revision 237df76db82f504ef572c9139c965eb7aa1bfcfb)
@@ -0,0 +1,223 @@
+package main
+
+import (
+	"flag"
+	"fmt"
+	"math/rand"
+	"os"
+	"runtime"
+	"sync/atomic"
+	"time"
+	"golang.org/x/text/language"
+	"golang.org/x/text/message"
+)
+
+type LeaderInfo struct {
+	id uint64
+	idx uint64
+	seed uint64
+}
+
+func __xorshift64( state * uint64 ) (uint64) {
+	x := *state
+	x ^= x << 13
+	x ^= x >> 7
+	x ^= x << 17
+	*state = x
+	return x
+}
+
+func (this * LeaderInfo) next(len uint64) {
+	n := __xorshift64( &this.seed )
+	atomic.StoreUint64( &this.id, n % len )
+}
+
+func NewLeader(size uint64) (*LeaderInfo) {
+	this := &LeaderInfo{0, 0, uint64(os.Getpid())}
+
+	r := rand.Intn(10)
+
+	for i := 0; i < r; i++ {
+		this.next( size )
+	}
+
+	return this
+}
+
+type MyThread struct {
+	id uint64
+	idx uint64
+	sem chan struct{}
+}
+
+func waitgroup(idx uint64, threads [] MyThread) {
+	start := time.Now()
+	for i := 0; i < len(threads); i++ {
+		// fmt.Fprintf(os.Stderr, "Waiting for :%d (%d)\n", threads[i].id, atomic.LoadUint64(&threads[i].idx) );
+		for atomic.LoadUint64( &threads[i].idx ) != idx {
+			// hint::spin_loop();
+			end := time.Now()
+			delta := end.Sub(start)
+			if delta.Seconds() > 5 {
+				fmt.Fprintf(os.Stderr, "Programs has been blocked for more than 5 secs")
+				os.Exit(1)
+			}
+		}
+	}
+	// debug!( "Waiting done" );
+}
+
+func wakegroup(exhaust bool, me uint64, threads [] MyThread) {
+	if !exhaust { return; }
+
+	for i := uint64(0); i < uint64(len(threads)); i++ {
+		if i != me {
+			// debug!( "Leader waking {}", i);
+			threads[i].sem <- (struct {}{})
+		}
+	}
+}
+
+func lead(exhaust bool, leader * LeaderInfo, this * MyThread, threads [] MyThread, main_sem chan struct {}) {
+	nidx := atomic.LoadUint64(&leader.idx) + 1;
+	atomic.StoreUint64(&this.idx, nidx);
+	atomic.StoreUint64(&leader.idx, nidx);
+
+	if nidx > stop_count {
+		// debug!( "Leader {} done", this.id);
+		main_sem <- (struct {}{})
+		return
+	}
+
+	// debug!( "====================\nLeader no {} : {}", nidx, this.id);
+
+	waitgroup(nidx, threads);
+
+	leader.next( uint64(len(threads)) );
+
+	wakegroup(exhaust, this.id, threads);
+
+	// debug!( "Leader no {} : {} done\n====================", nidx, this.id);
+}
+
+func waitleader(exhaust bool, leader * LeaderInfo, this * MyThread, rechecks * uint64) {
+	runtime.Gosched()
+
+	if atomic.LoadUint64(&leader.idx) == atomic.LoadUint64(&this.idx) {
+		// debug!("Waiting {} recheck", this.id);
+		*rechecks += uint64(1)
+		return
+	}
+
+	// debug!("Waiting {}", this.id);
+
+	// debug_assert!( (leader.idx.load(Ordering::Relaxed) - 1) == this.idx.load(Ordering::Relaxed) );
+	atomic.AddUint64(&this.idx, 1)
+	if exhaust {
+		// debug!("Waiting {} sem", this.id);
+		<- this.sem
+	} else {
+		// debug!("Waiting {} yield", this.id);
+		runtime.Gosched()
+	}
+
+	// debug!("Waiting {} done", this.id);
+}
+
+func transfer_main( result chan uint64, me uint64, leader * LeaderInfo, threads [] MyThread, exhaust bool, start chan struct{}, main_sem chan struct{}) {
+	// assert!( me == threads[me].id );
+
+	// debug!("Ready {}: {:p}", me, &threads[me].sem as *const sync::Semaphore);
+
+	// Wait for start
+	<- start
+
+	// debug!( "Start {}", me );
+
+	// Prepare results
+	r := uint64(0)
+
+	// Main loop
+	for true {
+		if atomic.LoadUint64(&leader.id) == me {
+			lead( exhaust, leader, &threads[me], threads, main_sem )
+			runtime.Gosched()
+		} else {
+			waitleader( exhaust, leader, &threads[me], &r )
+		}
+		if atomic.LoadUint64(&leader.idx) > stop_count { break; }
+	}
+
+	// return result
+	result <- r
+}
+
+func main() {
+	// Benchmark specific command line arguments
+	exhaustOpt := flag.Bool("e", false, "Whether or not threads that have seen the new epoch should park instead of yielding.")
+
+	// General benchmark initialization and deinitialization
+	defer bench_init()()
+
+	exhaust := *exhaustOpt;
+	if clock_mode {
+		fmt.Fprintf(os.Stderr, "Programs does not support fixed duration mode")
+		os.Exit(1)
+	}
+
+	var es string
+	if exhaust {
+		es = "with"
+	} else {
+		es = "without"
+	}
+	fmt.Printf("Running %d threads on %d processors, doing %d iterations, %s exhaustion\n", nthreads, nprocs, stop_count, es );
+
+	main_sem := make(chan struct{})
+	leader := NewLeader(uint64(nthreads))
+	barr := make(chan struct{})
+	result := make(chan uint64)
+
+	thddata := make([]MyThread, nthreads)
+	for i := range thddata {
+		thddata[i] = MyThread{ uint64(i), 0, make(chan struct {}) }
+	}
+
+	rechecks := uint64(0)
+	for i := range thddata {
+		go transfer_main(result, uint64(i), leader, thddata, exhaust, barr, main_sem)
+	}
+	fmt.Printf("Starting\n");
+
+	start := time.Now()
+	close(barr) // release barrier
+
+	<- main_sem
+
+	end := time.Now()
+	delta := end.Sub(start)
+
+	fmt.Printf("\nDone\n")
+
+	// release all the blocked threads
+	for i := range thddata {
+		close(thddata[i].sem)
+	}
+	for range thddata {
+		rechecks += <- result
+	}
+
+	p := message.NewPrinter(language.English)
+	var ws string
+	if exhaust {
+		ws = "yes"
+	} else {
+		ws = "no"
+	}
+	p.Printf("Duration (ms)           : %d\n", delta.Milliseconds() )
+	p.Printf("Number of processors    : %d\n", nprocs )
+	p.Printf("Number of threads       : %d\n", nthreads )
+	p.Printf("Total Operations(ops)   : %15d\n", stop_count )
+	p.Printf("Threads parking on wait : %s\n", ws)
+	p.Printf("Rechecking              : %d\n", rechecks )
+}
Index: benchmark/readyQ/transfer.rs
===================================================================
--- benchmark/readyQ/transfer.rs	(revision 237df76db82f504ef572c9139c965eb7aa1bfcfb)
+++ benchmark/readyQ/transfer.rs	(revision 237df76db82f504ef572c9139c965eb7aa1bfcfb)
@@ -0,0 +1,278 @@
+#[cfg(debug_assertions)]
+use std::io::{self, Write};
+
+use std::process;
+use std::option;
+use std::hint;
+use std::sync::Arc;
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::time::{Instant,Duration};
+
+use tokio::runtime::Builder;
+use tokio::sync;
+use tokio::task;
+
+use rand::Rng;
+
+use clap::{Arg, App};
+use num_format::{Locale, ToFormattedString};
+
+#[path = "../bench.rs"]
+mod bench;
+
+#[cfg(debug_assertions)]
+macro_rules! debug {
+	($x:expr) => {
+		println!( $x );
+		io::stdout().flush().unwrap();
+	};
+	($x:expr, $($more:expr),+) => {
+		println!( $x, $($more), * );
+		io::stdout().flush().unwrap();
+	};
+}
+
+#[cfg(not(debug_assertions))]
+macro_rules! debug {
+    ($x:expr   ) => { () };
+    ($x:expr, $($more:expr),+) => { () };
+}
+
+fn parse_yes_no(opt: option::Option<&str>, default: bool) -> bool {
+	match opt {
+		Some(val) => {
+			match val {
+				"yes" => true,
+				"no"  => false,
+				"maybe" | "I don't know" | "Can you repeat the question?" => {
+					eprintln!("Lines for 'Malcolm in the Middle' are not acceptable values of parameter 'exhaust'");
+					std::process::exit(1);
+				},
+				_ => {
+					eprintln!("parameter 'exhaust' must have value 'yes' or 'no', was {}", val);
+					std::process::exit(1);
+				},
+			}
+		},
+		_ => {
+			default
+		},
+	}
+}
+
+struct LeaderInfo {
+	id: AtomicUsize,
+	idx: AtomicUsize,
+	seed: u128,
+}
+
+impl LeaderInfo {
+	pub fn new(nthreads: usize) -> LeaderInfo {
+		let this = LeaderInfo{
+			id: AtomicUsize::new(nthreads),
+			idx: AtomicUsize::new(0),
+			seed: process::id() as u128
+		};
+
+		let mut rng = rand::thread_rng();
+
+		for _ in 0..rng.gen_range(0..10) {
+			this.next( nthreads );
+		}
+
+		this
+	}
+
+	pub fn next(&self, len: usize) {
+		let n = bench::_lehmer64( unsafe {
+			let r1 = &self.seed as *const u128;
+			let r2 = r1 as *mut u128;
+			&mut *r2
+		} ) as usize;
+		self.id.store( n % len , Ordering::SeqCst );
+	}
+}
+
+struct MyThread {
+	id: usize,
+	idx: AtomicUsize,
+	sem: sync::Semaphore,
+}
+
+fn waitgroup(idx: usize, threads: &Vec<Arc<MyThread>>) {
+	let start = Instant::now();
+	for t in threads {
+		debug!( "Waiting for :{} ({})", t.id, t.idx.load(Ordering::Relaxed) );
+		while t.idx.load(Ordering::Relaxed) != idx {
+			hint::spin_loop();
+			if start.elapsed() > Duration::from_secs(5) {
+				eprintln!("Programs has been blocked for more than 5 secs");
+				std::process::exit(1);
+			}
+		}
+	}
+	debug!( "Waiting done" );
+}
+
+fn wakegroup(exhaust: bool, me: usize, threads: &Vec<Arc<MyThread>>) {
+	if !exhaust { return; }
+
+	for i in 0..threads.len() {
+		if i != me {
+			debug!( "Leader waking {}", i);
+			threads[i].sem.add_permits(1);
+		}
+	}
+}
+
+fn lead(exhaust: bool, leader: &LeaderInfo, this: & MyThread, threads: &Vec<Arc<MyThread>>, main_sem: &sync::Semaphore, exp: &bench::BenchData) {
+	let nidx = leader.idx.load(Ordering::Relaxed) + 1;
+	this.idx.store(nidx, Ordering::Relaxed);
+	leader.idx.store(nidx, Ordering::Relaxed);
+
+	if nidx as u64 > exp.stop_count {
+		debug!( "Leader {} done", this.id);
+		main_sem.add_permits(1);
+		return;
+	}
+
+	debug!( "====================\nLeader no {} : {}", nidx, this.id);
+
+	waitgroup(nidx, threads);
+
+	leader.next( threads.len() );
+
+	wakegroup(exhaust, this.id, threads);
+
+	debug!( "Leader no {} : {} done\n====================", nidx, this.id);
+}
+
+async fn wait(exhaust: bool, leader: &LeaderInfo, this: &MyThread, rechecks: &mut usize) {
+	task::yield_now().await;
+
+	if leader.idx.load(Ordering::Relaxed) == this.idx.load(Ordering::Relaxed) {
+		debug!("Waiting {} recheck", this.id);
+		*rechecks += 1;
+		return;
+	}
+
+	debug!("Waiting {}", this.id);
+
+	debug_assert!( (leader.idx.load(Ordering::Relaxed) - 1) == this.idx.load(Ordering::Relaxed) );
+	this.idx.fetch_add(1, Ordering::SeqCst);
+	if exhaust {
+		debug!("Waiting {} sem", this.id);
+		this.sem.acquire().await.forget();
+	}
+	else {
+		debug!("Waiting {} yield", this.id);
+		task::yield_now().await;
+	}
+
+	debug!("Waiting {} done", this.id);
+}
+
+async fn transfer_main( me: usize, leader: Arc<LeaderInfo>, threads: Arc<Vec<Arc<MyThread>>>, exhaust: bool, start: Arc<sync::Barrier>, main_sem: Arc<sync::Semaphore>, exp: Arc<bench::BenchData>) -> usize{
+	assert!( me == threads[me].id );
+
+	debug!("Ready {}: {:p}", me, &threads[me].sem as *const sync::Semaphore);
+
+	start.wait().await;
+
+	debug!( "Start {}", me );
+
+	let mut rechecks: usize = 0;
+
+	loop {
+		if leader.id.load(Ordering::Relaxed) == me {
+			lead( exhaust, &leader, &threads[me], &threads, &main_sem, &exp );
+			task::yield_now().await;
+		}
+		else {
+			wait( exhaust, &leader, &threads[me], &mut rechecks ).await;
+		}
+		if leader.idx.load(Ordering::Relaxed) as u64 > exp.stop_count { break; }
+	}
+
+	rechecks
+}
+
+fn main() {
+	let options = App::new("Transfer Tokio")
+		.args(&bench::args())
+		.arg(Arg::with_name("exhaust")  .short("e").long("exhaust")  .takes_value(true).default_value("no").help("Whether or not threads that have seen the new epoch should park instead of yielding."))
+		.get_matches();
+
+	let exhaust  = parse_yes_no( options.value_of("exhaust"), false );
+	let nthreads = options.value_of("nthreads").unwrap().parse::<usize>().unwrap();
+	let nprocs   = options.value_of("nprocs").unwrap().parse::<usize>().unwrap();
+
+
+	let exp = Arc::new(bench::BenchData::new(options, nthreads, Some(100)));
+	if exp.clock_mode {
+		eprintln!("Programs does not support fixed duration mode");
+		std::process::exit(1);
+	}
+
+	println!("Running {} threads on {} processors, doing {} iterations, {} exhaustion", nthreads, nprocs, exp.stop_count, if exhaust { "with" } else { "without" });
+
+	let s = (1000000 as u64).to_formatted_string(&Locale::en);
+	assert_eq!(&s, "1,000,000");
+
+	let main_sem = Arc::new(sync::Semaphore::new(0));
+	let leader = Arc::new(LeaderInfo::new(nthreads));
+	let barr = Arc::new(sync::Barrier::new(nthreads + 1));
+	let thddata : Arc<Vec<Arc<MyThread>>> = Arc::new(
+		(0..nthreads).map(|i| {
+			Arc::new(MyThread{
+				id: i,
+				idx: AtomicUsize::new(0),
+				sem: sync::Semaphore::new(0),
+			})
+		}).collect()
+	);
+
+	let mut rechecks: usize = 0;
+	let mut duration : std::time::Duration = std::time::Duration::from_secs(0);
+
+	let runtime = Builder::new_multi_thread()
+		.worker_threads(nprocs)
+		.enable_all()
+		.build()
+		.unwrap();
+
+	runtime.block_on(async {
+		let threads: Vec<_> = (0..nthreads).map(|i| {
+			tokio::spawn(transfer_main(i, leader.clone(), thddata.clone(), exhaust, barr.clone(), main_sem.clone(), exp.clone()))
+		}).collect();
+		println!("Starting");
+
+		let start = Instant::now();
+
+		barr.wait().await;
+		debug!("Unlocked all");
+
+
+		main_sem.acquire().await.forget();
+
+		duration = start.elapsed();
+
+		println!("\nDone");
+
+
+		for i in 0..nthreads {
+			thddata[i].sem.add_permits(1);
+		}
+
+		for t in threads {
+			rechecks += t.await.unwrap();
+		}
+	});
+
+	println!("Duration (ms)           : {}", (duration.as_millis()).to_formatted_string(&Locale::en));
+	println!("Number of processors    : {}", (nprocs).to_formatted_string(&Locale::en));
+	println!("Number of threads       : {}", (nthreads).to_formatted_string(&Locale::en));
+	println!("Total Operations(ops)   : {:>15}", (exp.stop_count).to_formatted_string(&Locale::en));
+	println!("Threads parking on wait : {}", if exhaust { "yes" } else { "no" });
+	println!("Rechecking              : {}", rechecks );
+}
Index: benchmark/readyQ/yield.cfa
===================================================================
--- benchmark/readyQ/yield.cfa	(revision ce9f9d49a2f8e613169e8a0385563024beea05b3)
+++ benchmark/readyQ/yield.cfa	(revision 237df76db82f504ef572c9139c965eb7aa1bfcfb)
@@ -80,8 +80,10 @@
 		}
 
-		printf("Took %'ld ms\n", (end - start)`ms);
+		printf("Duration (ms)       : %'ld\n", (end - start)`dms);
+		printf("Number of processors: %'d\n", nprocs);
+		printf("Number of threads   : %'d\n", nthreads);
+		printf("Total yields        : %'15llu\n", global_counter);
 		printf("Yields per second   : %'18.2lf\n", ((double)global_counter) / (end - start)`s);
 		printf("ns per yields       : %'18.2lf\n", ((double)(end - start)`ns) / global_counter);
-		printf("Total yields        : %'15llu\n", global_counter);
 		printf("Yields per procs    : %'15llu\n", global_counter / nprocs);
 		printf("Yields/sec/procs    : %'18.2lf\n", (((double)global_counter) / nprocs) / (end - start)`s);
Index: benchmark/readyQ/yield.cpp
===================================================================
--- benchmark/readyQ/yield.cpp	(revision ce9f9d49a2f8e613169e8a0385563024beea05b3)
+++ benchmark/readyQ/yield.cpp	(revision 237df76db82f504ef572c9139c965eb7aa1bfcfb)
@@ -154,6 +154,7 @@
 
 		auto dur_nano = duration_cast<std::nano>(duration);
+		auto dur_dms  = duration_cast<std::milli>(duration);
 
-		std::cout << "Took " << duration << " s\n";
+		printf("Duration (ms)       : %'.2lf\n", dur_dms );
 		printf("Total yields        : %'15llu\n", global_counter );
 		printf("Yields per procs    : %'15llu\n", global_counter / nprocs );
Index: benchmark/readyQ/yield.go
===================================================================
--- benchmark/readyQ/yield.go	(revision 237df76db82f504ef572c9139c965eb7aa1bfcfb)
+++ benchmark/readyQ/yield.go	(revision 237df76db82f504ef572c9139c965eb7aa1bfcfb)
@@ -0,0 +1,67 @@
+package main
+
+import (
+	"fmt"
+	"runtime"
+	"sync"
+	"sync/atomic"
+	"time"
+	"golang.org/x/text/language"
+	"golang.org/x/text/message"
+)
+
+func yielder(result chan uint64, start *sync.WaitGroup) {
+	count := uint64(0)
+	start.Wait()
+	for true {
+		runtime.Gosched()
+		count += 1
+		if  clock_mode && atomic.LoadInt32(&stop) == 1 { break }
+		if !clock_mode && count >= stop_count { break }
+	}
+
+	atomic.AddInt64(&threads_left, -1);
+	result <- count
+}
+
+func main() {
+	bench_init()
+
+	threads_left = int64(nthreads)
+
+	result := make(chan uint64)
+	var wg sync.WaitGroup
+	wg.Add(1)
+
+	for i := 0; i < nthreads; i++ {
+		go yielder(result, &wg)
+	}
+	fmt.Printf("Starting\n");
+	atomic.StoreInt32(&stop, 0)
+	start := time.Now()
+	wg.Done();
+	wait(start, true);
+
+	atomic.StoreInt32(&stop, 1)
+	end := time.Now()
+	duration := end.Sub(start)
+
+	fmt.Printf("\nDone\n")
+
+	global_counter := uint64(0)
+	for i := 0; i < nthreads; i++ {
+		global_counter += <- result
+	}
+
+	p := message.NewPrinter(language.English)
+	p.Printf("Duration (ms)        : %d\n", duration.Milliseconds())
+	p.Printf("Number of processors : %d\n", nprocs)
+	p.Printf("Number of threads    : %d\n", nthreads)
+	p.Printf("Total Operations(ops): %15d\n", global_counter)
+	p.Printf("Ops per second       : %18.2f\n", float64(global_counter) / duration.Seconds())
+	p.Printf("ns per ops           : %18.2f\n", float64(duration.Nanoseconds()) / float64(global_counter))
+	p.Printf("Ops per threads      : %15d\n", global_counter / uint64(nthreads))
+	p.Printf("Ops per procs        : %15d\n", global_counter / uint64(nprocs))
+	p.Printf("Ops/sec/procs        : %18.2f\n", (float64(global_counter) / float64(nprocs)) / duration.Seconds())
+	p.Printf("ns per ops/procs     : %18.2f\n", float64(duration.Nanoseconds()) / (float64(global_counter) / float64(nprocs)))
+}
Index: benchmark/readyQ/yield.rs
===================================================================
--- benchmark/readyQ/yield.rs	(revision 237df76db82f504ef572c9139c965eb7aa1bfcfb)
+++ benchmark/readyQ/yield.rs	(revision 237df76db82f504ef572c9139c965eb7aa1bfcfb)
@@ -0,0 +1,102 @@
+use std::sync::Arc;
+use std::sync::atomic::Ordering;
+use std::time::Instant;
+
+use tokio::runtime::Builder;
+use tokio::sync;
+use tokio::task;
+
+use clap::App;
+use num_format::{Locale, ToFormattedString};
+
+#[path = "../bench.rs"]
+mod bench;
+
+// ==================================================
+struct Yielder {
+	sem: sync::Semaphore,
+}
+
+async fn yield_main(idx: usize, others: Arc<Vec<Arc<Yielder>>>, exp: Arc<bench::BenchData> ) -> u64 {
+	let this = &others[idx];
+	this.sem.acquire().await.forget();
+
+	let mut count:u64 = 0;
+	loop {
+		task::yield_now().await;
+		count += 1;
+
+		if  exp.clock_mode && exp.stop.load(Ordering::Relaxed) { break; }
+		if !exp.clock_mode && count >= exp.stop_count { break; }
+	}
+
+	exp.threads_left.fetch_sub(1, Ordering::SeqCst);
+	count
+}
+
+// ==================================================
+fn main() {
+	let options = App::new("Yield Tokio")
+		.args(&bench::args())
+		.get_matches();
+
+	let nthreads  = options.value_of("nthreads").unwrap().parse::<usize>().unwrap();
+	let nprocs    = options.value_of("nprocs").unwrap().parse::<usize>().unwrap();
+
+	let exp = Arc::new(bench::BenchData::new(options, nthreads, None));
+
+	let s = (1000000 as u64).to_formatted_string(&Locale::en);
+	assert_eq!(&s, "1,000,000");
+
+	let thddata : Arc<Vec<Arc<Yielder>>> = Arc::new(
+		(0..nthreads).map(|_i| {
+			Arc::new(Yielder{
+				sem: sync::Semaphore::new(0),
+			})
+		}).collect()
+	);
+
+	let mut global_counter :u64 = 0;
+	let mut duration : std::time::Duration = std::time::Duration::from_secs(0);
+	let runtime = Builder::new_multi_thread()
+		.worker_threads(nprocs)
+		.enable_all()
+		.build()
+		.unwrap();
+
+	runtime.block_on(async {
+		let threads: Vec<_> = (0..nthreads).map(|i| {
+			tokio::spawn(yield_main(i, thddata.clone(), exp.clone()))
+		}).collect();
+		println!("Starting");
+
+		let start = Instant::now();
+
+		for i in 0..nthreads {
+			thddata[i].sem.add_permits(1);
+		}
+
+		duration = exp.wait(&start).await;
+
+		println!("\nDone");
+
+		for i in 0..nthreads {
+			thddata[i].sem.add_permits(1);
+		}
+
+		for t in threads {
+			global_counter += t.await.unwrap();
+		}
+	});
+
+	println!("Duration (ms)       : {}", (duration.as_millis()).to_formatted_string(&Locale::en));
+	println!("Number of processors: {}", (nprocs).to_formatted_string(&Locale::en));
+	println!("Number of threads   : {}", (nthreads).to_formatted_string(&Locale::en));
+	println!("Total yields        : {:>15}", (global_counter).to_formatted_string(&Locale::en));
+	println!("Yields per second   : {:>15}", (((global_counter as f64) / duration.as_secs_f64()) as u64).to_formatted_string(&Locale::en));
+	println!("ns per yields       : {:>15}", ((duration.as_nanos() as f64 / global_counter as f64) as u64).to_formatted_string(&Locale::en));
+	println!("Yields per threads  : {:>15}", (global_counter / nthreads as u64).to_formatted_string(&Locale::en));
+	println!("Yields per procs    : {:>15}", (global_counter / nprocs as u64).to_formatted_string(&Locale::en));
+	println!("Yields/sec/procs    : {:>15}", ((((global_counter as f64) / nprocs as f64) / duration.as_secs_f64()) as u64).to_formatted_string(&Locale::en));
+	println!("ns per yields/procs : {:>15}", ((duration.as_nanos() as f64 / (global_counter as f64 / nprocs as f64)) as u64).to_formatted_string(&Locale::en));
+}
Index: benchmark/rmit.py
===================================================================
--- benchmark/rmit.py	(revision ce9f9d49a2f8e613169e8a0385563024beea05b3)
+++ benchmark/rmit.py	(revision 237df76db82f504ef572c9139c965eb7aa1bfcfb)
@@ -16,4 +16,5 @@
 import random
 import re
+import socket
 import subprocess
 import sys
@@ -95,15 +96,65 @@
 	return nopts
 
+# returns the first option with key 'opt'
+def search_option(action, opt):
+	i = 0
+	while i < len(action):
+		if action[i] == opt:
+			i += 1
+			if i != len(action):
+				return action[i]
+		i += 1
+
+	return None
+
 def actions_eta(actions):
 	time = 0
 	for a in actions:
-		i = 0
-		while i < len(a):
-			if a[i] == '-d':
-				i += 1
-				if i != len(a):
-					time += int(a[i])
-			i += 1
+		o = search_option(a, '-d')
+		if o:
+			time += int(o)
 	return time
+
+taskset_maps = None
+
+def init_taskset_maps():
+	global taskset_maps
+	known_hosts = {
+		"jax": {
+			range(  1,  25) : "48-71",
+			range( 25,  49) : "48-71,144-167",
+			range( 49,  97) : "48-95,144-191",
+			range( 97, 145) : "24-95,120-191",
+			range(145, 193) : "0-95,96-191",
+		},
+	}
+
+	if (host := socket.gethostname()) in known_hosts:
+		taskset_maps = known_hosts[host]
+		return True
+
+	print("Warning unknown host '{}', disable taskset usage".format(host), file=sys.stderr)
+	return False
+
+
+def settaskset_one(action):
+	o = search_option(action, '-p')
+	if not o:
+		return action
+	try:
+		oi = int(o)
+	except ValueError:
+		return action
+
+	# no range matched this core count; warn below and return the action unchanged
+	for key in taskset_maps:
+		if oi in key:
+			return ['taskset', '-c', taskset_maps[key], *action]
+
+	print("Warning no mapping for {} cores".format(oi), file=sys.stderr)
+	return action
+
+def settaskset(actions):
+	return [settaskset_one(a) for a in actions]
 
 if __name__ == "__main__":
@@ -115,4 +166,5 @@
 	parser.add_argument('--file', nargs='?', type=argparse.FileType('w'), default=sys.stdout)
 	parser.add_argument('--trials', help='Number of trials to run per combinaison', type=int, default=3)
+	parser.add_argument('--notaskset', help='If specified, the trial will not use taskset to match the -p option', action='store_true')
 	parser.add_argument('command', metavar='command', type=str, nargs=1, help='the command prefix to run')
 	parser.add_argument('candidates', metavar='candidates', type=str, nargs='*', help='the candidate suffix to run')
@@ -170,5 +222,19 @@
 
 	# ================================================================================
-	# Figure out all the combinations to run
+	# Fixup the different commands
+
+	# Add tasksets
+	withtaskset = False
+	if not options.notaskset and init_taskset_maps():
+		withtaskset = True
+		actions = settaskset(actions)
+
+	# ================================================================================
+	# Now that we know what to run, print it.
+	# find expected time
+	time = actions_eta(actions)
+	print("Running {} trials{}".format(len(actions), "" if time == 0 else " (expecting to take {})".format(str(datetime.timedelta(seconds=int(time)))) ))
+
+	# dry run if options ask for it
 	if options.list:
 		for a in actions:
@@ -180,8 +246,4 @@
 	# Prepare to run
 
-	# find expected time
-	time = actions_eta(actions)
-	print("Running {} trials{}".format(len(actions), "" if time == 0 else " (expecting to take {})".format(str(datetime.timedelta(seconds=int(time)))) ))
-
 	random.shuffle(actions)
 
@@ -191,5 +253,5 @@
 	first = True
 	for i, a in enumerate(actions):
-		sa = " ".join(a)
+		sa = " ".join(a[3:] if withtaskset and a[0] == 'taskset' else a)
 		if first:
 			first = False
@@ -208,7 +270,10 @@
 				match = re.search("^(.*):(.*)$", s)
 				if match:
-					fields[match.group(1).strip()] = float(match.group(2).strip().replace(',',''))
-
-		options.file.write(json.dumps([a[0][2:], sa, fields]))
+					try:
+						fields[match.group(1).strip()] = float(match.group(2).strip().replace(',',''))
+					except ValueError:
+						pass
+
+		options.file.write(json.dumps([a[3 if withtaskset and a[0] == 'taskset' else 0][2:], sa, fields]))
 		options.file.flush()
 
