Index: benchmark/readyQ/cycle.cc
===================================================================
--- benchmark/readyQ/cycle.cc	(revision 69d17484515f7e3df1c7e777114ed4c355538be3)
+++ benchmark/readyQ/cycle.cc	(revision 69d17484515f7e3df1c7e777114ed4c355538be3)
@@ -0,0 +1,155 @@
+#include "rq_bench.hpp"
+
+#include <errno.h>
+#include <pthread.h>
+#include <semaphore.h>
+#include <sched.h>
+#include <unistd.h>
+
+// Shim so the generic wait<>() helper from rq_bench.hpp can usleep
+// between progress updates when running on raw pthreads.
+struct Pthread {
+	static int usleep(useconds_t usec) {
+		return ::usleep(usec);
+	}
+};
+
+// Per-thread state: each partner blocks on its own semaphore and
+// posts the semaphore of the next thread in the ring.
+struct Partner {
+	unsigned long long count  = 0;  // hand-offs completed by this thread
+	unsigned long long blocks = 0;  // never incremented in this variant; kept for output parity
+	sem_t self;                     // semaphore this thread waits on
+	sem_t * next;                   // semaphore of the next thread in the cycle
+};
+
+// Thread body: wait for the token, pass it along, and count laps.
+// Uses the exact start-routine signature expected by pthread_create,
+// avoiding the undefined behaviour of calling through a cast
+// function-pointer type.
+static void * partner_main( void * arg ) {
+	Partner * self = static_cast<Partner *>( arg );
+	self->count = 0;
+	for(;;) {
+		// Retry on EINTR so a stray signal cannot desynchronize the ring.
+		while( sem_wait( &self->self ) != 0 ) {
+			if( errno != EINTR ) std::abort();
+		}
+		sem_post( self->next );
+		self->count ++;
+		if( clock_mode && stop) break;
+		if(!clock_mode && self->count >= stop_count) break;
+
+		// Sanity check: a token ring should never accumulate more than
+		// one pending post on any single semaphore.
+		int sval;
+		sem_getvalue(&self->self, &sval);
+		if(sval > 1) std::abort();
+		if(sval < 0) std::abort();
+	}
+
+	__atomic_fetch_add(&threads_left, -1, __ATOMIC_SEQ_CST);
+	return nullptr;
+}
+
+int main(int argc, char * argv[]) {
+	unsigned ring_size = 2;
+	option_t opt[] = {
+		BENCH_OPT,
+		{ 'r', "ringsize", "Number of threads in a cycle", ring_size }
+	};
+	BENCH_OPT_PARSE("posix cycle benchmark");
+
+	{
+		unsigned long long global_counter = 0;
+		unsigned long long global_blocks  = 0;
+		unsigned tthreads = nthreads * ring_size;
+		uint64_t start, end;
+
+		// Restrict the process to at most nprocs cpus so this variant is
+		// comparable to the runtime-based ones.
+		{
+			cpu_set_t cpuset;
+			int ret = pthread_getaffinity_np( pthread_self(), sizeof(cpuset), &cpuset );
+			if(ret != 0) std::abort();
+
+			unsigned cnt = CPU_COUNT_S(sizeof(cpuset), &cpuset);
+			if(cnt > nprocs) {
+				unsigned extras = cnt - nprocs;
+				for(int i = 0; i < CPU_SETSIZE && extras > 0; i++) {
+					if(CPU_ISSET_S(i, sizeof(cpuset), &cpuset)) {
+						CPU_CLR_S(i, sizeof(cpuset), &cpuset);
+						extras--;
+					}
+				}
+
+				ret = pthread_setaffinity_np( pthread_self(), sizeof(cpuset), &cpuset );
+				if(ret != 0) std::abort();
+			}
+		}
+
+		{
+			threads_left = tthreads;
+			pthread_t threads[tthreads];
+			Partner thddata[tthreads];
+			// Wire the semaphores into nthreads interleaved rings of
+			// ring_size threads each: successor of i is (i + nthreads) % tthreads.
+			for(unsigned i = 0; i < tthreads; i++) {
+				int ret = sem_init( &thddata[i].self, false, 0 );
+				if(ret != 0) std::abort();
+
+				unsigned pi = (i + nthreads) % tthreads;
+				thddata[i].next = &thddata[pi].self;
+			}
+			for(unsigned i = 0; i < tthreads; i++) {
+				int ret = pthread_create( &threads[i], nullptr, partner_main, &thddata[i] );
+				if(ret != 0) std::abort();
+			}
+			printf("Starting\n");
+
+			bool is_tty = isatty(STDOUT_FILENO);
+			start = getTimeNsec();
+
+			// Inject one token per ring to set them all in motion.
+			for(unsigned i = 0; i < nthreads; i++) {
+				sem_post(&thddata[i].self);
+			}
+			wait<Pthread>(start, is_tty);
+
+			stop = true;
+			end = getTimeNsec();
+			printf("\nDone\n");
+
+			// Unblock every thread so all of them observe the stop flag,
+			// then join and aggregate the per-thread counters.
+			for(unsigned i = 0; i < tthreads; i++) {
+				sem_post(&thddata[i].self);
+				int ret = pthread_join( threads[i], nullptr );
+				if(ret != 0) std::abort();
+				global_counter += thddata[i].count;
+				global_blocks  += thddata[i].blocks;
+			}
+
+			for(unsigned i = 0; i < tthreads; i++) {
+				int ret = sem_destroy( &thddata[i].self );
+				if(ret != 0) std::abort();
+			}
+		}
+
+		printf("Duration (ms)        : %'ld\n", to_miliseconds(end - start));
+		printf("Number of processors : %'d\n", nprocs);
+		printf("Number of threads    : %'d\n", tthreads);
+		printf("Cycle size (# thrds) : %'d\n", ring_size);
+		printf("Total Operations(ops): %'15llu\n", global_counter);
+		printf("Total blocks         : %'15llu\n", global_blocks);
+		printf("Ops per second       : %'18.2lf\n", ((double)global_counter) / to_fseconds(end - start));
+		printf("ns per ops           : %'18.2lf\n", ((double)(end - start)) / global_counter);
+		printf("Ops per threads      : %'15llu\n", global_counter / tthreads);
+		printf("Ops per procs        : %'15llu\n", global_counter / nprocs);
+		printf("Ops/sec/procs        : %'18.2lf\n", (((double)global_counter) / nprocs) / to_fseconds(end - start));
+		printf("ns per ops/procs     : %'18.2lf\n", ((double)(end - start)) / (global_counter / nprocs));
+		fflush(stdout);
+	}
+
+	return 0;
+}
Index: benchmark/readyQ/cycle.cfa
===================================================================
--- benchmark/readyQ/cycle.cfa	(revision 883c4d9d328a20f954086effbb3a3ac2cf689535)
+++ benchmark/readyQ/cycle.cfa	(revision 69d17484515f7e3df1c7e777114ed4c355538be3)
@@ -77,4 +77,5 @@
 
 			for(i; tthreads) {
+				post( thddata[i].self );
 				Partner & partner = join( *threads[i] ).partner;
 				global_counter += partner.count;
Index: benchmark/readyQ/cycle.cpp
===================================================================
--- benchmark/readyQ/cycle.cpp	(revision 883c4d9d328a20f954086effbb3a3ac2cf689535)
+++ benchmark/readyQ/cycle.cpp	(revision 69d17484515f7e3df1c7e777114ed4c355538be3)
@@ -1,5 +1,48 @@
 
 #include "rq_bench.hpp"
+#include <libfibre/fibre.h>
 
+class __attribute__((aligned(128))) bench_sem {  // over-aligned, presumably to keep semaphores on separate cache lines — confirm
+	Fibre * volatile ptr = nullptr;  // nullptr = idle; (Fibre*)1 = token pending; any other value = the parked fibre
+public:
+	inline bool wait() {  // consume the token; returns true iff the caller actually parked
+		static Fibre * const ready  = reinterpret_cast<Fibre * const>(1ull);  // sentinel: a post arrived before any waiter
+		for(;;) {
+			Fibre * expected = this->ptr;
+			if(expected == ready) {  // token already available: take it without blocking
+				if(__atomic_compare_exchange_n(&this->ptr, &expected, nullptr, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+					return false;
+				}
+			}
+			else {
+				/* paranoid */ assert( expected == nullptr );
+				if(__atomic_compare_exchange_n(&this->ptr, &expected, fibre_self(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {  // publish ourself as the waiter, then sleep
+					fibre_park();
+					return true;
+				}
+			}
+
+		}  // CAS failed: a racing post/wait changed ptr — reload and retry
+	}
+
+	inline bool post() {  // hand over the token; returns true iff a parked fibre was woken
+		static Fibre * const ready  = reinterpret_cast<Fibre * const>(1ull);
+		for(;;) {
+			Fibre * expected = this->ptr;
+			if(expected == ready) return false;  // binary semaphore: a pending token absorbs the extra post
+			if(expected == nullptr) {  // no waiter yet: leave a token for the next wait()
+				if(__atomic_compare_exchange_n(&this->ptr, &expected, ready, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+					return false;
+				}
+			}
+			else {  // a fibre is parked: claim it, reset to idle, and wake it
+				if(__atomic_compare_exchange_n(&this->ptr, &expected, nullptr, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+					fibre_unpark( expected );
+					return true;
+				}
+			}
+		}
+	}
+};
 struct Partner {
 	unsigned long long count  = 0;
@@ -55,5 +98,5 @@
 				thddata[i].self.post();
 			}
-			wait(start, is_tty);
+			wait<Fibre>(start, is_tty);
 
 			stop = true;
@@ -62,4 +105,5 @@
 
 			for(int i = 0; i < tthreads; i++) {
+				thddata[i].self.post();
 				fibre_join( threads[i], nullptr );
 				global_counter += thddata[i].count;
Index: benchmark/readyQ/cycle.go
===================================================================
--- benchmark/readyQ/cycle.go	(revision 883c4d9d328a20f954086effbb3a3ac2cf689535)
+++ benchmark/readyQ/cycle.go	(revision 69d17484515f7e3df1c7e777114ed4c355538be3)
@@ -63,4 +63,8 @@
 	global_counter := uint64(0)
 	for i := 0; i < tthreads; i++ {
+		select {
+		case channels[i] <- 0:
+		default:
+		}
 		global_counter += <- result
 	}
