Index: benchmark/readyQ/cycle.cfa
===================================================================
--- benchmark/readyQ/cycle.cfa	(revision b35ab2d445f52b84ed4b7a48c4de785b5b3562b4)
+++ benchmark/readyQ/cycle.cfa	(revision b35ab2d445f52b84ed4b7a48c4de785b5b3562b4)
@@ -0,0 +1,76 @@
+#include "rq_bench.hfa"
+
+thread Partner { // one member of a park/unpark ring
+	Partner * partner; // the thread this one unparks on every loop iteration
+	unsigned long long count; // loop counter driven by thread_loop; summed by the program's main()
+	bool first; // when true, the thread performs one extra park() after leaving its loop
+};
+
+void ?{}( Partner & this ) { // default constructor; leaves partner, count and first unset
+	((thread&)this){ bench_cluster }; // construct the thread part with bench_cluster (presumably the cluster it runs on - confirm)
+}
+
+void main( Partner & this ) { // thread body: wait to be unparked, then unpark the partner, once per iteration
+	thread_loop {
+		park();
+		unpark( *this.partner );
+	}
+	// Record that this thread has left its loop; wait() polls threads_left in iteration mode.
+	__atomic_fetch_add(&threads_left, -1, __ATOMIC_SEQ_CST);
+	// NOTE(review): presumably absorbs the last unpark aimed at a 'first' thread; confirm it cannot block forever in timed mode.
+	if(this.first) park();
+}
+
+int main(int argc, char * argv[]) { // cycle benchmark driver: parse options, run the rings, print statistics
+	unsigned ring_size = 2;
+	cfa_option opt[] = {
+		BENCH_OPT,
+		{ 'r', "ringsize", "Number of threads in a cycle", ring_size }
+	};
+	BENCH_OPT_PARSE("cforall cycle benchmark");
+	// Builds nthreads independent rings, each made of ring_size Partner threads.
+	{
+		unsigned long long global_counter = 0;
+		unsigned tthreads = nthreads * ring_size;
+		Time start, end;
+		BenchCluster bc = { nprocs };
+		{ // inner scope: presumably so the threads are gone before bc is destroyed - confirm
+			threads_left = tthreads;
+			Partner threads[tthreads];
+			for(i; tthreads) {
+				unsigned pi = (i + nthreads) % tthreads; // next member of the same ring (stride nthreads, wrapping)
+				threads[i].partner = &threads[pi];
+				threads[i].first = i < nthreads; // exactly one 'first' thread per ring
+			}
+			printf("Starting\n");
+			// Timestamp first, then hand one unpark to the 'first' thread of every ring.
+			bool is_tty = isatty(STDOUT_FILENO);
+			start = getTimeNsec();
+
+			for(i; nthreads) {
+				unpark( threads[i] );
+			}
+			wait(start, end, is_tty);
+			// wait() returned: ask every thread_loop to exit and take the final timestamp.
+			stop = true;
+			end = getTimeNsec();
+			printf("\nDone\n");
+			// Join every thread and accumulate its iteration count.
+			for(i; tthreads) {
+				global_counter += join( threads[i] ).count;
+			}
+		}
+		// Report totals and rates over the measured interval (end - start).
+		printf("Took %'ld ms\n", (end - start)`ms);
+		printf("Yields per second   : %'18.2lf\n", ((double)global_counter) / (end - start)`s);
+		printf("ns per yields       : %'18.2lf\n", ((double)(end - start)`ns) / global_counter);
+		printf("Total yields        : %'15llu\n", global_counter);
+		printf("Yields per threads  : %'15llu\n", global_counter / tthreads);
+		printf("Yields per procs    : %'15llu\n", global_counter / nprocs);
+		printf("Yields/sec/procs    : %'18.2lf\n", (((double)global_counter) / nprocs) / (end - start)`s);
+		printf("ns per yields/procs : %'18.2lf\n", ((double)(end - start)`ns) / (((double)global_counter) / nprocs)); // divide in floating point: the integer quotient truncates and is 0 when global_counter < nprocs
+		fflush(stdout);
+	}
+
+	return 0;
+}
Index: benchmark/readyQ/rq_bench.hfa
===================================================================
--- benchmark/readyQ/rq_bench.hfa	(revision b35ab2d445f52b84ed4b7a48c4de785b5b3562b4)
+++ benchmark/readyQ/rq_bench.hfa	(revision b35ab2d445f52b84ed4b7a48c4de785b5b3562b4)
@@ -0,0 +1,85 @@
+#include <clock.hfa>
+#include <kernel.hfa>
+#include <parseargs.hfa>
+#include <stdio.h>
+#include <stdlib.hfa>
+#include <thread.hfa>
+#include <time.hfa>
+#include <unistd.h>
+
+volatile bool stop = false; // polled by thread_loop; the loop exits once this is true
+bool clock_mode; // set by BENCH_OPT_PARSE: true = run for 'duration' seconds, false = run 'stop_count' iterations
+double duration = -1; // -d option, in seconds; a value <= 0 means the option was not given
+unsigned long long stop_count = 0; // -i option: upper bound on this.count in thread_loop; 0 means not given
+unsigned nprocs = 1; // -p option
+unsigned nthreads = 1; // -t option
+
+volatile unsigned long long threads_left; // in iteration mode, wait() returns once this reads 0
+
+#define thread_loop for(this.count = 0; this.count < stop_count && !stop; this.count++) // loop header for a thread body; needs 'this' with a count field in scope, ends at stop_count iterations or when stop is set
+
+#define BENCH_OPT /* cfa_option entries for the flags -d, -i, -t and -p; each lists the global variable passed for that flag */ \
+	{'d', "duration",  "Duration of the experiments in seconds", duration }, \
+	{'i', "iterations",  "Number of iterations of the experiments", stop_count }, \
+	{'t', "nthreads",  "Number of threads to use", nthreads }, \
+	{'p', "nprocs",    "Number of processors to use", nprocs }
+
+#define BENCH_OPT_PARSE(name) /* parse the command line and pick the run mode; expects argc, argv and a cfa_option array 'opt' in scope */ \
+	{ \
+		int opt_cnt = sizeof(opt) / sizeof(cfa_option); \
+		char **left; \
+		parse_args( argc, argv, opt, opt_cnt, "[OPTIONS]...\n" name, left ); \
+		if(duration > 0 && stop_count > 0) { /* both limits given: report the conflict and print the usage */ \
+			fprintf(stderr, "--duration and --iterations cannot be used together\n"); \
+			print_args_usage(argc, argv, opt, opt_cnt, "[OPTIONS]...\n" name, true); \
+		} else if(duration > 0) { /* timed run: lift the iteration cap so only 'stop' ends thread_loop */ \
+			clock_mode = true; \
+			stop_count = 0xFFFFFFFFFFFFFFFF; \
+		} else if(stop_count > 0) { /* fixed number of iterations per thread */ \
+			clock_mode = false; \
+		} else { /* neither given: 5 second timed run; the cap must be lifted here too or thread_loop runs 0 iterations */ \
+			duration = 5; clock_mode = true; stop_count = 0xFFFFFFFFFFFFFFFF; \
+		} \
+	}
+
+struct cluster & bench_cluster; // global reference, rebound to the cluster owned by a BenchCluster when one is constructed
+
+struct BenchCluster { // a cluster together with the processors created for it
+	cluster cl; // the cluster itself; bench_cluster is made to refer to it by the constructor
+	processor * procs; // array of nprocs processors, allocated and constructed by the constructor
+	unsigned nprocs; // number of elements in procs
+};
+
+void ?{}( BenchCluster & this, unsigned nprocs ) { // build the cluster and create nprocs processors on it
+	(this.cl){ "Benchmark Cluster" };
+	&bench_cluster = &this.cl; // rebind the global reference to this cluster
+	this.nprocs = nprocs;
+	this.procs  = alloc( this.nprocs ); // array storage; each element is constructed explicitly in the loop below
+	for(i; this.nprocs){
+		processor * p = &this.procs[i];
+		(*p){ "Benchmark Processor", this.cl }; // construct the processor in place, passing this cluster
+	}
+}
+
+void ^?{}( BenchCluster & this ) { // tear down in reverse order of construction: processors first, then the cluster
+	adelete( this.nprocs, this.procs ); // hands the element count and the array from the constructor to adelete
+	^(this.cl){}; // explicit destructor call on the cluster
+}
+
+// Poll every 100ms, storing the current time in 'end', until the run is over:
+// timed mode ends after 'duration' seconds, otherwise when threads_left is 0.
+void wait(Time & start, Time & end, bool is_tty) {
+	bool finished = false;
+	while( !finished ) {
+		sleep(100`ms);
+		end = getTimeNsec();
+		Duration elapsed = end - start;
+		if(is_tty) {
+			printf(" %.1f\r", elapsed`ds);
+			fflush(stdout);
+		}
+		// Exactly one criterion applies; threads_left is only read in iteration mode.
+		if( clock_mode ) finished = elapsed >= duration`s;
+		else finished = threads_left == 0;
+	}
+}
