Index: benchmark/readyQ/bench.go
===================================================================
--- benchmark/readyQ/bench.go	(revision 2c7eee0158db97464d1d3b620031d72e9b57e264)
+++ benchmark/readyQ/bench.go	(revision 2c7eee0158db97464d1d3b620031d72e9b57e264)
@@ -0,0 +1,74 @@
+package main
+
+import (
+	"bufio"
+	"flag"
+	"fmt"
+	"os"
+	"runtime"
+	"sync/atomic"
+	"time"
+)
+
+var clock_mode bool // true: the run is bounded by wall-clock time; false: bounded by iteration count
+var threads_left int64 // wait() leaves its loop in iteration mode once this is read as 0
+var stop int32 // not referenced in this file; presumably the stop flag read by the workers (confirm)
+var duration float64 // requested run length in seconds (-d)
+var stop_count uint64 // requested run length in iterations (-i)
+var nprocs int // value of -p, handed to runtime.GOMAXPROCS
+var nthreads int // value of -t
+
+func fflush(f *bufio.Writer) { // sends a carriage return through f and flushes it right away
+	f.WriteByte('\r') // the write error is ignored, exactly as before
+	f.Flush()
+}
+
+func wait(start time.Time, is_tty bool) { // blocks until the run's stop condition holds, re-checking every 100ms
+	f := bufio.NewWriter(os.Stdout)
+	tdur := time.Duration(duration * float64(time.Second)) // scale before converting so fractional seconds are not truncated away
+	for {
+		time.Sleep(100 * time.Millisecond)
+		end := time.Now()
+		delta := end.Sub(start)
+		if is_tty {
+			fmt.Printf(" %.1f", delta.Seconds()) // elapsed-seconds readout, only when is_tty is set
+			fflush(f)
+		}
+		if clock_mode && delta >= tdur { // timed run: stop at the deadline
+			break
+		} else if !clock_mode && atomic.LoadInt64(&threads_left) == 0 { // counted run: stop once threads_left reads 0
+			break
+		}
+	}
+}
+
+// bench_init registers the shared -p/-t/-d/-i flags, parses the command line
+// and selects the run mode: clock-bounded or iteration-bounded.
+func bench_init() {
+	flag.IntVar(&nprocs, "p", 1, "The number of processors")
+	flag.IntVar(&nthreads, "t", 1, "The number of threads")
+	flag.Float64Var(&duration, "d", 0, "Duration of the experiment in seconds")
+	flag.Uint64Var(&stop_count, "i", 0, "Duration of the experiment in iterations")
+	flag.Parse()
+
+	switch {
+	case duration > 0 && stop_count > 0:
+		// Both limits requested: the stop condition would be ambiguous.
+		panic("--duration and --iterations cannot be used together\n")
+	case duration > 0:
+		// Timed run: raise the iteration cap to the largest uint64.
+		clock_mode = true
+		stop_count = 0xFFFFFFFFFFFFFFFF
+		fmt.Printf("Running for %f seconds\n", duration)
+	case stop_count > 0:
+		clock_mode = false
+		fmt.Printf("Running for %d iterations\n", stop_count)
+	default:
+		// No limit given: default to a five second timed run.
+		duration = 5
+		clock_mode = true
+		fmt.Printf("Running for %f seconds\n", duration)
+	}
+
+	runtime.GOMAXPROCS(nprocs)
+}
Index: benchmark/readyQ/cycle.cfa
===================================================================
--- benchmark/readyQ/cycle.cfa	(revision 7a2a3afec8f6e6223a01e70159a47188ac961bfc)
+++ benchmark/readyQ/cycle.cfa	(revision 2c7eee0158db97464d1d3b620031d72e9b57e264)
@@ -1,21 +1,34 @@
 #include "rq_bench.hfa"
 
-thread Partner {
-	Partner * partner;
+struct Partner {
 	unsigned long long count;
+	unsigned long long blocks;
+	bench_sem self;
+	bench_sem * next;
 };
 
 void ?{}( Partner & this ) {
-	((thread&)this){ bench_cluster };
+	this.count = this.blocks = 0;
 }
 
-void main( Partner & this ) {
-	this.count = 0;
+thread BThrd { // benchmark thread type; its counters and semaphores live in a separate Partner record
+	Partner & partner; // record this thread works on, bound by the constructor
+};
+
+void ?{}( BThrd & this, Partner * partner ) { // constructor: sets up the thread part and attaches the Partner record
+	((thread&)this){ bench_cluster }; // construct the thread part with bench_cluster as its argument
+	&this.partner = partner; // bind the reference field to the caller-supplied record
+}
+
+void ^?{}( BThrd & mutex this ) {} // empty destructor with a mutex parameter; NOTE(review): presumably required so join() can be applied to BThrd, confirm
+
+void main( BThrd & thrd ) with(thrd.partner) {
+	count = 0;
 	for() {
-		park();
-		unpark( *this.partner );
-		this.count ++;
+		blocks += wait( self );
+		post( *next );
+		count ++;
 		if( clock_mode && stop) break;
-		if(!clock_mode && this.count >= stop_count) break;
+		if(!clock_mode && count >= stop_count) break;
 	}
 
@@ -33,4 +46,5 @@
 	{
 		unsigned long long global_counter = 0;
+		unsigned long long global_blocks  = 0;
 		unsigned tthreads = nthreads * ring_size;
 		Time start, end;
@@ -38,8 +52,13 @@
 		{
 			threads_left = tthreads;
-			Partner threads[tthreads];
+			BThrd * threads[tthreads];
+			Partner thddata[tthreads];
 			for(i; tthreads) {
 				unsigned pi = (i + nthreads) % tthreads;
-				threads[i].partner = &threads[pi];
+				thddata[i].next = &thddata[pi].self;
+			}
+			for(int i = 0; i < tthreads; i++) {
+				threads[i] = malloc();
+				(*threads[i]){ &thddata[i] };
 			}
 			printf("Starting\n");
@@ -49,5 +68,5 @@
 
 			for(i; nthreads) {
-				unpark( threads[i] );
+				post( thddata[i].self );
 			}
 			wait(start, is_tty);
@@ -58,19 +77,23 @@
 
 			for(i; tthreads) {
-				global_counter += join( threads[i] ).count;
+				Partner & partner = join( *threads[i] ).partner;
+				global_counter += partner.count;
+				global_blocks  += partner.blocks;
+				delete(threads[i]);
 			}
 		}
 
-		printf("Duration (ms)       : %'ld\n", (end - start)`ms);
-		printf("Number of processors: %'d\n", nprocs);
-		printf("Number of threads   : %'d\n", tthreads);
-		printf("Cycle size (# thrds): %'d\n", ring_size);
-		printf("Yields per second   : %'18.2lf\n", ((double)global_counter) / (end - start)`s);
-		printf("ns per yields       : %'18.2lf\n", ((double)(end - start)`ns) / global_counter);
-		printf("Total yields        : %'15llu\n", global_counter);
-		printf("Yields per threads  : %'15llu\n", global_counter / tthreads);
-		printf("Yields per procs    : %'15llu\n", global_counter / nprocs);
-		printf("Yields/sec/procs    : %'18.2lf\n", (((double)global_counter) / nprocs) / (end - start)`s);
-		printf("ns per yields/procs : %'18.2lf\n", ((double)(end - start)`ns) / (global_counter / nprocs));
+		printf("Duration (ms)        : %'ld\n", (end - start)`ms);
+		printf("Number of processors : %'d\n", nprocs);
+		printf("Number of threads    : %'d\n", tthreads);
+		printf("Cycle size (# thrds) : %'d\n", ring_size);
+		printf("Total Operations(ops): %'15llu\n", global_counter);
+		printf("Total blocks         : %'15llu\n", global_blocks);
+		printf("Ops per second       : %'18.2lf\n", ((double)global_counter) / (end - start)`s);
+		printf("ns per ops           : %'18.2lf\n", ((double)(end - start)`ns) / global_counter);
+		printf("Ops per threads      : %'15llu\n", global_counter / tthreads);
+		printf("Ops per procs        : %'15llu\n", global_counter / nprocs);
+		printf("Ops/sec/procs        : %'18.2lf\n", (((double)global_counter) / nprocs) / (end - start)`s);
+		printf("ns per ops/procs     : %'18.2lf\n", ((double)(end - start)`ns) / (global_counter / nprocs));
 		fflush(stdout);
 	}
Index: benchmark/readyQ/cycle.cpp
===================================================================
--- benchmark/readyQ/cycle.cpp	(revision 7a2a3afec8f6e6223a01e70159a47188ac961bfc)
+++ benchmark/readyQ/cycle.cpp	(revision 2c7eee0158db97464d1d3b620031d72e9b57e264)
@@ -0,0 +1,86 @@
+
+#include "rq_bench.hpp"
+
+struct Partner { // per-fibre state for one member of a ring
+	unsigned long long count  = 0; // completed wait/post rounds
+	unsigned long long blocks = 0; // running sum of the values returned by self.wait()
+	bench_sem self; // semaphore this fibre waits on
+	bench_sem * next; // semaphore posted after each wake-up; wired up by main()
+};
+
+void partner_main( Partner * self ) {
+	Partner & me = *self;
+	me.count = 0;
+	// One round: wait on our own semaphore, post the successor's, count it.
+	do {
+		me.blocks += me.self.wait();
+		me.next->post();
+		me.count += 1;
+	} while( clock_mode ? !stop : me.count < stop_count );
+
+	__atomic_fetch_sub(&threads_left, 1, __ATOMIC_SEQ_CST); // report this fibre as finished
+}
+
+int main(int argc, char * argv[]) { // builds nthreads rings of ring_size fibres, runs them, prints throughput figures
+	unsigned ring_size = 2;
+	option_t opt[] = {
+		BENCH_OPT,
+		{ 'r', "ringsize", "Number of threads in a cycle", ring_size }
+	};
+	BENCH_OPT_PARSE("cforall cycle benchmark"); // NOTE(review): banner says "cforall" but this file builds against libfibre; confirm the intended text
+
+	{
+		unsigned long long global_counter = 0;
+		unsigned long long global_blocks  = 0;
+		unsigned tthreads = nthreads * ring_size; // total fibre count
+		uint64_t start, end;
+		FibreInit(1, nprocs);
+		{
+			threads_left = tthreads;
+			Fibre * threads[tthreads];
+			Partner thddata[tthreads];
+			for(int i = 0; i < tthreads; i++) {
+				unsigned pi = (i + nthreads) % tthreads; // successor sits nthreads slots further on, wrapping around
+				thddata[i].next = &thddata[pi].self;
+			}
+			for(int i = 0; i < tthreads; i++) {
+				threads[i] = new Fibre( reinterpret_cast<void (*)(void *)>(partner_main), &thddata[i] );
+			}
+			printf("Starting\n");
+
+			bool is_tty = isatty(STDOUT_FILENO);
+			start = getTimeNsec();
+
+			for(int i = 0; i < nthreads; i++) { // post the first nthreads semaphores to start the rings
+				thddata[i].self.post();
+			}
+			wait(start, is_tty);
+
+			stop = true; // read by partner_main when clock_mode is set
+			end = getTimeNsec();
+			printf("\nDone\n");
+
+			for(int i = 0; i < tthreads; i++) {
+				fibre_join( threads[i], nullptr );
+				global_counter += thddata[i].count;
+				global_blocks  += thddata[i].blocks;
+			}
+		}
+
+		printf("Duration (ms)        : %'ld\n", to_miliseconds(end - start));
+		printf("Number of processors : %'d\n", nprocs);
+		printf("Number of threads    : %'d\n", tthreads);
+		printf("Cycle size (# thrds) : %'d\n", ring_size);
+		printf("Total Operations(ops): %'15llu\n", global_counter);
+		printf("Total blocks         : %'15llu\n", global_blocks);
+		printf("Ops per second       : %'18.2lf\n", ((double)global_counter) / to_fseconds(end - start));
+		printf("ns per ops           : %'18.2lf\n", ((double)(end - start)) / global_counter);
+		printf("Ops per threads      : %'15llu\n", global_counter / tthreads);
+		printf("Ops per procs        : %'15llu\n", global_counter / nprocs);
+		printf("Ops/sec/procs        : %'18.2lf\n", (((double)global_counter) / nprocs) / to_fseconds(end - start));
+		printf("ns per ops/procs     : %'18.2lf\n", ((double)(end - start)) / (global_counter / nprocs)); // divisor is an integer quotient
+		fflush(stdout);
+	}
+
+	return 0;
+}
Index: benchmark/readyQ/cycle.go
===================================================================
--- benchmark/readyQ/cycle.go	(revision 7a2a3afec8f6e6223a01e70159a47188ac961bfc)
+++ benchmark/readyQ/cycle.go	(revision 2c7eee0158db97464d1d3b620031d72e9b57e264)
@@ -2,9 +2,6 @@
 
 import (
-	"bufio"
 	"flag"
 	"fmt"
-	"os"
-	"runtime"
 	"sync/atomic"
 	"time"
@@ -12,34 +9,4 @@
 	"golang.org/x/text/message"
 )
-
-var clock_mode bool
-var threads_left int64
-var stop int32
-var duration float64
-var stop_count uint64
-
-func fflush(f *bufio.Writer) {
-	defer f.Flush()
-	f.Write([]byte("\r"))
-}
-
-func wait(start time.Time, is_tty bool) {
-	f := bufio.NewWriter(os.Stdout)
-	tdur := time.Duration(duration)
-	for true {
-		time.Sleep(100 * time.Millisecond)
-		end := time.Now()
-		delta := end.Sub(start)
-		if is_tty {
-			fmt.Printf(" %.1f",delta.Seconds())
-			fflush(f)
-		}
-		if clock_mode && delta >= (tdur * time.Second) {
-			break
-		} else if !clock_mode && atomic.LoadInt64(&threads_left) == 0 {
-			break
-		}
-	}
-}
 
 func partner(result chan uint64, mine chan int, next chan int) {
@@ -58,38 +25,12 @@
 
 func main() {
-	var nprocs int
-	var nthreads int
 	var ring_size int
 
-	nprocsOpt := flag.Int("p", 1, "The number of processors")
-	nthreadsOpt := flag.Int("t", 1, "The number of threads")
 	ring_sizeOpt := flag.Int("r", 2, "The number of threads per cycles")
-	durationOpt := flag.Float64("d", 0, "Duration of the experiment in seconds")
-	stopOpt := flag.Uint64("i", 0, "Duration of the experiment in iterations")
 
-	flag.Parse()
+	bench_init()
 
-	nprocs = *nprocsOpt
-	nthreads = *nthreadsOpt
 	ring_size = *ring_sizeOpt
-	duration = *durationOpt
-	stop_count = *stopOpt
 
-	if duration > 0 && stop_count > 0 {
-		panic(fmt.Sprintf("--duration and --iterations cannot be used together\n"))
-	} else if duration > 0 {
-		clock_mode = true
-		stop_count = 0xFFFFFFFFFFFFFFFF
-		fmt.Printf("Running for %f seconds\n", duration)
-	} else if stop_count > 0 {
-		clock_mode = false
-		fmt.Printf("Running for %d iterations\n", stop_count)
-	} else {
-		duration = 5
-		clock_mode = true
-		fmt.Printf("Running for %f seconds\n", duration)
-	}
-
-	runtime.GOMAXPROCS(nprocs)
 	tthreads := nthreads * ring_size
 	threads_left = int64(tthreads)
@@ -126,15 +67,15 @@
 
 	p := message.NewPrinter(language.English)
-	p.Printf("Duration (ms)       : %f\n", delta.Seconds());
-	p.Printf("Number of processors: %d\n", nprocs);
-	p.Printf("Number of threads   : %d\n", tthreads);
-	p.Printf("Cycle size (# thrds): %d\n", ring_size);
-	p.Printf("Yields per second   : %18.2f\n", float64(global_counter) / delta.Seconds())
-	p.Printf("ns per yields       : %18.2f\n", float64(delta.Nanoseconds()) / float64(global_counter))
-	p.Printf("Total yields        : %15d\n", global_counter)
-	p.Printf("Yields per threads  : %15d\n", global_counter / uint64(tthreads))
-	p.Printf("Yields per procs    : %15d\n", global_counter / uint64(nprocs))
-	p.Printf("Yields/sec/procs    : %18.2f\n", (float64(global_counter) / float64(nprocs)) / delta.Seconds())
-	p.Printf("ns per yields/procs : %18.2f\n", float64(delta.Nanoseconds()) / (float64(global_counter) / float64(nprocs)))
+	p.Printf("Duration (ms)        : %d\n", delta.Milliseconds()) // the label promises milliseconds; seconds were printed before
+	p.Printf("Number of processors : %d\n", nprocs)
+	p.Printf("Number of threads    : %d\n", tthreads)
+	p.Printf("Cycle size (# thrds) : %d\n", ring_size)
+	p.Printf("Total Operations(ops): %15d\n", global_counter)
+	p.Printf("Ops per second       : %18.2f\n", float64(global_counter) / delta.Seconds()) // label now matches the other "Ops" rows
+	p.Printf("ns per ops           : %18.2f\n", float64(delta.Nanoseconds()) / float64(global_counter))
+	p.Printf("Ops per threads      : %15d\n", global_counter / uint64(tthreads))
+	p.Printf("Ops per procs        : %15d\n", global_counter / uint64(nprocs))
+	p.Printf("Ops/sec/procs        : %18.2f\n", (float64(global_counter) / float64(nprocs)) / delta.Seconds())
+	p.Printf("ns per ops/procs     : %18.2f\n", float64(delta.Nanoseconds()) / (float64(global_counter) / float64(nprocs)))
 
 }
Index: benchmark/readyQ/rq_bench.hfa
===================================================================
--- benchmark/readyQ/rq_bench.hfa	(revision 7a2a3afec8f6e6223a01e70159a47188ac961bfc)
+++ benchmark/readyQ/rq_bench.hfa	(revision 2c7eee0158db97464d1d3b620031d72e9b57e264)
@@ -66,5 +66,5 @@
 
 void ^?{}( BenchCluster & this ) {
-	adelete( this.nprocs, this.procs );
+	adelete( this.procs );
 	^(this.cl){};
 }
@@ -87,2 +87,51 @@
 	}
 }
+
+struct bench_sem { // one-slot semaphore built on a single pointer word
+	struct $thread * volatile ptr; // 0p: idle, 1p: a post is pending, any other value: the thread parked in wait()
+};
+
+static inline {
+	void  ?{}(bench_sem & this) { // constructor: start with no pending post and no waiter
+		this.ptr = 0p;
+	}
+
+	void ^?{}(bench_sem & this) {} // destructor: nothing to release
+
+	bool wait(bench_sem & this) { // returns true if the caller parked, false if a pending post was consumed instead
+		for() { // retry until one of the compare-and-swaps succeeds
+			struct $thread * expected = this.ptr;
+			if(expected == 1p) { // a post is already pending: consume it without parking
+				if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+					return false;
+				}
+			}
+			else {
+				/* paranoid */ verify( expected == 0p ); // any other value would mean a second waiter
+				if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) { // publish ourselves as the waiter
+					park();
+					return true;
+				}
+			}
+
+		}
+	}
+
+	bool post(bench_sem & this) { // returns true only when a parked thread was unparked
+		for() { // retry until a compare-and-swap succeeds or a post is found pending
+			struct $thread * expected = this.ptr;
+			if(expected == 1p) return false; // a post is already pending; this one is dropped
+			if(expected == 0p) { // nobody waiting: leave the post pending
+				if(__atomic_compare_exchange_n(&this.ptr, &expected, 1p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+					return false;
+				}
+			}
+			else { // expected holds the waiting thread
+				if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) { // clear the slot first, then wake it
+					unpark( expected );
+					return true;
+				}
+			}
+		}
+	}
+}
Index: benchmark/readyQ/rq_bench.hpp
===================================================================
--- benchmark/readyQ/rq_bench.hpp	(revision 2c7eee0158db97464d1d3b620031d72e9b57e264)
+++ benchmark/readyQ/rq_bench.hpp	(revision 2c7eee0158db97464d1d3b620031d72e9b57e264)
@@ -0,0 +1,386 @@
+#include <cassert>
+
+#include <time.h>										// timespec
+#include <sys/time.h>									// timeval
+
+enum { TIMEGRAN = 1000000000LL };					// nanosecond granularity, except for timeval
+
+
+#include <libfibre/fibre.h>
+
+
+
+volatile bool stop = false; // stop flag; the cycle benchmark sets it after wait() returns and its fibres poll it in clock mode
+bool clock_mode; // true: bounded by duration, false: bounded by stop_count (chosen in BENCH_OPT_PARSE)
+double duration = -1; // run length in seconds (-d); BENCH_OPT_PARSE falls back to 5 when no limit is given
+unsigned long long stop_count = 0; // run length in iterations (-i)
+unsigned nprocs = 1; // -p
+unsigned nthreads = 1; // -t
+
+volatile unsigned long long threads_left; // wait() returns in iteration mode once this reads 0
+
+#define BENCH_OPT /* option_t initialisers binding -d/-i/-t/-p to the globals above */ \
+	{'d', "duration",  "Duration of the experiments in seconds", duration }, \
+	{'i', "iterations",  "Number of iterations of the experiments", stop_count }, \
+	{'t', "nthreads",  "Number of threads to use", nthreads }, \
+	{'p', "nprocs",    "Number of processors to use", nprocs }
+
+#define BENCH_OPT_PARSE(name) /* expects argc, argv and an option_t array named opt in scope; parses, then selects clock or iteration mode */ \
+	{ \
+		int opt_cnt = sizeof(opt) / sizeof(option_t); \
+		char **left; \
+		parse_args( argc, argv, opt, opt_cnt, "[OPTIONS]...\n" name, &left ); \
+		if(duration > 0 && stop_count > 0) { /* both limits given: report and exit through the usage printer */ \
+			fprintf(stderr, "--duration and --iterations cannot be used together\n"); \
+			print_args_usage(argc, argv, opt, opt_cnt, "[OPTIONS]...\n" name, true); \
+		} else if(duration > 0) { /* timed run: lift the iteration cap */ \
+			clock_mode = true; \
+			stop_count = 0xFFFFFFFFFFFFFFFF; \
+			printf("Running for %lf seconds\n", duration); \
+		} else if(stop_count > 0) { /* counted run */ \
+			clock_mode = false; \
+			printf("Running for %llu iterations\n", stop_count); \
+		} else { /* nothing requested: default to a 5 second timed run */ \
+			duration = 5; clock_mode = true;\
+			printf("Running for %lf seconds\n", duration); \
+		} \
+	}
+
+uint64_t getTimeNsec() { // current CLOCK_REALTIME reading expressed in nanoseconds
+	timespec curr;
+	clock_gettime( CLOCK_REALTIME, &curr ); // NOTE(review): the wall clock can be stepped; CLOCK_MONOTONIC may suit interval timing better, confirm no caller needs absolute time
+	return (int64_t)curr.tv_sec * TIMEGRAN + curr.tv_nsec;
+}
+
+uint64_t to_miliseconds( uint64_t durtn ) { return durtn / (TIMEGRAN / 1000LL); } // nanoseconds -> whole milliseconds
+double to_fseconds(uint64_t durtn ) { return durtn / (double)TIMEGRAN; } // nanoseconds -> fractional seconds
+uint64_t from_fseconds(double sec) { return sec * TIMEGRAN; } // fractional seconds -> nanoseconds, truncated by the conversion
+
+void wait(const uint64_t & start, bool is_tty) {
+	// Re-check after every Fibre::usleep(100000) until the active stop condition holds.
+	bool finished = false;
+	while(!finished) {
+		Fibre::usleep(100000);
+		const uint64_t elapsed = getTimeNsec() - start;
+
+		// When is_tty is set, redraw the elapsed seconds on the same line.
+		if(is_tty) {
+			printf(" %.1f\r", to_fseconds(elapsed));
+			fflush(stdout);
+		}
+
+		// Timed runs end at the deadline, counted runs once threads_left reads 0.
+		finished = clock_mode ? elapsed >= from_fseconds(duration) : threads_left == 0;
+	}
+}
+
+class bench_sem { // one-slot semaphore built on a single pointer word
+	Fibre * volatile ptr = nullptr; // nullptr: idle, 1: a post is pending, any other value: the fibre parked in wait()
+public:
+	inline bool wait() { // returns true if the caller parked, false if a pending post was consumed instead
+		static Fibre * const ready  = reinterpret_cast<Fibre * const>(1ull); // sentinel meaning "post pending"
+		for(;;) { // retry until one of the compare-and-swaps succeeds
+			Fibre * expected = this->ptr;
+			if(expected == ready) { // consume the pending post without parking
+				if(__atomic_compare_exchange_n(&this->ptr, &expected, nullptr, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+					return false;
+				}
+			}
+			else {
+				/* paranoid */ assert( expected == nullptr ); // any other value would mean a second waiter
+				if(__atomic_compare_exchange_n(&this->ptr, &expected, fibre_self(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) { // publish ourselves as the waiter
+					fibre_park();
+					return true;
+				}
+			}
+
+		}
+	}
+
+	inline bool post() { // returns true only when a parked fibre was unparked
+		static Fibre * const ready  = reinterpret_cast<Fibre * const>(1ull); // same sentinel as in wait()
+		for(;;) { // retry until a compare-and-swap succeeds or a post is found pending
+			Fibre * expected = this->ptr;
+			if(expected == ready) return false; // a post is already pending; this one is dropped
+			if(expected == nullptr) { // nobody waiting: leave the post pending
+				if(__atomic_compare_exchange_n(&this->ptr, &expected, ready, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+					return false;
+				}
+			}
+			else { // expected holds the waiting fibre
+				if(__atomic_compare_exchange_n(&this->ptr, &expected, nullptr, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) { // clear the slot first, then wake it
+					fibre_unpark( expected );
+					return true;
+				}
+			}
+		}
+	}
+};
+
+// ==========================================================================================
+#include <cstring>
+
+//-----------------------------------------------------------------------------
+// Typed argument parsing
+bool parse_yesno(const char * arg, bool & value ) {
+	// Only the exact, case-sensitive words "yes" and "no" are recognised.
+	const bool is_yes = strcmp(arg, "yes") == 0;
+	const bool is_no  = !is_yes && strcmp(arg, "no") == 0;
+
+	// Anything else is rejected and leaves value untouched.
+	if(!is_yes && !is_no) {
+		return false;
+	}
+
+	value = is_yes;
+	return true;
+}
+
+bool parse_settrue (const char *, bool & value ) { // flag-style parser: ignores its argument and stores true
+	value = true;
+	return true; // cannot fail
+}
+
+bool parse_setfalse(const char *, bool & value )  { // flag-style parser: ignores its argument and stores false
+	value = false;
+	return true; // cannot fail
+}
+
+bool parse(const char * arg, const char * & value ) { // string option: stores the argument pointer itself, no copy is made
+	value = arg;
+	return true; // cannot fail
+}
+
+bool parse(const char * arg, int & value) { // base-10 int; returns false and leaves value untouched on bad input
+	char * end;
+	long long int r = strtoll(arg, &end, 10);
+	if(end == arg || *end != '\0') return false; // no digits at all, or trailing characters
+	if(r < INT_MIN || r > INT_MAX) return false; // does not fit in int; used to be narrowed silently
+	value = r;
+	return true;
+}
+
+bool parse(const char * arg, unsigned & value) { // base-10 unsigned; returns false and leaves value untouched on bad input
+	char * end;
+	unsigned long long int r = strtoull(arg, &end, 10);
+	if(end == arg || *end != '\0') return false; // no digits at all (e.g. an empty argument), or trailing characters
+	if(r > UINT_MAX) return false; // does not fit in unsigned
+
+	value = r;
+	return true;
+}
+
+bool parse(const char * arg, unsigned long & value) { // base-10 unsigned long; returns false and leaves value untouched on bad input
+	char * end;
+	unsigned long long int r = strtoull(arg, &end, 10);
+	if(end == arg || *end != '\0') return false; // no digits at all (e.g. an empty argument), or trailing characters
+	if(r > ULONG_MAX) return false; // only meaningful where unsigned long is narrower than unsigned long long
+
+	value = r;
+	return true;
+}
+
+bool parse(const char * arg, unsigned long long & value) { // base-10 unsigned long long; false and value untouched on bad input
+        char * end;
+        unsigned long long int r = strtoull(arg, &end, 10);
+        if(end == arg || *end != '\0') return false; // no digits at all (e.g. an empty argument), or trailing characters
+        // Overflow is not detected here: strtoull saturates at ULLONG_MAX, so a range test on r could never fail.
+
+        value = r;
+        return true;
+}
+
+bool parse(const char * arg, double & value) { // floating-point option via strtod; false and value untouched on bad input
+	char * end;
+	double r = strtod(arg, &end);
+	if(end == arg || *end != '\0') return false; // nothing converted (e.g. an empty argument), or trailing characters
+
+	value = r;
+	return true;
+}
+
+//-----------------------------------------------------------------------------
+struct option_t { // one command-line option: its names, help text, destination and parser
+      char short_name; // single-character form, also used as the getopt return value
+      const char * long_name; // long form; parse_args skips the long entry when this is null
+      const char * help; // description printed by usage()
+      void * variable; // type-erased address of the destination variable
+      bool (*parse_fun)(const char *, void * ); // type-erased parser; returns false when the argument is rejected
+
+	template<typename T>
+	inline option_t( char short_name, const char * long_name, const char * help, T & variable ) { // picks the parse() overload matching T
+		this->short_name = short_name;
+		this->long_name  = long_name;
+		this->help       = help;
+		this->variable   = reinterpret_cast<void*>(&variable);
+		this->parse_fun  = reinterpret_cast<bool (*)(const char *, void * )>(static_cast<bool (*)(const char *, T & )>(parse)); // static_cast selects the overload before the type is erased
+	}
+
+	template<typename T>
+	inline option_t( char short_name, const char * long_name, const char * help, T & variable, bool (*parse)(const char *, T & )) { // same, with a caller-supplied parser
+		this->short_name = short_name;
+		this->long_name  = long_name;
+		this->help       = help;
+		this->variable   = reinterpret_cast<void*>(&variable);
+		this->parse_fun  = reinterpret_cast<bool (*)(const char *, void * )>(parse);
+	}
+};
+
+extern option_t last_option;
+
+
+//-----------------------------------------------------------------------------
+#include <cstdint>
+#include <climits>
+#include <errno.h>
+#include <unistd.h>
+extern "C" {
+	#include <getopt.h>
+	#include <sys/ioctl.h>
+
+	extern FILE * stderr;
+	extern FILE * stdout;
+
+	extern int fileno(FILE *stream);
+
+	extern int fprintf ( FILE * stream, const char * format, ... );
+
+	extern          long long int strtoll (const char* str, char** endptr, int base);
+	extern unsigned long long int strtoull(const char* str, char** endptr, int base);
+	extern                 double strtod  (const char* str, char** endptr);
+}
+
+static void usage(char * cmd, option_t options[], size_t opt_count, const char * usage, FILE * out)  __attribute__ ((noreturn));
+
+//-----------------------------------------------------------------------------
+// getopt_long wrapping
+void parse_args( // runs getopt_long over argv and stores each parsed value through its option_t entry
+	int argc,
+	char * argv[],
+	option_t options[],
+	size_t opt_count,
+	const char * usage_msg,
+	char ** * left // when non-null, receives the first non-option argument position
+) {
+	struct option optarr[opt_count + 2]; // every option plus "help" plus the terminating all-zero entry
+	{
+		int idx = 0;
+		for(int i = 0; i < opt_count; i++) {
+			if(options[i].long_name) {
+				optarr[idx].name = options[i].long_name;
+				optarr[idx].flag = nullptr;
+				optarr[idx].val  = options[i].short_name;
+				if(    ((intptr_t)options[i].parse_fun) == ((intptr_t)parse_settrue)
+				    || ((intptr_t)options[i].parse_fun) == ((intptr_t)parse_setfalse) ) {
+					optarr[idx].has_arg = no_argument; // pure flags take no value
+				} else {
+					optarr[idx].has_arg = required_argument;
+				}
+				idx++;
+			}
+		}
+		optarr[idx+0].name = "help";
+		optarr[idx+0].has_arg = no_argument;
+		optarr[idx+0].flag = 0;
+		optarr[idx+0].val = 'h';
+		optarr[idx+1].name = 0;
+		optarr[idx+1].has_arg = no_argument;
+		optarr[idx+1].flag = 0;
+		optarr[idx+1].val = 0;
+	}
+
+	char optstring[opt_count * 2 + 2]; // at most "x:" per option, plus 'h' and the NUL; opt_count * 3 overflowed for fewer than two options
+	for(auto & o : optstring) {
+		o = '\0';
+	}
+	{
+		int idx = 0;
+		for(int i = 0; i < opt_count; i++) {
+			optstring[idx] = options[i].short_name;
+			idx++;
+			if(    ((intptr_t)options[i].parse_fun) != ((intptr_t)parse_settrue)
+			    && ((intptr_t)options[i].parse_fun) != ((intptr_t)parse_setfalse) ) {
+				optstring[idx] = ':'; // option requires a value
+				idx++;
+			}
+		}
+		optstring[idx+0] = 'h';
+		optstring[idx+1] = '\0';
+	}
+
+	FILE * out = stderr;
+	for(;;) {
+		int idx = 0;
+		int opt = getopt_long(argc, argv, optstring, optarr, &idx);
+		switch(opt) {
+			case -1: // no more options
+				if(left != nullptr) *left = argv + optind;
+				return;
+			case 'h':
+				out = stdout; // help was asked for: print to stdout, then fall through to usage()
+			case '?':
+				usage(argv[0], options, opt_count, usage_msg, out); // does not return
+			default:
+				for(int i = 0; i < opt_count; i++) {
+					if(opt == options[i].short_name) {
+						const char * arg = optarg ? optarg : "";
+						bool success = options[i].parse_fun( arg, options[i].variable );
+						if(success) goto NEXT_ARG;
+
+						fprintf(out, "Argument '%s' for option %c could not be parsed\n\n", arg, (char)opt);
+						usage(argv[0], options, opt_count, usage_msg, out);
+					}
+				}
+				std::abort(); // getopt returned a character no option_t entry claims
+		}
+		NEXT_ARG:;
+	}
+}
+
+//-----------------------------------------------------------------------------
+// Print usage
+static void printopt(FILE * out, int width, int max, char sn, const char * ln, const char * help) { // prints one option row, wrapping the help text to the line width
+	int hwidth = max - (11 + width); // columns left for help after "  -x, --name   "
+	if(hwidth <= 0) hwidth = max > 0 ? max : 1000000; // never 0: a zero width would make the loop below spin forever
+
+	fprintf(out, "  -%c, --%-*s   %.*s\n", sn, width, ln, hwidth, help);
+	for(;;) {
+		help += std::min(strlen(help), (unsigned long)hwidth); // skip what the previous line printed
+		if('\0' == *help) break;
+		fprintf(out, "%*s%.*s\n", width + 11, "", hwidth, help); // continuation line, indented under the help column
+	}
+}
+
+__attribute__((noreturn)) void print_args_usage(int , char * argv[], option_t options[], size_t opt_count, const char * usage_msg, bool error) { // public wrapper around usage(); the first parameter is unused
+	usage(argv[0], options, opt_count, usage_msg, error ? stderr : stdout); // errors go to stderr, plain help to stdout
+}
+
+static __attribute__((noreturn)) void usage(char * cmd, option_t options[], size_t opt_count, const char * help, FILE * out) { // prints the option table, then exits
+	int width = 0; // longest long-option name, used to align the help column
+	{
+		for(int i = 0; i < opt_count; i++) {
+			if(options[i].long_name) {
+				int w = strlen(options[i].long_name);
+				if(w > width) width = w;
+			}
+		}
+	}
+
+	int max_width = 1000000; // effectively unlimited when not writing to a terminal
+	int outfd = fileno(out);
+	if(isatty(outfd)) {
+		struct winsize size;
+		int ret = ioctl(outfd, TIOCGWINSZ, &size);
+		// Printing help should not abort: keep the default when the size is unavailable or reported as 0 columns.
+		if(ret >= 0 && size.ws_col > 0) max_width = size.ws_col;
+	}
+
+	fprintf(out, "Usage:\n  %s %s\n", cmd, help);
+
+	for(int i = 0; i < opt_count; i++) {
+		printopt(out, width, max_width, options[i].short_name, options[i].long_name, options[i].help);
+	}
+	fprintf(out, "  -%c, --%-*s   %s\n", 'h', width, "help", "print this help message");
+	exit(out == stdout ? 0 : 1); // requested help exits 0, an error exits 1
+}
+
