Index: benchmark/Makefile.am
===================================================================
--- benchmark/Makefile.am	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ benchmark/Makefile.am	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -113,6 +113,4 @@
 creation_cfa_generator_DURATION = 1000000000
 creation_upp_coroutine_DURATION = ${creation_cfa_coroutine_eager_DURATION}
-creation_cfa_thread_DURATION = 10000000
-creation_upp_thread_DURATION = ${creation_cfa_thread_DURATION}
 creation_DURATION = 10000000
 
@@ -148,5 +146,5 @@
 
 cleancsv:
-	rm -f compile.csv basic.csv ctxswitch.csv mutex.csv scheduling.csv
+	rm -f compile.csv basic.csv ctxswitch.csv mutex.csv schedint.csv
 
 jenkins$(EXEEXT): cleancsv
@@ -159,6 +157,6 @@
 	+make mutex.csv
 	-+make mutex.diff.csv
-	+make scheduling.csv
-	-+make scheduling.diff.csv
+	+make schedint.csv
+	-+make schedint.diff.csv
 @DOifskipcompile@
 	cat compile.csv
@@ -169,6 +167,6 @@
 	cat mutex.csv
 	-cat mutex.diff.csv
-	cat scheduling.csv
-	-cat scheduling.diff.csv
+	cat schedint.csv
+	-cat schedint.diff.csv
 
 compile.csv:
@@ -200,5 +198,5 @@
 	$(srcdir)/fixcsv.sh $@
 
-scheduling.csv:
+schedint.csv:
 	echo "building $@"
 	echo "schedint-1,schedint-2,schedext-1,schedext-2" > $@
@@ -291,15 +289,15 @@
 ctxswitch-python_coroutine$(EXEEXT):
 	$(BENCH_V_PY)echo "#!/bin/sh" > a.out
-	echo "python3.7 $(srcdir)/ctxswitch/python_cor.py" >> a.out
+	echo "python3 $(srcdir)/ctxswitch/python_cor.py \"$$""@\"" >> a.out
 	chmod a+x a.out
 
 ctxswitch-nodejs_coroutine$(EXEEXT):
 	$(BENCH_V_NODEJS)echo "#!/bin/sh" > a.out
-	echo "nodejs $(srcdir)/ctxswitch/node_cor.js" >> a.out
+	echo "nodejs $(srcdir)/ctxswitch/node_cor.js \"$$""@\"" >> a.out
 	chmod a+x a.out
 
 ctxswitch-nodejs_await$(EXEEXT):
 	$(BENCH_V_NODEJS)echo "#!/bin/sh" > a.out
-	echo "nodejs $(srcdir)/ctxswitch/node_await.js" >> a.out
+	echo "nodejs $(srcdir)/ctxswitch/node_await.js \"$$""@\"" >> a.out
 	chmod a+x a.out
 
@@ -313,5 +311,5 @@
 	$(BENCH_V_JAVAC)javac -d $(builddir) $(srcdir)/ctxswitch/JavaThread.java
 	echo "#!/bin/sh" > a.out
-	echo "java JavaThread" >> a.out
+	echo "java JavaThread \"$$""@\"" >> a.out
 	chmod a+x a.out
 
@@ -355,5 +353,5 @@
 	$(BENCH_V_JAVAC)javac -d $(builddir) $(srcdir)/mutex/JavaThread.java
 	echo "#!/bin/sh" > a.out
-	echo "java JavaThread" >> a.out
+	echo "java JavaThread \"$$""@\"" >> a.out
 	chmod a+x a.out
 
@@ -387,5 +385,5 @@
 	$(BENCH_V_JAVAC)javac -d $(builddir) $(srcdir)/schedint/JavaThread.java
 	echo "#!/bin/sh" > a.out
-	echo "java JavaThread" >> a.out
+	echo "java JavaThread \"$$""@\"" >> a.out
 	chmod a+x a.out
 
@@ -454,10 +452,10 @@
 creation-python_coroutine$(EXEEXT):
 	$(BENCH_V_PY)echo "#!/bin/sh" > a.out
-	echo "python3.7 $(srcdir)/creation/python_cor.py" >> a.out
+	echo "python3 $(srcdir)/creation/python_cor.py \"$$""@\"" >> a.out
 	chmod a+x a.out
 
 creation-nodejs_coroutine$(EXEEXT):
 	$(BENCH_V_NODEJS)echo "#!/bin/sh" > a.out
-	echo "nodejs $(srcdir)/creation/node_cor.js" >> a.out
+	echo "nodejs $(srcdir)/creation/node_cor.js \"$$""@\"" >> a.out
 	chmod a+x a.out
 
@@ -471,5 +469,5 @@
 	$(BENCH_V_JAVAC)javac -d $(builddir) $(srcdir)/creation/JavaThread.java
 	echo "#!/bin/sh" > a.out
-	echo "java JavaThread" >> a.out
+	echo "java JavaThread \"$$""@\"" >> a.out
 	chmod a+x a.out
 
@@ -492,29 +490,29 @@
 
 compile-array$(EXEEXT):
-	$(CFACOMPILE) -fsyntax-only -w $(testdir)/array.cfa
+	$(CFACOMPILE) -DNO_COMPILED_PRAGMA -fsyntax-only -w $(testdir)/array.cfa
 
 compile-attributes$(EXEEXT):
-	$(CFACOMPILE) -fsyntax-only -w $(testdir)/attributes.cfa
+	$(CFACOMPILE) -DNO_COMPILED_PRAGMA -fsyntax-only -w $(testdir)/attributes.cfa
 
 compile-empty$(EXEEXT):
-	$(CFACOMPILE) -fsyntax-only -w $(srcdir)/compile/empty.cfa
+	$(CFACOMPILE) -DNO_COMPILED_PRAGMA -fsyntax-only -w $(srcdir)/compile/empty.cfa
 
 compile-expression$(EXEEXT):
-	$(CFACOMPILE) -fsyntax-only -w $(testdir)/expression.cfa
+	$(CFACOMPILE) -DNO_COMPILED_PRAGMA -fsyntax-only -w $(testdir)/expression.cfa
 
 compile-io$(EXEEXT):
-	$(CFACOMPILE) -fsyntax-only -w $(testdir)/io1.cfa
+	$(CFACOMPILE) -DNO_COMPILED_PRAGMA -fsyntax-only -w $(testdir)/io1.cfa
 
 compile-monitor$(EXEEXT):
-	$(CFACOMPILE) -fsyntax-only -w $(testdir)/concurrent/monitor.cfa
+	$(CFACOMPILE) -DNO_COMPILED_PRAGMA -fsyntax-only -w $(testdir)/concurrent/monitor.cfa
 
 compile-operators$(EXEEXT):
-	$(CFACOMPILE) -fsyntax-only -w $(testdir)/operators.cfa
+	$(CFACOMPILE) -DNO_COMPILED_PRAGMA -fsyntax-only -w $(testdir)/operators.cfa
 
 compile-thread$(EXEEXT):
-	$(CFACOMPILE) -fsyntax-only -w $(testdir)/concurrent/thread.cfa
+	$(CFACOMPILE) -DNO_COMPILED_PRAGMA -fsyntax-only -w $(testdir)/concurrent/thread.cfa
 
 compile-typeof$(EXEEXT):
-	$(CFACOMPILE) -fsyntax-only -w $(testdir)/typeof.cfa
+	$(CFACOMPILE) -DNO_COMPILED_PRAGMA -fsyntax-only -w $(testdir)/typeof.cfa
 
 ## =========================================================================================================
Index: benchmark/creation/JavaThread.java
===================================================================
--- benchmark/creation/JavaThread.java	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ benchmark/creation/JavaThread.java	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -47,6 +47,6 @@
 	}
 	public static void main(String[] args) throws InterruptedException {
-		if ( args.length > 2 ) System.exit( 1 );
-		if ( args.length == 2 ) { times = Long.parseLong(args[1]); }
+		if ( args.length > 1 ) System.exit( 1 );
+		if ( args.length == 1 ) { times = Long.parseLong(args[0]); }
 
 		for (int i = Integer.parseInt("5"); --i >= 0 ; ) {
Index: benchmark/ctxswitch/JavaThread.java
===================================================================
--- benchmark/ctxswitch/JavaThread.java	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ benchmark/ctxswitch/JavaThread.java	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -40,6 +40,6 @@
 	}
 	public static void main(String[] args) throws InterruptedException {
-		if ( args.length > 2 ) System.exit( 1 );
-		if ( args.length == 2 ) { times = Long.parseLong(args[1]); }
+		if ( args.length > 1 ) System.exit( 1 );
+		if ( args.length == 1 ) { times = Long.parseLong(args[0]); }
 
 		for (int i = Integer.parseInt("5"); --i >= 0 ; ) {
Index: benchmark/io/http/main.cfa
===================================================================
--- benchmark/io/http/main.cfa	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ benchmark/io/http/main.cfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -125,5 +125,5 @@
 						workers[i].flags   = 0;
 					}
-					unpark( workers[i] __cfaabi_dbg_ctx2 );
+					unpark( workers[i] );
 				}
 				printf("%d workers started on %d processors\n", options.clopts.nworkers, options.clopts.nprocs);
Index: benchmark/io/http/worker.cfa
===================================================================
--- benchmark/io/http/worker.cfa	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ benchmark/io/http/worker.cfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -22,5 +22,5 @@
 
 void main( Worker & this ) {
-	park( __cfaabi_dbg_ctx );
+	park();
 	/* paranoid */ assert( this.pipe[0] != -1 );
 	/* paranoid */ assert( this.pipe[1] != -1 );
Index: benchmark/io/readv.cfa
===================================================================
--- benchmark/io/readv.cfa	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ benchmark/io/readv.cfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -54,5 +54,5 @@
 
 void main( Reader & ) {
-	park( __cfaabi_dbg_ctx );
+	park();
 	/* paranoid */ assert( true == __atomic_load_n(&run, __ATOMIC_RELAXED) );
 
@@ -151,5 +151,5 @@
 
 				for(i; nthreads) {
-					unpark( threads[i] __cfaabi_dbg_ctx2 );
+					unpark( threads[i] );
 				}
 				wait(duration, start, end, is_tty);
Index: benchmark/mutex/JavaThread.java
===================================================================
--- benchmark/mutex/JavaThread.java	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ benchmark/mutex/JavaThread.java	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -47,6 +47,6 @@
 	}
 	public static void main(String[] args) throws InterruptedException {
-		if ( args.length > 2 ) System.exit( 1 );
-		if ( args.length == 2 ) { times = Long.parseLong(args[1]); }
+		if ( args.length > 1 ) System.exit( 1 );
+		if ( args.length == 1 ) { times = Long.parseLong(args[0]); }
 
 		for (int n = Integer.parseInt("5"); --n >= 0 ; ) {
Index: benchmark/readyQ/yield.cfa
===================================================================
--- benchmark/readyQ/yield.cfa	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ benchmark/readyQ/yield.cfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -32,5 +32,5 @@
 
 void main( Yielder & this ) {
-	park( __cfaabi_dbg_ctx );
+	park();
 	/* paranoid */ assert( true == __atomic_load_n(&run, __ATOMIC_RELAXED) );
 
@@ -70,5 +70,5 @@
 
 				for(i; nthreads) {
-					unpark( threads[i] __cfaabi_dbg_ctx2 );
+					unpark( threads[i] );
 				}
 				wait(duration, start, end, is_tty);
Index: benchmark/schedint/JavaThread.java
===================================================================
--- benchmark/schedint/JavaThread.java	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ benchmark/schedint/JavaThread.java	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -75,6 +75,6 @@
 	}
 	public static void main(String[] args) throws InterruptedException {
-		if ( args.length > 2 ) System.exit( 1 );
-		if ( args.length == 2 ) { times = Long.parseLong(args[1]); }
+		if ( args.length > 1 ) System.exit( 1 );
+		if ( args.length == 1 ) { times = Long.parseLong(args[0]); }
 
 		for (int n = Integer.parseInt("5"); --n >= 0 ; ) {
Index: doc/LaTeXmacros/common.tex
===================================================================
--- doc/LaTeXmacros/common.tex	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ doc/LaTeXmacros/common.tex	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -11,6 +11,6 @@
 %% Created On       : Sat Apr  9 10:06:17 2016
 %% Last Modified By : Peter A. Buhr
-%% Last Modified On : Wed Sep 23 21:21:55 2020
-%% Update Count     : 454
+%% Last Modified On : Mon Oct  5 09:34:46 2020
+%% Update Count     : 464
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
@@ -251,7 +251,6 @@
 \makeatother
 
-\newcommand{\CFADefaults}{%
+\newcommand{\CFAStyle}{%
 \lstset{
-language=CFA,
 columns=fullflexible,
 basicstyle=\linespread{0.9}\sf,			% reduce line spacing and use sanserif font
@@ -272,8 +271,8 @@
 	{<-}{$\leftarrow$}2 {=>}{$\Rightarrow$}2 {->}{\makebox[1ex][c]{\raisebox{0.4ex}{\rule{0.8ex}{0.075ex}}}\kern-0.2ex\textgreater}2,
 }% lstset
-}% CFADefaults
-
-\ifdefined\CFALatin%
-\lstnewenvironment{cfa}[1][]{\CFADefaults
+}% CFAStyle
+
+\ifdefined\CFALatin% extra Latin-1 escape characters
+\lstnewenvironment{cfa}[1][]{
 \lstset{
 language=CFA,
@@ -289,15 +288,10 @@
 % inline code ©...© (copyright symbol) emacs: C-q M-)
 \lstMakeShortInline©					% single-character for \lstinline
-\else% extra Latin-1 escape characters
+\else% regular ASCII characters
+\lstnewenvironment{cfa}[1][]{
 \lstset{
 language=CFA,
 escapechar=\$,							% LaTeX escape in CFA code
-moredelim=**[is][\color{red}]{@}{@},	% red highlighting `...` (backtick symbol)
-}% lstset
-\lstnewenvironment{cfa}[1][]{\CFADefaults
-\lstset{
-language=CFA,
-escapechar=\$,							% LaTeX escape in CFA code
-moredelim=**[is][\color{red}]{@}{@},	% red highlighting `...` (backtick symbol)
+moredelim=**[is][\color{red}]{@}{@},	% red highlighting @...@
 }% lstset
 \lstset{#1}
Index: doc/bibliography/pl.bib
===================================================================
--- doc/bibliography/pl.bib	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ doc/bibliography/pl.bib	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -1005,5 +1005,5 @@
     key		= {Cforall Benchmarks},
     author	= {{\textsf{C}{$\mathbf{\forall}$} Benchmarks}},
-    howpublished= {\href{https://plg.uwaterloo.ca/~cforall/doc/CforallConcurrentBenchmarks.tar}{https://\-plg.uwaterloo.ca/\-$\sim$cforall/\-doc/\-CforallConcurrentBenchmarks.tar}},
+    howpublished= {\href{https://github.com/cforall/ConcurrentBenchmarks_SPE20}{https://\-github.com/\-cforall/\-ConcurrentBenchmarks\_SPE20}},
 }
 
@@ -1973,5 +1973,5 @@
     title	= {Cooperating Sequential Processes},
     institution	= {Technological University},
-    address	= {Eindhoven, Netherlands},
+    address	= {Eindhoven, Neth.},
     year	= 1965,
     note	= {Reprinted in \cite{Genuys68} pp. 43--112.}
Index: doc/papers/concurrency/Paper.tex
===================================================================
--- doc/papers/concurrency/Paper.tex	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ doc/papers/concurrency/Paper.tex	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -224,17 +224,17 @@
 {}
 \lstnewenvironment{C++}[1][]                            % use C++ style
-{\lstset{language=C++,moredelim=**[is][\protect\color{red}]{`}{`},#1}\lstset{#1}}
+{\lstset{language=C++,moredelim=**[is][\protect\color{red}]{`}{`}}\lstset{#1}}
 {}
 \lstnewenvironment{uC++}[1][]
-{\lstset{language=uC++,moredelim=**[is][\protect\color{red}]{`}{`},#1}\lstset{#1}}
+{\lstset{language=uC++,moredelim=**[is][\protect\color{red}]{`}{`}}\lstset{#1}}
 {}
 \lstnewenvironment{Go}[1][]
-{\lstset{language=Golang,moredelim=**[is][\protect\color{red}]{`}{`},#1}\lstset{#1}}
+{\lstset{language=Golang,moredelim=**[is][\protect\color{red}]{`}{`}}\lstset{#1}}
 {}
 \lstnewenvironment{python}[1][]
-{\lstset{language=python,moredelim=**[is][\protect\color{red}]{`}{`},#1}\lstset{#1}}
+{\lstset{language=python,moredelim=**[is][\protect\color{red}]{`}{`}}\lstset{#1}}
 {}
 \lstnewenvironment{java}[1][]
-{\lstset{language=java,moredelim=**[is][\protect\color{red}]{`}{`},#1}\lstset{#1}}
+{\lstset{language=java,moredelim=**[is][\protect\color{red}]{`}{`}}\lstset{#1}}
 {}
 
@@ -284,5 +284,5 @@
 
 \begin{document}
-\linenumbers				% comment out to turn off line numbering
+%\linenumbers				% comment out to turn off line numbering
 
 \maketitle
@@ -2896,5 +2896,5 @@
 \label{s:RuntimeStructureCluster}
 
-A \newterm{cluster} is a collection of user and kernel threads, where the kernel threads run the user threads from the cluster's ready queue, and the operating system runs the kernel threads on the processors from its ready queue.
+A \newterm{cluster} is a collection of user and kernel threads, where the kernel threads run the user threads from the cluster's ready queue, and the operating system runs the kernel threads on the processors from its ready queue~\cite{Buhr90a}.
 The term \newterm{virtual processor} is introduced as a synonym for kernel thread to disambiguate between user and kernel thread.
 From the language perspective, a virtual processor is an actual processor (core).
@@ -2992,10 +2992,11 @@
 \end{cfa}
 where CPU time in nanoseconds is from the appropriate language clock.
-Each benchmark is performed @N@ times, where @N@ is selected so the benchmark runs in the range of 2--20 seconds for the specific programming language.
+Each benchmark is performed @N@ times, where @N@ is selected so the benchmark runs in the range of 2--20 seconds for the specific programming language;
+each @N@ appears after the experiment name in the following tables.
 The total time is divided by @N@ to obtain the average time for a benchmark.
 Each benchmark experiment is run 13 times and the average appears in the table.
+For languages with a runtime JIT (Java, Node.js, Python), a single half-hour-long experiment is run to check stability;
+all long-experiment results are statistically equivalent, \ie median/average/standard-deviation correlate with the short-experiment results, indicating the short experiments reached a steady state.
 All omitted tests for other languages are functionally identical to the \CFA tests and available online~\cite{CforallConcurrentBenchmarks}.
-% tar --exclude-ignore=exclude -cvhf benchmark.tar benchmark
-% cp -p benchmark.tar /u/cforall/public_html/doc/concurrent_benchmark.tar
 
 \paragraph{Creation}
@@ -3006,7 +3007,6 @@
 
 \begin{multicols}{2}
-\lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}}
-\begin{cfa}
-@coroutine@ MyCoroutine {};
+\begin{cfa}[xleftmargin=0pt]
+`coroutine` MyCoroutine {};
 void ?{}( MyCoroutine & this ) {
 #ifdef EAGER
@@ -3016,5 +3016,5 @@
 void main( MyCoroutine & ) {}
 int main() {
-	BENCH( for ( N ) { @MyCoroutine c;@ } )
+	BENCH( for ( N ) { `MyCoroutine c;` } )
 	sout | result;
 }
@@ -3030,19 +3030,19 @@
 
 \begin{tabular}[t]{@{}r*{3}{D{.}{.}{5.2}}@{}}
-\multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} & \multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
-\CFA generator			& 0.6		& 0.6		& 0.0		\\
-\CFA coroutine lazy		& 13.4		& 13.1		& 0.5		\\
-\CFA coroutine eager	& 144.7		& 143.9		& 1.5		\\
-\CFA thread				& 466.4		& 468.0		& 11.3		\\
-\uC coroutine			& 155.6		& 155.7		& 1.7		\\
-\uC thread				& 523.4		& 523.9		& 7.7		\\
-Python generator		& 123.2		& 124.3		& 4.1		\\
-Node.js generator		& 33.4		& 33.5		& 0.3		\\
-Goroutine thread		& 751.0		& 750.5		& 3.1		\\
-Rust tokio thread		& 1860.0	& 1881.1	& 37.6		\\
-Rust thread				& 53801.0	& 53896.8	& 274.9		\\
-Java thread (   10 000)		& 119256.0	& 119679.2	& 2244.0	\\
-Java thread (1 000 000)		& 123100.0	& 123052.5	& 751.6 	\\
-Pthreads thread			& 31465.5	& 31419.5	& 140.4
+\multicolumn{1}{@{}r}{N\hspace*{10pt}} & \multicolumn{1}{c}{Median} & \multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
+\CFA generator (1B)			& 0.6		& 0.6		& 0.0		\\
+\CFA coroutine lazy	(100M)	& 13.4		& 13.1		& 0.5		\\
+\CFA coroutine eager (10M)	& 144.7		& 143.9		& 1.5		\\
+\CFA thread (10M)			& 466.4		& 468.0		& 11.3		\\
+\uC coroutine (10M)			& 155.6		& 155.7		& 1.7		\\
+\uC thread (10M)			& 523.4		& 523.9		& 7.7		\\
+Python generator (10M)		& 123.2		& 124.3		& 4.1		\\
+Node.js generator (10M)		& 33.4		& 33.5		& 0.3		\\
+Goroutine thread (10M)		& 751.0		& 750.5		& 3.1		\\
+Rust tokio thread (10M)		& 1860.0	& 1881.1	& 37.6		\\
+Rust thread	(250K)			& 53801.0	& 53896.8	& 274.9		\\
+Java thread (250K)			& 119256.0	& 119679.2	& 2244.0	\\
+% Java thread (1 000 000)		& 123100.0	& 123052.5	& 751.6 	\\
+Pthreads thread	(250K)		& 31465.5	& 31419.5	& 140.4
 \end{tabular}
 \end{multicols}
@@ -3053,19 +3053,20 @@
 Internal scheduling is measured using a cycle of two threads signalling and waiting.
 Figure~\ref{f:schedint} shows the code for \CFA, with results in Table~\ref{t:schedint}.
-Note, the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects.
-Java scheduling is significantly greater because the benchmark explicitly creates multiple threads in order to prevent the JIT from making the program sequential, \ie removing all locking.
+Note, the \CFA incremental cost for bulk acquire is a fixed cost for small numbers of mutex objects.
+User-level threading has one kernel thread, eliminating contention between the threads (direct handoff of the kernel thread).
+Kernel-level threading has two kernel threads, allowing some contention.
 
 \begin{multicols}{2}
-\lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}}
-\begin{cfa}
+\setlength{\tabcolsep}{3pt}
+\begin{cfa}[xleftmargin=0pt]
 volatile int go = 0;
-@condition c;@
-@monitor@ M {} m1/*, m2, m3, m4*/;
-void call( M & @mutex p1/*, p2, p3, p4*/@ ) {
-	@signal( c );@
-}
-void wait( M & @mutex p1/*, p2, p3, p4*/@ ) {
+`condition c;`
+`monitor` M {} m1/*, m2, m3, m4*/;
+void call( M & `mutex p1/*, p2, p3, p4*/` ) {
+	`signal( c );`
+}
+void wait( M & `mutex p1/*, p2, p3, p4*/` ) {
 	go = 1;	// continue other thread
-	for ( N ) { @wait( c );@ } );
+	for ( N ) { `wait( c );` } );
 }
 thread T {};
@@ -3092,13 +3093,13 @@
 
 \begin{tabular}{@{}r*{3}{D{.}{.}{5.2}}@{}}
-\multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} & \multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
-\CFA @signal@, 1 monitor	& 364.4		& 364.2		& 4.4		\\
-\CFA @signal@, 2 monitor	& 484.4		& 483.9		& 8.8		\\
-\CFA @signal@, 4 monitor	& 709.1		& 707.7		& 15.0		\\
-\uC @signal@ monitor		& 328.3		& 327.4		& 2.4		\\
-Rust cond. variable			& 7514.0	& 7437.4	& 397.2		\\
-Java @notify@ monitor (  1 000 000)		& 8717.0	& 8774.1	& 471.8		\\
-Java @notify@ monitor (100 000 000)		& 8634.0	& 8683.5	& 330.5		\\
-Pthreads cond. variable		& 5553.7	& 5576.1	& 345.6
+\multicolumn{1}{@{}r}{N\hspace*{10pt}} & \multicolumn{1}{c}{Median} & \multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
+\CFA @signal@, 1 monitor (10M)	& 364.4		& 364.2		& 4.4		\\
+\CFA @signal@, 2 monitor (10M)	& 484.4		& 483.9		& 8.8		\\
+\CFA @signal@, 4 monitor (10M)	& 709.1		& 707.7		& 15.0		\\
+\uC @signal@ monitor (10M)		& 328.3		& 327.4		& 2.4		\\
+Rust cond. variable	(1M)		& 7514.0	& 7437.4	& 397.2		\\
+Java @notify@ monitor (1M)		& 8717.0	& 8774.1	& 471.8		\\
+% Java @notify@ monitor (100 000 000)		& 8634.0	& 8683.5	& 330.5		\\
+Pthreads cond. variable (1M)	& 5553.7	& 5576.1	& 345.6
 \end{tabular}
 \end{multicols}
@@ -3109,14 +3110,14 @@
 External scheduling is measured using a cycle of two threads calling and accepting the call using the @waitfor@ statement.
 Figure~\ref{f:schedext} shows the code for \CFA with results in Table~\ref{t:schedext}.
-Note, the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects.
+Note, the \CFA incremental cost for bulk acquire is a fixed cost for small numbers of mutex objects.
 
 \begin{multicols}{2}
-\lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}}
+\setlength{\tabcolsep}{5pt}
 \vspace*{-16pt}
-\begin{cfa}
-@monitor@ M {} m1/*, m2, m3, m4*/;
-void call( M & @mutex p1/*, p2, p3, p4*/@ ) {}
-void wait( M & @mutex p1/*, p2, p3, p4*/@ ) {
-	for ( N ) { @waitfor( call : p1/*, p2, p3, p4*/ );@ }
+\begin{cfa}[xleftmargin=0pt]
+`monitor` M {} m1/*, m2, m3, m4*/;
+void call( M & `mutex p1/*, p2, p3, p4*/` ) {}
+void wait( M & `mutex p1/*, p2, p3, p4*/` ) {
+	for ( N ) { `waitfor( call : p1/*, p2, p3, p4*/ );` }
 }
 thread T {};
@@ -3135,14 +3136,14 @@
 \columnbreak
 
-\vspace*{-16pt}
+\vspace*{-18pt}
 \captionof{table}{External-scheduling comparison (nanoseconds)}
 \label{t:schedext}
 \begin{tabular}{@{}r*{3}{D{.}{.}{3.2}}@{}}
-\multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
-\CFA @waitfor@, 1 monitor	& 367.1	& 365.3	& 5.0	\\
-\CFA @waitfor@, 2 monitor	& 463.0	& 464.6	& 7.1	\\
-\CFA @waitfor@, 4 monitor	& 689.6	& 696.2	& 21.5	\\
-\uC \lstinline[language=uC++]|_Accept| monitor	& 328.2	& 329.1	& 3.4	\\
-Go \lstinline[language=Golang]|select| channel	& 365.0	& 365.5	& 1.2
+\multicolumn{1}{@{}r}{N\hspace*{10pt}} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
+\CFA @waitfor@, 1 monitor (10M)	& 367.1	& 365.3	& 5.0	\\
+\CFA @waitfor@, 2 monitor (10M)	& 463.0	& 464.6	& 7.1	\\
+\CFA @waitfor@, 4 monitor (10M)	& 689.6	& 696.2	& 21.5	\\
+\uC \lstinline[language=uC++]|_Accept| monitor (10M)	& 328.2	& 329.1	& 3.4	\\
+Go \lstinline[language=Golang]|select| channel (10M)	& 365.0	& 365.5	& 1.2
 \end{tabular}
 \end{multicols}
@@ -3157,8 +3158,8 @@
 
 \begin{multicols}{2}
-\lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}}
-\begin{cfa}
-@monitor@ M {} m1/*, m2, m3, m4*/;
-call( M & @mutex p1/*, p2, p3, p4*/@ ) {}
+\setlength{\tabcolsep}{3pt}
+\begin{cfa}[xleftmargin=0pt]
+`monitor` M {} m1/*, m2, m3, m4*/;
+call( M & `mutex p1/*, p2, p3, p4*/` ) {}
 int main() {
 	BENCH( for( N ) call( m1/*, m2, m3, m4*/ ); )
@@ -3175,15 +3176,15 @@
 \label{t:mutex}
 \begin{tabular}{@{}r*{3}{D{.}{.}{3.2}}@{}}
-\multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
-test-and-test-set lock			& 19.1	& 18.9	& 0.4	\\
-\CFA @mutex@ function, 1 arg.	& 48.3	& 47.8	& 0.9	\\
-\CFA @mutex@ function, 2 arg.	& 86.7	& 87.6	& 1.9	\\
-\CFA @mutex@ function, 4 arg.	& 173.4	& 169.4	& 5.9	\\
-\uC @monitor@ member rtn.		& 54.8	& 54.8	& 0.1	\\
-Goroutine mutex lock			& 34.0	& 34.0	& 0.0	\\
-Rust mutex lock					& 33.0	& 33.2	& 0.8	\\
-Java synchronized method (   100 000 000)		& 31.0	& 30.9	& 0.5	\\
-Java synchronized method (10 000 000 000)		& 31.0 & 30.2 & 0.9 \\
-Pthreads mutex Lock				& 31.0	& 31.1	& 0.4
+\multicolumn{1}{@{}r}{N\hspace*{10pt}} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
+test-and-test-set lock (50M)		& 19.1	& 18.9	& 0.4	\\
+\CFA @mutex@ function, 1 arg. (50M)	& 48.3	& 47.8	& 0.9	\\
+\CFA @mutex@ function, 2 arg. (50M)	& 86.7	& 87.6	& 1.9	\\
+\CFA @mutex@ function, 4 arg. (50M)	& 173.4	& 169.4	& 5.9	\\
+\uC @monitor@ member rtn. (50M)		& 54.8	& 54.8	& 0.1	\\
+Goroutine mutex lock (50M)			& 34.0	& 34.0	& 0.0	\\
+Rust mutex lock (50M)				& 33.0	& 33.2	& 0.8	\\
+Java synchronized method (50M)		& 31.0	& 30.9	& 0.5	\\
+% Java synchronized method (10 000 000 000)		& 31.0 & 30.2 & 0.9 \\
+Pthreads mutex Lock (50M)			& 31.0	& 31.1	& 0.4
 \end{tabular}
 \end{multicols}
@@ -3214,15 +3215,14 @@
 
 \begin{multicols}{2}
-\lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}}
-\begin{cfa}[aboveskip=0pt,belowskip=0pt]
-@coroutine@ C {};
-void main( C & ) { for () { @suspend;@ } }
+\begin{cfa}[xleftmargin=0pt]
+`coroutine` C {};
+void main( C & ) { for () { `suspend;` } }
 int main() { // coroutine test
 	C c;
-	BENCH( for ( N ) { @resume( c );@ } )
+	BENCH( for ( N ) { `resume( c );` } )
 	sout | result;
 }
 int main() { // thread test
-	BENCH( for ( N ) { @yield();@ } )
+	BENCH( for ( N ) { `yield();` } )
 	sout | result;
 }
@@ -3237,22 +3237,22 @@
 \label{t:ctx-switch}
 \begin{tabular}{@{}r*{3}{D{.}{.}{3.2}}@{}}
-\multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
-C function			& 1.8		& 1.8		& 0.0	\\
-\CFA generator		& 1.8		& 2.0		& 0.3	\\
-\CFA coroutine		& 32.5		& 32.9		& 0.8	\\
-\CFA thread			& 93.8		& 93.6		& 2.2	\\
-\uC coroutine		& 50.3		& 50.3		& 0.2	\\
-\uC thread			& 97.3		& 97.4		& 1.0	\\
-Python generator	& 40.9		& 41.3		& 1.5	\\
-Node.js await		& 1852.2	& 1854.7	& 16.4	\\
-Node.js generator	& 33.3		& 33.4		& 0.3	\\
-Goroutine thread	& 143.0		& 143.3		& 1.1	\\
-Rust async await	& 32.0		& 32.0		& 0.0	\\
-Rust tokio thread	& 143.0		& 143.0		& 1.7	\\
-Rust thread			& 332.0		& 331.4		& 2.4	\\
-Java thread	(      100 000)		& 405.0		& 415.0		& 17.6	\\
-Java thread (  100 000 000)			& 413.0 & 414.2 & 6.2 \\
-Java thread (5 000 000 000)			& 415.0 & 415.2 & 6.1 \\
-Pthreads thread		& 334.3		& 335.2		& 3.9
+\multicolumn{1}{@{}r}{N\hspace*{10pt}} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
+C function (10B)			& 1.8		& 1.8		& 0.0	\\
+\CFA generator (5B)			& 1.8		& 2.0		& 0.3	\\
+\CFA coroutine (100M)		& 32.5		& 32.9		& 0.8	\\
+\CFA thread (100M)			& 93.8		& 93.6		& 2.2	\\
+\uC coroutine (100M)		& 50.3		& 50.3		& 0.2	\\
+\uC thread (100M)			& 97.3		& 97.4		& 1.0	\\
+Python generator (100M)		& 40.9		& 41.3		& 1.5	\\
+Node.js await (5M)			& 1852.2	& 1854.7	& 16.4	\\
+Node.js generator (100M)	& 33.3		& 33.4		& 0.3	\\
+Goroutine thread (100M)		& 143.0		& 143.3		& 1.1	\\
+Rust async await (100M)		& 32.0		& 32.0		& 0.0	\\
+Rust tokio thread (100M)	& 143.0		& 143.0		& 1.7	\\
+Rust thread (25M)			& 332.0		& 331.4		& 2.4	\\
+Java thread (100M)			& 405.0		& 415.0		& 17.6	\\
+% Java thread (  100 000 000)			& 413.0 & 414.2 & 6.2 \\
+% Java thread (5 000 000 000)			& 415.0 & 415.2 & 6.1 \\
+Pthreads thread (25M)		& 334.3		& 335.2		& 3.9
 \end{tabular}
 \end{multicols}
@@ -3263,8 +3263,11 @@
 Languages using 1:1 threading based on pthreads can at best meet or exceed, due to language overhead, the pthread results.
 Note, pthreads has a fast zero-contention mutex lock checked in user space.
-Languages with M:N threading have better performance than 1:1 because there is no operating-system interactions.
+Languages with M:N threading have better performance than 1:1 because there are no operating-system interactions (context-switching or locking).
+As well, for locking experiments, M:N threading has less contention if only one kernel thread is used.
 Languages with stackful coroutines have higher cost than stackless coroutines because of stack allocation and context switching;
 however, stackful \uC and \CFA coroutines have approximately the same performance as stackless Python and Node.js generators.
 The \CFA stackless generator is approximately 25 times faster for suspend/resume and 200 times faster for creation than stackless Python and Node.js generators.
+The Node.js context-switch is costly when asynchronous await must enter the event engine because a promise is not fulfilled.
+Finally, the benchmark results correlate across programming languages with and without JIT, indicating the JIT has completed any runtime optimizations.
 
 
@@ -3324,5 +3327,5 @@
 
 The authors recognize the design assistance of Aaron Moss, Rob Schluntz, Andrew Beach, and Michael Brooks; David Dice for commenting and helping with the Java benchmarks; and Gregor Richards for helping with the Node.js benchmarks.
-This research is funded by a grant from Waterloo-Huawei (\url{http://www.huawei.com}) Joint Innovation Lab. %, and Peter Buhr is partially funded by the Natural Sciences and Engineering Research Council of Canada.
+This research is funded by the NSERC/Waterloo-Huawei (\url{http://www.huawei.com}) Joint Innovation Lab. %, and Peter Buhr is partially funded by the Natural Sciences and Engineering Research Council of Canada.
 
 {%
Index: doc/papers/concurrency/annex/local.bib
===================================================================
--- doc/papers/concurrency/annex/local.bib	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ doc/papers/concurrency/annex/local.bib	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -59,5 +59,5 @@
 @manual{Cpp-Transactions,
 	keywords	= {C++, Transactional Memory},
-	title		= {Technical Specification for C++ Extensions for Transactional Memory},
+	title		= {Tech. Spec. for C++ Extensions for Transactional Memory},
 	organization= {International Standard ISO/IEC TS 19841:2015 },
 	publisher   = {American National Standards Institute},
Index: doc/papers/concurrency/mail2
===================================================================
--- doc/papers/concurrency/mail2	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ doc/papers/concurrency/mail2	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -959,2 +959,117 @@
 Software: Practice and Experience Editorial Office
 
+
+
+Date: Wed, 2 Sep 2020 20:55:34 +0000
+From: Richard Jones <onbehalfof@manuscriptcentral.com>
+Reply-To: R.E.Jones@kent.ac.uk
+To: tdelisle@uwaterloo.ca, pabuhr@uwaterloo.ca
+Subject: Software: Practice and Experience - Decision on Manuscript ID
+ SPE-19-0219.R2
+
+02-Sep-2020
+
+Dear Dr Buhr,
+
+Many thanks for submitting SPE-19-0219.R2 entitled "Advanced Control-flow and Concurrency in Cforall" to Software: Practice and Experience. The paper has now been reviewed and the comments of the referees are included at the bottom of this letter. I apologise for the length of time it has taken to get these.
+
+Both reviewers consider this paper to be close to acceptance. However, before I can accept this paper, I would like you address the comments of Reviewer 2, particularly with regard to the description of the adaptation Java harness to deal with warmup. I would expect to see a convincing argument that the computation has reached a steady state. I would also like you to provide the values for N for each benchmark run. This should be very straightforward for you to do. There are a couple of papers on steady state that you may wish to consult (though I am certainly not pushing my own work).
+
+1) Barrett, Edd; Bolz-Tereick, Carl Friedrich; Killick, Rebecca; Mount, Sarah and Tratt, Laurence. Virtual Machine Warmup Blows Hot and Cold. OOPSLA 2017. https://doi.org/10.1145/3133876
+Virtual Machines (VMs) with Just-In-Time (JIT) compilers are traditionally thought to execute programs in two phases: the initial warmup phase determines which parts of a program would most benefit from dynamic compilation, before JIT compiling those parts into machine code; subsequently the program is said to be at a steady state of peak performance. Measurement methodologies almost always discard data collected during the warmup phase such that reported measurements focus entirely on peak performance. We introduce a fully automated statistical approach, based on changepoint analysis, which allows us to determine if a program has reached a steady state and, if so, whether that represents peak performance or not. Using this, we show that even when run in the most controlled of circumstances, small, deterministic, widely studied microbenchmarks often fail to reach a steady state of peak performance on a variety of common VMs. Repeating our experiment on 3 different machines, we found that at most 43.5% of pairs consistently reach a steady state of peak performance.
+
+2) Kalibera, Tomas and Jones, Richard. Rigorous Benchmarking in Reasonable Time. ISMM  2013. https://doi.org/10.1145/2555670.2464160
+Experimental evaluation is key to systems research. Because modern systems are complex and non-deterministic, good experimental methodology demands that researchers account for uncertainty. To obtain valid results, they are expected to run many iterations of benchmarks, invoke virtual machines (VMs) several times, or even rebuild VM or benchmark binaries more than once. All this repetition costs time to complete experiments. Currently, many evaluations give up on sufficient repetition or rigorous statistical methods, or even run benchmarks only in training sizes. The results reported often lack proper variation estimates and, when a small difference between two systems is reported, some are simply unreliable.In contrast, we provide a statistically rigorous methodology for repetition and summarising results that makes efficient use of experimentation time. Time efficiency comes from two key observations. First, a given benchmark on a given platform is typically prone to much less non-determinism than the common worst-case of published corner-case studies. Second, repetition is most needed where most uncertainty arises (whether between builds, between executions or between iterations). We capture experimentation cost with a novel mathematical model, which we use to identify the number of repetitions at each level of an experiment necessary and sufficient to obtain a given level of precision.We present our methodology as a cookbook that guides researchers on the number of repetitions they should run to obtain reliable results. We also show how to present results with an effect size confidence interval. As an example, we show how to use our methodology to conduct throughput experiments with the DaCapo and SPEC CPU benchmarks on three recent platforms.
+
+You have 42 days from the date of this email to submit your revision. If you are unable to complete the revision within this time, please contact me to request a short extension.
+
+You can upload your revised manuscript and submit it through your Author Center. Log into https://mc.manuscriptcentral.com/spe and enter your Author Center, where you will find your manuscript title listed under "Manuscripts with Decisions".
+
+When submitting your revised manuscript, you will be able to respond to the comments made by the referee(s) in the space provided.  You can use this space to document any changes you make to the original manuscript.
+
+If you would like help with English language editing, or other article preparation support, Wiley Editing Services offers expert help with English Language Editing, as well as translation, manuscript formatting, and figure formatting at www.wileyauthors.com/eeo/preparation. You can also check out our resources for Preparing Your Article for general guidance about writing and preparing your manuscript at www.wileyauthors.com/eeo/prepresources.
+ 
+Once again, thank you for submitting your manuscript to Software: Practice and Experience. I look forward to receiving your revision.
+
+Sincerely,
+Richard
+
+Prof. Richard Jones
+Editor, Software: Practice and Experience
+R.E.Jones@kent.ac.uk
+
+Referee(s)' Comments to Author:
+
+Reviewing: 1
+
+Comments to the Author
+Overall, I felt that this draft was an improvement on previous drafts and I don't have further changes to request. 
+
+I appreciated the new language to clarify the relationship of external and internal scheduling, for example, as well as the new measurements of Rust tokio. Also, while I still believe that the choice between thread/generator/coroutine and so forth could be made crisper and clearer, the current draft of Section 2 did seem adequate to me in terms of specifying the considerations that users would have to take into account to make the choice.
+
+
+Reviewing: 2
+
+Comments to the Author
+First: let me apologise for the delay on this review. I'll blame the global pandemic combined with my institution's senior management's counterproductive decisions for taking up most of my time and all of my energy.
+
+At this point, reading the responses, I think we've been around the course enough times that further iteration is unlikely to really improve the paper any further, so I'm happy to recommend acceptance.    My main comments are that there were some good points in the responses to *all* the reviews and I strongly encourage the authors to incorporate those discursive responses into the final paper so they may benefit readers as well as reviewers.   I agree with the recommendations of reviewer #2 that the paper could usefully be split in to two, which I think I made to a previous revision, but I'm happy to leave that decision to the Editor. 
+
+Finally, the paper needs to describe how the Java harness was adapted to deal with warmup; why the computation has warmed up and reached a steady state - similarly for js and Python. The tables should also give the "N" chosen for each benchmark run.
+ 
+minor points
+* don't start sentences with "However"
+* most downloaded isn't an "Award"
+
+
+
+Date: Thu, 1 Oct 2020 05:34:29 +0000
+From: Richard Jones <onbehalfof@manuscriptcentral.com>
+Reply-To: R.E.Jones@kent.ac.uk
+To: pabuhr@uwaterloo.ca
+Subject: Revision reminder - SPE-19-0219.R2
+
+01-Oct-2020
+
+Dear Dr Buhr
+
+SPE-19-0219.R2
+
+This is a reminder that your opportunity to revise and re-submit your manuscript will expire 14 days from now. If you require more time please contact me directly and I may grant an extension to this deadline, otherwise the option to submit a revision online, will not be available.
+
+If your article is of potential interest to the general public, (which means it must be timely, groundbreaking, interesting and impact on everyday society) then please e-mail ejp@wiley.co.uk explaining the public interest side of the research. Wiley will then investigate the potential for undertaking a global press campaign on the article.
+
+I look forward to receiving your revision.
+
+Sincerely,
+
+Prof. Richard Jones
+Editor, Software: Practice and Experience
+
+https://mc.manuscriptcentral.com/spe
+
+
+
+Date: Tue, 6 Oct 2020 15:29:41 +0000
+From: Mayank Roy Chowdhury <onbehalfof@manuscriptcentral.com>
+Reply-To: speoffice@wiley.com
+To: tdelisle@uwaterloo.ca, pabuhr@uwaterloo.ca
+Subject: SPE-19-0219.R3 successfully submitted
+
+06-Oct-2020
+
+Dear Dr Buhr,
+
+Your manuscript entitled "Advanced Control-flow and Concurrency in Cforall" has been successfully submitted online and is presently being given full consideration for publication in Software: Practice and Experience.
+
+Your manuscript number is SPE-19-0219.R3.  Please mention this number in all future correspondence regarding this submission.
+
+You can view the status of your manuscript at any time by checking your Author Center after logging into https://mc.manuscriptcentral.com/spe.  If you have difficulty using this site, please click the 'Get Help Now' link at the top right corner of the site.
+
+
+Thank you for submitting your manuscript to Software: Practice and Experience.
+
+Sincerely,
+
+Software: Practice and Experience Editorial Office
+
Index: doc/papers/concurrency/response3
===================================================================
--- doc/papers/concurrency/response3	(revision 41b8ea40e8b670142d317601e53338340fee08be)
+++ doc/papers/concurrency/response3	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -0,0 +1,27 @@
+        I would like you address the comments of Reviewer 2, particularly with regard to the description of the adaptation Java harness to deal with warmup. I would expect to see a convincing argument that the computation has reached a steady state.
+
+We understand referee2 and your concern about the JIT experiments, which is why we verified our experiments with two experts in JIT development for both Java and Node.js before submitting the paper. We also read the supplied papers, but most of the information is not applicable to our work for the following reasons.
+
+1. SPEC benchmarks are medium to large. In contrast, our benchmarks are 5-15 lines in length for each programming language (see code for the Cforall tests in the paper). Hence, there are no significant computations, complex control flow, or use of memory. They test one specific language feature (context switch, mutex call, etc.) in isolation over and over again. These language features are fixed (e.g., acquiring and releasing a lock is a fixed cost). Therefore, unless the feature can be removed there is nothing to optimize at runtime. But these features cannot be removed without changing the meaning of the benchmark. If the feature is removed, the timing result would be 0. In fact, it was difficult to prevent the JIT from completely eliding some benchmarks because there are no side-effects.
+
+2. All of our benchmark results correlate across programming languages with and without JIT, indicating the JIT has completed any runtime optimizations (added this sentence to Section 8.1). Any large differences are explained by how a language implements a feature, not by how the compiler/JIT processes that feature. Section 8.1 discusses these points in detail.
+
+3. We also added a sentence about running all JIT-based programming language experiments for 30 minutes and there was no statistical difference, med/avg/std correlated with the short-run experiments, which seems a convincing argument that the benchmark has reached a steady state. If the JIT takes longer than 30 minutes to achieve its optimization goals, it is unlikely to be useful.
+
+4. The purpose of the performance section is not to draw conclusions about improvements. It is to contrast program-language implementation approaches. Section 8.1 talks about ramifications of certain design and implementation decisions with respect to overall performance. The only conclusion we draw about performance is:
+
+   Performance comparisons with other concurrent systems and languages show the Cforall approach is competitive across all basic operations, which translates directly into good performance in well-written applications with advanced control-flow.
+
+
+       I would also like you to provide the values for N for each benchmark run.
+
+Done.
+
+
+Referee 2 suggested
+
+   * don't start sentences with "However"
+
+However, there are numerous grammar sites on the web indicating "however" (a conjunction) at the start of a sentence is acceptable, e.g.:
+
+https://www.merriam-webster.com/words-at-play/can-you-start-a-sentence-with-however This is a stylistic choice, more than anything else, as we have a considerable body of evidence of writers using however to begin sentences, frequently with the meaning of "nevertheless."
Index: doc/proposals/ZeroCostPreemption.md
===================================================================
--- doc/proposals/ZeroCostPreemption.md	(revision 41b8ea40e8b670142d317601e53338340fee08be)
+++ doc/proposals/ZeroCostPreemption.md	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -0,0 +1,16 @@
+## "Zero Cost" Preemption for Cforall ##
+
+Similar to "Zero Cost" exceptions, this is a proposal to support preemption with little to no runtime cost for the book-keeping. (Other than having exceptions).
+
+Preemption stops user threads at random locations and forces a context switch using a signal handler. Since this is not safe and/or does not make sense in many contexts, the runtime needs a system to disable interrupts for certain regions of code.
+
+Currently, Cforall uses _[kernel] thread-local storage_(TLS) to handle this, setting a flag to false when preemption should be disabled. This works on x86/x64 but only with a specific TLS model, and does not work with ARM. The problem is that if the loading of the TLS variable is not done in a single instruction, it allows a race condition, where user-threads could disable preemption for the wrong processor, i.e., be moved to a different processor and update the previous processor.
+
+The fix being worked on is to protect the specific TLS variable with a special function.
+
+## The Proposal ##
+A better approach would be to re-use the Exception Handling Data structure to identify regions of code that do not allow preemption. These regions of code would be marked using the same mechanism which marks stack unwinding requirements.
+
+When the signal handler is called, it would search the stack similarly to how the stack is searched when an exception is thrown and do the context switch or not based on the result.
+
+This is an optimization, since signal handlers for preemption are already rare and costly but enabling/disabling interrupts is very common (1000x more common). Using the "Zero-Cost" exception mechanism, enabling/disabling interrupts should be free at runtime and the rare signal/handler becomes more expensive.
Index: doc/refrat/refrat.tex
===================================================================
--- doc/refrat/refrat.tex	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ doc/refrat/refrat.tex	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -11,6 +11,6 @@
 %% Created On       : Wed Apr  6 14:52:25 2016
 %% Last Modified By : Peter A. Buhr
-%% Last Modified On : Thu Sep 24 16:34:51 2020
-%% Update Count     : 109
+%% Last Modified On : Mon Oct  5 09:02:53 2020
+%% Update Count     : 110
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
@@ -63,5 +63,5 @@
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
-\CFADefaults											% use default CFA format-style
+\CFAStyle												% use default CFA format-style
 \lstnewenvironment{C++}[1][]                            % use C++ style
 {\lstset{language=C++,moredelim=**[is][\protect\color{red}]{®}{®},#1}}
Index: doc/theses/fangren_yu_COOP_S20/Makefile
===================================================================
--- doc/theses/fangren_yu_COOP_S20/Makefile	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ doc/theses/fangren_yu_COOP_S20/Makefile	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -46,5 +46,4 @@
 # File Dependencies #
 
-
 ${DOCUMENT} : ${BASE}.ps
 	ps2pdf $<
Index: doc/theses/fangren_yu_COOP_S20/Report.tex
===================================================================
--- doc/theses/fangren_yu_COOP_S20/Report.tex	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ doc/theses/fangren_yu_COOP_S20/Report.tex	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -1,3 +1,3 @@
-\documentclass[twoside,12pt]{article}
+\documentclass[twoside,11pt]{article}
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -11,7 +11,8 @@
 \usepackage[labelformat=simple,aboveskip=0pt,farskip=0pt]{subfig}
 \renewcommand{\thesubfigure}{\alph{subfigure})}
+\usepackage[flushmargin]{footmisc}						% support label/reference in footnote
 \usepackage{latexsym}                                   % \Box glyph
 \usepackage{mathptmx}                                   % better math font with "times"
-\usepackage{appendix}
+\usepackage[toc]{appendix}								% article does not have appendix
 \usepackage[usenames]{color}
 \input{common}                                          % common CFA document macros
@@ -39,10 +40,10 @@
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
-\CFADefaults
+\CFAStyle												% CFA code-style for all languages
 \lstset{
-language=C++,											% make C++ the default language
+language=C++,moredelim=**[is][\color{red}]{@}{@}		% make C++ the default language
 }% lstset
 \lstnewenvironment{C++}[1][]                            % use C++ style
-{\lstset{language=C++,moredelim=**[is][\color{red}]{@}{@},#1}}{}
+{\lstset{language=C++,moredelim=**[is][\color{red}]{@}{@}}\lstset{#1}}{}
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -87,5 +88,5 @@
 \section{Overview}
 
-cfa-cc is the reference compiler for the \CFA programming language, which is a non-object-oriented extension to C.
+@cfa-cc@ is the reference compiler for the \CFA programming language, which is a non-object-oriented extension to C.
 \CFA attempts to introduce productive modern programming language features to C while maintaining as much backward-compatibility as possible, so that most existing C programs can seamlessly work with \CFA.
 
@@ -381,7 +382,4 @@
 \subsubsection{Source: \lstinline{AST/SymbolTable.hpp}}
 
-
-\subsubsection{Source: \lstinline{SymTab/Indexer.h}}
-
 Function
 \begin{C++}
@@ -612,35 +610,35 @@
 
 
-\begin{appendices}[toc,titletoc]
+\appendix
 \section{Appendix}
-
 
 \subsection{Kinds of Type Parameters}
 \label{s:KindsTypeParameters}
 
-A type parameter in a @forall@ clause has three possible kinds:
+A type parameter in a @forall@ clause has 3 kinds:
 \begin{enumerate}[listparindent=0pt]
 \item
-@dtype@: any data type (built-in or user defined).
-
-There is also a difference between opaque types (incomplete types, \ie those with only a forward declaration) and concrete types.
-Only concrete types can be directly used as a variable type.
-
-\CFA provides the @otype@ shorthand to require a type parameter be concrete, which also implicitly asserts the existence of its default and copy constructors, assignment, and destructor\footnote{\CFA implements the same automatic resource management (RAII) semantics as \CC.}.
-\item
-@ftype@: any function type.
-
-@ftype@ provides two purposes:
-\begin{itemize}
-\item
-Differentiate function pointer from data pointer because (in theory) some systems have different sizes for these pointers.
-\item
-Disallow a function pointer to match an overloaded data pointer, since variables and functions can have the same names.
-\end{itemize}
+@dtype@: any data type (built-in or user defined) that is not a concrete type.
+
+A non-concrete type is an incomplete type such as an opaque type or pointer/reference with an implicit (pointer) size and implicitly generated reference and dereference operations.
+\item
+@otype@: any data type (built-in or user defined) that is a concrete type.
+
+A concrete type is a complete type, \ie types that can be used to create a variable, which also implicitly asserts the existence of default and copy constructors, assignment, and destructor\footnote{\CFA implements the same automatic resource management (RAII) semantics as \CC.}.
+% \item
+% @ftype@: any function type.
+% 
+% @ftype@ provides two purposes:
+% \begin{itemize}
+% \item
+% Differentiate function pointer from data pointer because (in theory) some systems have different sizes for these pointers.
+% \item
+% Disallow a function pointer to match an overloaded data pointer, since variables and functions can have the same names.
+% \end{itemize}
 
 \item
 @ttype@: tuple (variadic) type.
 
-@ttype@ parameter may only appear as type of the last parameter in a function, and it provides a type-safe way to implement variadic functions.
+Restricted to the type for the last parameter in a function, it provides a type-safe way to implement variadic functions.
 Note however, that it has certain restrictions, as described in the implementation section below.
 \end{enumerate}
@@ -673,10 +671,7 @@
 \begin{enumerate}
 \item
-All types are function declarations are candidates of implicit parameters.
+All types, variables, and functions are candidates of implicit parameters.
 \item
 The parameter (assertion) name must match the actual declarations.
-\item
-Currently, assertions are all functions.
-Note that since \CFA has variable overloading, implicit value parameters might also be supported in the future.
 \end{enumerate}
 
@@ -732,5 +727,4 @@
 In particular, polymorphic variadic recursion must be structural (\ie the number of arguments decreases in any possible recursive calls), otherwise code generation gets into an infinite loop.
 The \CFA compiler sets a limit on assertion depth and reports an error if assertion resolution does not terminate within the limit (as for \lstinline[language=C++]@templates@ in \CC).
-\end{appendices}
 
 \bibliographystyle{plain}
Index: doc/theses/thierry_delisle_PhD/code/readQ_example/Makefile
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readQ_example/Makefile	(revision 41b8ea40e8b670142d317601e53338340fee08be)
+++ doc/theses/thierry_delisle_PhD/code/readQ_example/Makefile	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -0,0 +1,13 @@
+# Builds the gui-proto example driver from proto-gui/main.o and thrdlib/thread.o.
+# No explicit compile rule is given, so the objects come from make's built-in rules, which use CXXFLAGS.
+
+.PHONY: all
+all: gui-proto
+
+CXXFLAGS = -fpic -g -O0 -I.
+
+# -ldl is listed after the objects: the linker resolves symbols in command-line
+# order, so a library named before the objects that reference it may be skipped
+# (e.g. under --as-needed or with a static libdl).
+gui-proto: proto-gui/main.o thrdlib/thread.o
+	$(CXX) -pthread -o ${@} ${^} -ldl -ftls-model=initial-exec
Index: doc/theses/thierry_delisle_PhD/code/readQ_example/proto-gui/main.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readQ_example/proto-gui/main.cpp	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ doc/theses/thierry_delisle_PhD/code/readQ_example/proto-gui/main.cpp	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -1,3 +1,3 @@
-#include "thrdlib/thread.h"
+#include "thrdlib/thread.hpp"
 
 #include <cassert>
@@ -5,8 +5,33 @@
 #include <algorithm>
 #include <atomic>
+#include <iostream>
 #include <memory>
 #include <vector>
 
 #include <getopt.h>
+using thrdlib::thread_t;
+
+
+extern __attribute__((aligned(128))) thread_local struct {	// declaration only (extern): the storage is defined outside this file
+	void * volatile this_thread;
+	void * volatile this_processor;
+	void * volatile this_stats;
+
+	struct {
+		volatile unsigned short disable_count;
+		volatile bool enabled;
+		volatile bool in_progress;
+	} preemption_state;
+
+	#if defined(__SIZEOF_INT128__)
+		__uint128_t rand_seed;	// 128-bit seed when the compiler provides __int128
+	#else
+		uint64_t rand_seed;	// 64-bit fallback
+	#endif
+	struct {
+		uint64_t fwd_seed;
+		uint64_t bck_seed;
+	} ready_rng;
+} kernelTLS __attribute__ ((tls_model ( "initial-exec" )));	// NOTE(review): presumably mirrors the Cforall runtime's kernelTLS, so field order and sizes must match that definition — confirm
 
 //--------------------
@@ -36,5 +61,5 @@
 			assert( expected == reset );
 			if( std::atomic_compare_exchange_strong( &state, &expected, self) ) {
-				thrdlib_park( self );
+				thrdlib::park( self );
 				ret = true;
 				goto END;
@@ -54,5 +79,5 @@
 		if( got == reset ) return false;
 
-		thrdlib_unpark( got );
+		thrdlib::unpark( got );
 		return true;
 	}
@@ -109,5 +134,5 @@
 	the_stats_thread = self;
 	fence();
-	thrdlib_park( self );
+	thrdlib::park( self );
 
 	std::vector<bool> seen;
@@ -115,5 +140,5 @@
 
 	while(last_produced < nproduce) {
-		thrdlib_yield();
+		thrdlib::yield();
 		thrd_stats.stats.ran++;
 		if( last_produced > 0 ) seen.at(last_produced - 1) = true;
@@ -147,5 +172,5 @@
 
 void Renderer( thread_t self ) {
-	thrdlib_unpark( the_stats_thread );
+	thrdlib::unpark( the_stats_thread );
 	for(unsigned i = 0; i < nproduce; i++) {
 		auto & frame = frames[i % nframes];
@@ -178,4 +203,6 @@
 	fsize    = 1000;
 	nproduce = 60;
+
+	const char * framework;
 
 	for(;;) {
@@ -196,4 +223,10 @@
 			case -1:
 				/* paranoid */ assert(optind <= argc);
+				if( optind == argc ) {
+					std::cerr << "Must specify a framework" << std::endl;
+					goto usage;
+
+				}
+				framework = argv[optind];
 				goto run;
 			case 'b':
@@ -228,5 +261,5 @@
 				std::cerr << opt << std::endl;
 			usage:
-				std::cerr << "Usage: " << argv[0] << " [options]" << std::endl;
+				std::cerr << "Usage: " << argv[0] << " [options] framework" << std::endl;
 				std::cerr << std::endl;
 				std::cerr << "  -b, --buff=COUNT    Number of frames to buffer" << std::endl;
@@ -237,4 +270,5 @@
 	}
 	run:
+	assert( framework );
 
 	frames.reset(new Frame[nframes]);
@@ -246,19 +280,21 @@
 	std::cout << "(Buffering " << nframes << ")" << std::endl;
 
-	thrdlib_setproccnt( 2 );
-
-	thread_t stats     = thrdlib_create( Stats     );
+	thrdlib::init( framework, 2 );
+
+	thread_t stats     = thrdlib::create( Stats );
 	std::cout << "Created Stats Thread" << std::endl;
-	while( the_stats_thread == nullptr ) thrdlib_yield();
+	while( the_stats_thread == nullptr ) thrdlib::yield();
+
 	std::cout << "Creating Main Threads" << std::endl;
-	thread_t renderer  = thrdlib_create( Renderer  );
-	// while(true);
-	thread_t simulator = thrdlib_create( Simulator );
+	thread_t renderer  = thrdlib::create( Renderer  );
+	thread_t simulator = thrdlib::create( Simulator );
 
 	std::cout << "Running" << std::endl;
 
-	thrdlib_join( simulator );
-	thrdlib_join( renderer  );
-	thrdlib_join( stats     );
+	thrdlib::join( simulator );
+	thrdlib::join( renderer  );
+	thrdlib::join( stats     );
+
+	thrdlib::clean();
 
 	std::cout << "----------" << std::endl;
Index: doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/Makefile
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/Makefile	(revision 41b8ea40e8b670142d317601e53338340fee08be)
+++ doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/Makefile	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -0,0 +1,25 @@
+# Builds one shared object per threading back end: fibre.so, pthread.so and cforall.so.
+# Each target also depends on this Makefile, so editing the flags forces a rebuild.
+
+.PHONY: all clean
+all: fibre.so pthread.so cforall.so
+
+# Remove everything "all" builds, cforall.so included.
+clean:
+	rm -rf fibre.so pthread.so cforall.so
+
+CXXFLAGS=-Wall -Wextra -O3 -g -fpic -std=c++17 -pthread -ftls-model=initial-exec
+
+pthread.so: pthread.cpp Makefile
+	$(CXX) $(CXXFLAGS) -shared -o ${@} ${<}
+
+fibre.so: fibre.cpp Makefile
+	$(CXX) $(CXXFLAGS) -shared -o ${@} ${<} -lfibre
+
+# Cforall headers and libraries are taken from a per-user install under ${HOME}/local.
+CFAINC=${HOME}/local/include/cfa-dev
+CFALIB=${HOME}/local/lib/cfa-dev/x64-debug
+CFAFLAGS=-z execstack -I${CFAINC} -I${CFAINC}/concurrency -L${CFALIB} -Wl,-rpath,${CFALIB}
+
+cforall.so: cforall.cpp Makefile
+	$(CXX) $(CXXFLAGS) $(CFAFLAGS) -shared -o ${@} ${<} -lcfathread -lcfa -ldl -lm
Index: doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/cforall.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/cforall.cpp	(revision 41b8ea40e8b670142d317601e53338340fee08be)
+++ doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/cforall.cpp	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -0,0 +1,56 @@
+// thrdlib back end that forwards each thrdlib_* entry point to the cfathread C API.
+
+// Checked before the C++-only header and static_assert below; placed after
+// them, a C compiler would fail on those first and never reach this message.
+#if !defined(__cplusplus)
+#error no __cplusplus define!
+#endif
+
+#include <cassert>
+#include <clib/cfathread.h>
+
+typedef cfathread_t thread_t;
+static_assert(sizeof(thread_t) == sizeof(void*), "thread_t musst be of same size as void*");
+
+extern "C" {
+	//--------------------
+	// Basic thread support
+
+	// Forwards to cfathread_create and returns the handle it produces.
+	thread_t thrdlib_create( void (*the_main)( thread_t ) ) {
+		return cfathread_create( the_main );
+	}
+
+	// Forwards to cfathread_join for the given handle.
+	void thrdlib_join( thread_t handle ) {
+		cfathread_join( handle );
+	}
+
+	// The handle is ignored: cfathread_park is called without arguments.
+	void thrdlib_park( thread_t ) {
+		cfathread_park();
+	}
+
+	// Forwards to cfathread_unpark for the given handle.
+	void thrdlib_unpark( thread_t handle ) {
+		cfathread_unpark( handle );
+	}
+
+	// Forwards to cfathread_yield.
+	void thrdlib_yield( void ) {
+		cfathread_yield();
+	}
+
+	//--------------------
+	// Basic kernel features
+
+	// Passes the requested count to cfathread_setproccnt.
+	void thrdlib_init( int procs ) {
+		cfathread_setproccnt(procs);
+	}
+
+	// Calls cfathread_setproccnt(1) (NOTE(review): presumably to drop back to a single processor before exit — confirm).
+	void thrdlib_clean( void ) {
+		cfathread_setproccnt(1);
+	}
+}
Index: doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/fibre.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/fibre.cpp	(revision 41b8ea40e8b670142d317601e53338340fee08be)
+++ doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/fibre.cpp	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -0,0 +1,62 @@
+// thrdlib back end that forwards each thrdlib_* entry point to the libfibre C API (cfibre).
+
+#include <cassert>
+#include <libfibre/cfibre.h>
+
+typedef cfibre_t thread_t;
+static_assert(sizeof(thread_t) == sizeof(void*), "thread_t musst be of same size as void*");
+
+// Trampoline with the void*(void*) signature handed to cfibre_create: arg carries
+// the user entry point, which is called with the running fibre's own handle.
+// File-local (static) so the only symbols this file exports are thrdlib_*.
+static void * fibre_runner(void * arg) {
+	auto the_main = (void (*)( thread_t ))arg;
+	the_main( cfibre_self() );
+	return nullptr;
+}
+
+extern "C" {
+	//--------------------
+	// Basic thread support
+
+	// Create a fibre that runs the_main through fibre_runner and return its handle.
+	thread_t thrdlib_create( void (*the_main)( thread_t ) ) {
+		thread_t fibre{};	// value-initialised so a create that writes nothing cannot return garbage
+		cfibre_create( &fibre, nullptr, fibre_runner, (void*)the_main );
+		return fibre;
+	}
+
+	// Forwards to cfibre_join; no return value is collected (nullptr is passed).
+	void thrdlib_join( thread_t handle ) {
+		cfibre_join( handle, nullptr );
+	}
+
+	// handle must be the caller's own fibre (asserted); cfibre_park itself takes no handle.
+	void thrdlib_park( thread_t handle ) {
+		assert( handle == cfibre_self() );
+		cfibre_park();
+	}
+
+	// Forwards to cfibre_unpark for the given handle.
+	void thrdlib_unpark( thread_t handle ) {
+		cfibre_unpark( handle );
+	}
+
+	// Forwards to cfibre_yield.
+	void thrdlib_yield( void ) {
+		cfibre_yield();
+	}
+
+	//--------------------
+	// Basic kernel features
+
+	// Calls cfibre_init_n with 1 and the requested count (NOTE(review): argument meanings not visible here — confirm against cfibre.h).
+	void thrdlib_init( int procs ) {
+		cfibre_init_n(1, procs );
+	}
+
+	// No clean-up is performed for this back end.
+	void thrdlib_clean( void ) {
+
+	}
+}
Index: doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/pthread.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/pthread.cpp	(revision 41b8ea40e8b670142d317601e53338340fee08be)
+++ doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/pthread.cpp	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -0,0 +1,99 @@
+#include <pthread.h>
+#include <errno.h>
+#include <cstring>
+#include <cstdio>
+#include <iostream>
+
+#define CHECKED(x) { int err = x; if( err != 0 ) { std::cerr << "KERNEL ERROR: Operation \"" #x "\" return error " << err << " - " << strerror(err) << std::endl; std::abort(); } }
+
+struct __bin_sem_t { // binary semaphore built from a mutex and a condition variable; val only ever holds 0 or 1
+	pthread_mutex_t 	lock;
+	pthread_cond_t  	cond;
+	int     		val;
+
+	__bin_sem_t() {
+		// Create the mutex with error checking
+		pthread_mutexattr_t mattr;
+		CHECKED( pthread_mutexattr_init( &mattr ) );
+		CHECKED( pthread_mutexattr_settype( &mattr, PTHREAD_MUTEX_ERRORCHECK_NP) );
+		CHECKED( pthread_mutex_init(&lock, &mattr) );
+		CHECKED( pthread_mutexattr_destroy( &mattr ) ); // the attribute object is not needed once the mutex exists
+		CHECKED( pthread_cond_init (&cond, nullptr) );
+		val = 0;
+	}
+
+	~__bin_sem_t() {
+		CHECKED( pthread_mutex_destroy(&lock) );
+		CHECKED( pthread_cond_destroy (&cond) );
+	}
+
+	void wait() { // blocks until val is 1, then consumes it
+		CHECKED( pthread_mutex_lock(&lock) );
+			while(val < 1) { // loop guards against spurious wake-ups
+				CHECKED( pthread_cond_wait(&cond, &lock) );
+			}
+			val -= 1;
+		CHECKED( pthread_mutex_unlock(&lock) );
+	}
+
+	bool post() { // raises val to 1 if it was 0; returns whether this call changed it
+		bool needs_signal = false;
+
+		CHECKED( pthread_mutex_lock(&lock) );
+			if(val < 1) {
+				val += 1;
+				CHECKED( pthread_cond_signal(&cond) );
+				needs_signal = true;
+			}
+		CHECKED( pthread_mutex_unlock(&lock) );
+
+		return needs_signal;
+	}
+};
+
+#undef CHECKED
+
+//--------------------
+// Basic types
+struct pthread_runner_t {
+	pthread_t handle;
+	__bin_sem_t sem;
+};
+typedef pthread_runner_t * thread_t;
+
+static_assert(sizeof(thread_t) == sizeof(void*), "thread_t musst be of same size as void*");
+
+extern "C" { // C linkage: thread.cpp looks these names up with dlsym
+	//--------------------
+	// Basic thread support
+	thread_t thrdlib_create( void (*main)( thread_t ) ) { // allocates a runner and starts a pthread that receives it as argument
+		thread_t thrd = new pthread_runner_t();
+		int r = pthread_create( &thrd->handle, nullptr, (void *(*)(void *))main, thrd ); // NOTE(review): main is called through a differently-typed function pointer
+		if( r != 0 ) std::abort();
+		return thrd;
+	}
+
+	void thrdlib_join( thread_t handle ) { // joins the pthread, then frees the runner allocated by thrdlib_create
+		void * ret;
+		int r = pthread_join( handle->handle, &ret );
+		if( r != 0 ) std::abort();
+		delete handle;
+	}
+
+	void thrdlib_park( thread_t handle ) { // blocks on the runner's semaphore
+		handle->sem.wait();
+	}
+
+	void thrdlib_unpark( thread_t handle ) { // posts the runner's semaphore; the bool result is ignored
+		handle->sem.post();
+	}
+
+	void thrdlib_yield( void ) {
+		int r = sched_yield(); // standard replacement for the non-portable pthread_yield(); visible through <pthread.h>
+		if( r != 0 ) std::abort();
+	}
+
+	//--------------------
+	// Basic kernel features
+	void thrdlib_init( int ) {} // the processor-count argument is ignored by this backend
+}
Index: doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/thread.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/thread.cpp	(revision 41b8ea40e8b670142d317601e53338340fee08be)
+++ doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/thread.cpp	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -0,0 +1,68 @@
+#include "thread.hpp"
+
+#include <cstdarg>										// va_start, va_end
+#include <cstdio>
+#include <cstring>										// strlen
+extern "C" {
+	#include <unistd.h>										// _exit, getpid
+	#include <signal.h>
+	#include <dlfcn.h>										// dlopen, dlsym
+	#include <execinfo.h>									// backtrace, messages
+}
+
+#include <iostream>
+#include <string>
+
+using thrdlib::thread_t;
+
+thread_t (*thrdlib::create)( void (*main)( thread_t ) ) = nullptr; // all pointers below stay null until thrdlib::init binds them
+void (*thrdlib::join)( thread_t handle ) = nullptr;
+void (*thrdlib::park)( thread_t handle ) = nullptr;
+void (*thrdlib::unpark)( thread_t handle ) = nullptr;
+void (*thrdlib::yield)( void ) = nullptr;
+void (*lib_clean)(void) = nullptr; // optional backend hook; may remain null after init
+
+typedef void (*fptr_t)(); // generic function-pointer type; callers cast it to the real signature
+static fptr_t open_symbol( void * library, const char * symbol, bool required ) { // dlsym wrapper: aborts only when a required symbol reports an error
+	void * ptr = dlsym( library, symbol );
+
+	const char * error = dlerror(); // always read, so a failed optional lookup does not leave a pending error
+	if ( required && error ) {
+		std::cerr << "Fetching symbol '" << symbol << "' failed with error '" << error << "'\n";
+		std::abort();
+	}
+
+	return (fptr_t)ptr; // may be null when the symbol is optional and absent
+}
+
+//--------------------
+// Basic kernel features
+void thrdlib::init( const char * name, int procs ) { // loads "<name>.so" from this source file's directory and binds the backend entry points
+	std::string file = __FILE__;
+	std::size_t found = file.find_last_of("/");
+	std::string libname = file.substr(0,found+1) + name + ".so"; // directory part of __FILE__ (empty when it contains no '/') + name + ".so"
+
+	std::cout << "Use framework " << name << "(" << libname << ")\n";
+
+	void * library = dlopen( libname.c_str(), RTLD_NOW );
+	if ( const char * error = dlerror() ) {
+		std::cerr << "Could not open library '" << libname << "' from name '" << name <<"'\n";
+		std::cerr << "Error was : '" << error << "'\n";
+		std::abort();
+	}
+
+	void (*lib_init)( int ) = (void (*)( int ))open_symbol( library, "thrdlib_init", false ); // optional: null when the backend has no init hook
+	lib_clean = open_symbol( library, "thrdlib_clean" , false ); // optional: null-checked in thrdlib::clean
+
+	thrdlib::create = (typeof(thrdlib::create))open_symbol( library, "thrdlib_create", true  );
+	thrdlib::join   = (typeof(thrdlib::join  ))open_symbol( library, "thrdlib_join"  , true  );
+	thrdlib::park   = (typeof(thrdlib::park  ))open_symbol( library, "thrdlib_park"  , true  );
+	thrdlib::unpark = (typeof(thrdlib::unpark))open_symbol( library, "thrdlib_unpark", true  );
+	thrdlib::yield  = (typeof(thrdlib::yield ))open_symbol( library, "thrdlib_yield" , true  );
+
+	if ( lib_init ) lib_init( procs ); // the hook is optional, so never call through a null pointer
+}
+
+void thrdlib::clean( void ) { // runs the backend's clean-up hook when one was found by thrdlib::init
+	if(lib_clean) lib_clean();
+}
Index: doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/thread.h
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/thread.h	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ 	(revision )
@@ -1,2 +1,0 @@
-
-#include "thread_pthread.h"
Index: doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/thread.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/thread.hpp	(revision 41b8ea40e8b670142d317601e53338340fee08be)
+++ doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/thread.hpp	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -0,0 +1,18 @@
+#pragma once
+
+namespace thrdlib { // front end over a threading backend chosen at run time by init()
+	typedef void * thread_t; // opaque handle; each backend asserts its own handle type has the size of void*
+
+	//--------------------
+	// Basic thread support
+	extern thread_t (*create)( void (*main)( thread_t ) ); // function pointers, defined in thread.cpp and bound by init()
+	extern void (*join)( thread_t handle );
+	extern void (*park)( thread_t handle );
+	extern void (*unpark)( thread_t handle );
+	extern void (*yield)( void ) ;
+
+	//--------------------
+	// Basic kernel features
+	extern void init( const char * name, int procs ); // `name` selects the "<name>.so" backend; `procs` is handed to its init hook
+	extern void clean( void );
+};
Index: doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/thread_pthread.h
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/thread_pthread.h	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ 	(revision )
@@ -1,107 +1,0 @@
-#pragma once
-
-#include <pthread.h>
-#include <errno.h>
-#include <cstring>
-#include <cstdio>
-#include <iostream>
-
-#define CHECKED(x) { int err = x; if( err != 0 ) { std::cerr << "KERNEL ERROR: Operation \"" #x "\" return error " << err << " - " << strerror(err) << std::endl; std::abort(); } }
-
-struct __bin_sem_t {
-	pthread_mutex_t 	lock;
-	pthread_cond_t  	cond;
-	int     		val;
-
-	__bin_sem_t() {
-		// Create the mutex with error checking
-		pthread_mutexattr_t mattr;
-		pthread_mutexattr_init( &mattr );
-		pthread_mutexattr_settype( &mattr, PTHREAD_MUTEX_ERRORCHECK_NP);
-		pthread_mutex_init(&lock, &mattr);
-
-		pthread_cond_init (&cond, nullptr);
-		val = 0;
-	}
-
-	~__bin_sem_t() {
-		CHECKED( pthread_mutex_destroy(&lock) );
-		CHECKED( pthread_cond_destroy (&cond) );
-	}
-
-	void wait() {
-		CHECKED( pthread_mutex_lock(&lock) );
-			while(val < 1) {
-				pthread_cond_wait(&cond, &lock);
-			}
-			val -= 1;
-		CHECKED( pthread_mutex_unlock(&lock) );
-	}
-
-	bool post() {
-		bool needs_signal = false;
-
-		CHECKED( pthread_mutex_lock(&lock) );
-			if(val < 1) {
-				val += 1;
-				pthread_cond_signal(&cond);
-				needs_signal = true;
-			}
-		CHECKED( pthread_mutex_unlock(&lock) );
-
-		return needs_signal;
-	}
-};
-
-#undef CHECKED
-
-#if defined(__cforall) || defined(__cpluplus)
-extern "C" {
-#endif
-	//--------------------
-	// Basic types
-	struct pthread_runner_t {
-		pthread_t handle;
-		__bin_sem_t sem;
-	};
-	typedef pthread_runner_t * thread_t;
-
-	//--------------------
-	// Basic thread support
-	thread_t thrdlib_create( void (*main)( thread_t ) ) {
-		thread_t thrd = new pthread_runner_t();
-		int r = pthread_create( &thrd->handle, nullptr, (void *(*)(void *))main, thrd );
-		if( r != 0 ) std::abort();
-		return thrd;
-	}
-
-	void thrdlib_join( thread_t handle ) {
-		void * ret;
-		int r = pthread_join( handle->handle, &ret );
-		if( r != 0 ) std::abort();
-		delete handle;
-	}
-
-	void thrdlib_park( thread_t handle ) {
-		handle->sem.wait();
-	}
-
-	void thrdlib_unpark( thread_t handle ) {
-		handle->sem.post();
-	}
-
-	void thrdlib_yield( void ) {
-		int r = pthread_yield();
-		if( r != 0 ) std::abort();
-	}
-
-	//--------------------
-	// Basic kernel features
-	void thrdlib_setproccnt( int ) {
-
-	}
-
-
-#if defined(__cforall) || defined(__cpluplus)
-}
-#endif
Index: doc/user/Makefile
===================================================================
--- doc/user/Makefile	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ doc/user/Makefile	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -55,5 +55,5 @@
 
 ${DOCUMENT} : ${BASE}.ps
-	ps2pdf $<
+	ps2pdf -dPDFSETTINGS=/prepress $<
 
 ${BASE}.ps : ${BASE}.dvi
Index: doc/user/user.tex
===================================================================
--- doc/user/user.tex	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ doc/user/user.tex	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -11,6 +11,6 @@
 %% Created On       : Wed Apr  6 14:53:29 2016
 %% Last Modified By : Peter A. Buhr
-%% Last Modified On : Thu Sep 24 16:34:52 2020
-%% Update Count     : 3997
+%% Last Modified On : Mon Oct  5 08:57:29 2020
+%% Update Count     : 3998
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
@@ -66,5 +66,5 @@
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
-\CFADefaults											% use default CFA format-style
+\CFAStyle												% use default CFA format-style
 \lstnewenvironment{C++}[1][]                            % use C++ style
 {\lstset{language=C++,moredelim=**[is][\protect\color{red}]{®}{®},#1}}
Index: libcfa/src/bits/containers.hfa
===================================================================
--- libcfa/src/bits/containers.hfa	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ libcfa/src/bits/containers.hfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -157,4 +157,15 @@
 			tail = &get_next( *val );
 			*tail = 1p;
+		}
+
+		T * peek( __queue(T) & this ) { // returns this.head without removing it, or 0p when head holds the 1p marker
+			verify(*this.tail == 1p); // the slot tail points at must hold 1p on entry
+			T * head = this.head;
+			if( head != 1p ) {
+				verify(*this.tail == 1p); // same check, repeated before each return
+				return head;
+			}
+			verify(*this.tail == 1p);
+			return 0p;
 		}
 
Index: libcfa/src/bits/locks.hfa
===================================================================
--- libcfa/src/bits/locks.hfa	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ libcfa/src/bits/locks.hfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -164,6 +164,6 @@
 
 	struct $thread;
-	extern void park( __cfaabi_dbg_ctx_param );
-	extern void unpark( struct $thread * this __cfaabi_dbg_ctx_param2 );
+	extern void park( void );
+	extern void unpark( struct $thread * this );
 	static inline struct $thread * active_thread ();
 
@@ -191,5 +191,5 @@
 					/* paranoid */ verify( expected == 0p );
 					if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-						park( __cfaabi_dbg_ctx );
+						park();
 						return true;
 					}
@@ -210,5 +210,5 @@
 				else {
 					if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-						unpark( expected __cfaabi_dbg_ctx2 );
+						unpark( expected );
 						return true;
 					}
@@ -244,5 +244,5 @@
 				/* paranoid */ verify( expected == 0p );
 				if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-					park( __cfaabi_dbg_ctx );
+					park();
 					/* paranoid */ verify( this.ptr == 1p );
 					return true;
@@ -256,5 +256,5 @@
 			struct $thread * got = __atomic_exchange_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
 			if( got == 0p ) return false;
-			unpark( got __cfaabi_dbg_ctx2 );
+			unpark( got );
 			return true;
 		}
Index: libcfa/src/concurrency/CtxSwitch-i386.S
===================================================================
--- libcfa/src/concurrency/CtxSwitch-i386.S	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ libcfa/src/concurrency/CtxSwitch-i386.S	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -10,6 +10,6 @@
 // Created On       : Tue Dec 6 12:27:26 2016
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Sun Aug 16 08:46:22 2020
-// Update Count     : 4
+// Last Modified On : Sun Sep  6 18:23:37 2020
+// Update Count     : 5
 //
 
@@ -35,5 +35,5 @@
 
 	// Copy the "from" context argument from the stack to register eax
-	// Return address is at 0(%esp), with parameters following
+	// Return address is at 0(%esp), with parameters following.
 
 	movl 4(%esp),%eax
@@ -50,7 +50,7 @@
 	movl %ebp,FP_OFFSET(%eax)
 
-	// Copy the "to" context argument from the stack to register eax
-	// Having pushed three words (= 12 bytes) on the stack, the
-	// argument is now at 8 + 12 = 20(%esp)
+	// Copy the "to" context argument from the stack to register eax. Having
+	// pushed 3 words (= 12 bytes) on the stack, the argument is now at
+	// 8 + 12 = 20(%esp).
 
 	movl 20(%esp),%eax
Index: libcfa/src/concurrency/alarm.cfa
===================================================================
--- libcfa/src/concurrency/alarm.cfa	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ libcfa/src/concurrency/alarm.cfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -130,5 +130,5 @@
 
 	register_self( &node );
-	park( __cfaabi_dbg_ctx );
+	park();
 
 	/* paranoid */ verify( !node.set );
Index: libcfa/src/concurrency/clib/cfathread.cfa
===================================================================
--- libcfa/src/concurrency/clib/cfathread.cfa	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ libcfa/src/concurrency/clib/cfathread.cfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -34,5 +34,5 @@
 extern "C" {
 	//--------------------
-	// Basic thread managenemt
+	// Basic thread management
 	CRunner * cfathread_create( void (*main)( CRunner * ) ) {
 		return new( main );
@@ -44,9 +44,9 @@
 
 	void cfathread_park( void ) {
-		park( __cfaabi_dbg_ctx );
+		park();
 	}
 
 	void cfathread_unpark( CRunner * thrd ) {
-		unpark( *thrd __cfaabi_dbg_ctx2 );
+		unpark( *thrd );
 	}
 
Index: libcfa/src/concurrency/clib/cfathread.h
===================================================================
--- libcfa/src/concurrency/clib/cfathread.h	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ libcfa/src/concurrency/clib/cfathread.h	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -17,5 +17,5 @@
 #include "invoke.h"
 
-#if defined(__cforall) || defined(__cpluplus)
+#if defined(__cforall) || defined(__cplusplus)
 extern "C" {
 #endif
@@ -39,5 +39,5 @@
 
 
-#if defined(__cforall) || defined(__cpluplus)
+#if defined(__cforall) || defined(__cplusplus)
 }
 #endif
Index: libcfa/src/concurrency/invoke.h
===================================================================
--- libcfa/src/concurrency/invoke.h	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ libcfa/src/concurrency/invoke.h	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -93,4 +93,6 @@
 
 	};
+	// Wrapper for gdb
+	struct cfathread_coroutine_t { struct $coroutine debug; };
 
 	static inline struct __stack_t * __get_stack( struct $coroutine * cor ) {
@@ -129,4 +131,6 @@
 		struct __condition_node_t * dtor_node;
 	};
+	// Wrapper for gdb
+	struct cfathread_monitor_t { struct $monitor debug; };
 
 	struct __monitor_group_t {
@@ -186,16 +190,10 @@
 		} node;
 
-		#ifdef __CFA_DEBUG__
-			// previous function to park/unpark the thread
-			const char * park_caller;
-			int park_result;
-			enum __Coroutine_State park_state;
-			bool park_stale;
-			const char * unpark_caller;
-			int unpark_result;
-			enum __Coroutine_State unpark_state;
-			bool unpark_stale;
+		#if defined( __CFA_WITH_VERIFY__ )
+			unsigned long long canary;
 		#endif
 	};
+	// Wrapper for gdb
+	struct cfathread_thread_t { struct $thread debug; };
 
 	#ifdef __CFA_DEBUG__
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ libcfa/src/concurrency/io.cfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -69,5 +69,5 @@
 		if( block ) {
 			enable_interrupts( __cfaabi_dbg_ctx );
-			park( __cfaabi_dbg_ctx );
+			park();
 			disable_interrupts();
 		}
@@ -97,5 +97,5 @@
 
 		if(nextt) {
-			unpark( nextt __cfaabi_dbg_ctx2 );
+			unpark( nextt );
 			enable_interrupts( __cfaabi_dbg_ctx );
 			return true;
Index: libcfa/src/concurrency/io/setup.cfa
===================================================================
--- libcfa/src/concurrency/io/setup.cfa	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ libcfa/src/concurrency/io/setup.cfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -247,5 +247,4 @@
 					thrd.link.next = 0p;
 					thrd.link.prev = 0p;
-					__cfaabi_dbg_debug_do( thrd.unpark_stale = true );
 
 					// Fixup the thread state
@@ -267,5 +266,5 @@
 
 				// unpark the fast io_poller
-				unpark( &thrd __cfaabi_dbg_ctx2 );
+				unpark( &thrd );
 			}
 			else {
@@ -276,5 +275,5 @@
 			}
 		} else {
-			unpark( &thrd __cfaabi_dbg_ctx2 );
+			unpark( &thrd );
 		}
 
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ libcfa/src/concurrency/kernel.cfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -246,8 +246,4 @@
 		thrd_dst->state = Active;
 
-		__cfaabi_dbg_debug_do(
-			thrd_dst->park_stale   = true;
-			thrd_dst->unpark_stale = true;
-		)
 		// Update global state
 		kernelTLS.this_thread = thrd_dst;
@@ -255,15 +251,19 @@
 		/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
 		/* paranoid */ verify( kernelTLS.this_thread == thrd_dst );
+		/* paranoid */ verify( thrd_dst->context.SP );
 		/* paranoid */ verifyf( ((uintptr_t)thrd_dst->context.SP) < ((uintptr_t)__get_stack(thrd_dst->curr_cor)->base ) || thrd_dst->curr_cor == proc_cor, "ERROR : Destination $thread %p has been corrupted.\n StackPointer too small.\n", thrd_dst ); // add escape condition if we are setting up the processor
 		/* paranoid */ verifyf( ((uintptr_t)thrd_dst->context.SP) > ((uintptr_t)__get_stack(thrd_dst->curr_cor)->limit) || thrd_dst->curr_cor == proc_cor, "ERROR : Destination $thread %p has been corrupted.\n StackPointer too large.\n", thrd_dst ); // add escape condition if we are setting up the processor
+		/* paranoid */ verify( 0x0D15EA5E0D15EA5E == thrd_dst->canary );
+
 
 
 		// set context switch to the thread that the processor is executing
-		verify( thrd_dst->context.SP );
 		__cfactx_switch( &proc_cor->context, &thrd_dst->context );
 		// when __cfactx_switch returns we are back in the processor coroutine
 
+		/* paranoid */ verify( 0x0D15EA5E0D15EA5E == thrd_dst->canary );
 		/* paranoid */ verifyf( ((uintptr_t)thrd_dst->context.SP) > ((uintptr_t)__get_stack(thrd_dst->curr_cor)->limit), "ERROR : Destination $thread %p has been corrupted.\n StackPointer too large.\n", thrd_dst );
 		/* paranoid */ verifyf( ((uintptr_t)thrd_dst->context.SP) < ((uintptr_t)__get_stack(thrd_dst->curr_cor)->base ), "ERROR : Destination $thread %p has been corrupted.\n StackPointer too small.\n", thrd_dst );
+		/* paranoid */ verify( thrd_dst->context.SP );
 		/* paranoid */ verify( kernelTLS.this_thread == thrd_dst );
 		/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
@@ -288,5 +288,5 @@
 			// The thread has halted, it should never be scheduled/run again
 			// We may need to wake someone up here since
-			unpark( this->destroyer __cfaabi_dbg_ctx2 );
+			unpark( this->destroyer );
 			this->destroyer = 0p;
 			break RUNNING;
@@ -298,5 +298,4 @@
 		// set state of processor coroutine to active and the thread to inactive
 		int old_ticket = __atomic_fetch_sub(&thrd_dst->ticket, 1, __ATOMIC_SEQ_CST);
-		__cfaabi_dbg_debug_do( thrd_dst->park_result = old_ticket; )
 		switch(old_ticket) {
 			case 1:
@@ -335,6 +334,8 @@
 			__x87_store;
 		#endif
-		verify( proc_cor->context.SP );
+		/* paranoid */ verify( proc_cor->context.SP );
+		/* paranoid */ verify( 0x0D15EA5E0D15EA5E == thrd_src->canary );
 		__cfactx_switch( &thrd_src->context, &proc_cor->context );
+		/* paranoid */ verify( 0x0D15EA5E0D15EA5E == thrd_src->canary );
 		#if defined( __i386 ) || defined( __x86_64 )
 			__x87_load;
@@ -368,4 +369,6 @@
 	/* paranoid */ #endif
 	/* paranoid */ verifyf( thrd->link.next == 0p, "Expected null got %p", thrd->link.next );
+	/* paranoid */ verify( 0x0D15EA5E0D15EA5E == thrd->canary );
+
 
 	if (thrd->preempted == __NO_PREEMPTION) thrd->state = Ready;
@@ -404,10 +407,6 @@
 
 // KERNEL ONLY unpark with out disabling interrupts
-void __unpark(  struct __processor_id_t * id, $thread * thrd __cfaabi_dbg_ctx_param2 ) {
-	// record activity
-	__cfaabi_dbg_record_thrd( *thrd, false, caller );
-
+void __unpark(  struct __processor_id_t * id, $thread * thrd ) {
 	int old_ticket = __atomic_fetch_add(&thrd->ticket, 1, __ATOMIC_SEQ_CST);
-	__cfaabi_dbg_debug_do( thrd->unpark_result = old_ticket; thrd->unpark_state = thrd->state; )
 	switch(old_ticket) {
 		case 1:
@@ -427,20 +426,17 @@
 }
 
-void unpark( $thread * thrd __cfaabi_dbg_ctx_param2 ) {
+void unpark( $thread * thrd ) {
 	if( !thrd ) return;
 
 	disable_interrupts();
-	__unpark( (__processor_id_t*)kernelTLS.this_processor, thrd __cfaabi_dbg_ctx_fwd2 );
+	__unpark( (__processor_id_t*)kernelTLS.this_processor, thrd );
 	enable_interrupts( __cfaabi_dbg_ctx );
 }
 
-void park( __cfaabi_dbg_ctx_param ) {
+void park( void ) {
 	/* paranoid */ verify( kernelTLS.preemption_state.enabled );
 	disable_interrupts();
 	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
 	/* paranoid */ verify( kernelTLS.this_thread->preempted == __NO_PREEMPTION );
-
-	// record activity
-	__cfaabi_dbg_record_thrd( *kernelTLS.this_thread, true, caller );
 
 	returnToKernel();
@@ -650,5 +646,5 @@
 		// atomically release spin lock and block
 		unlock( lock );
-		park( __cfaabi_dbg_ctx );
+		park();
 		return true;
 	}
@@ -671,5 +667,5 @@
 
 	// make new owner
-	unpark( thrd __cfaabi_dbg_ctx2 );
+	unpark( thrd );
 
 	return thrd != 0p;
@@ -682,5 +678,5 @@
 	count += diff;
 	for(release) {
-		unpark( pop_head( waiting ) __cfaabi_dbg_ctx2 );
+		unpark( pop_head( waiting ) );
 	}
 
@@ -698,15 +694,4 @@
 			this.prev_thrd = kernelTLS.this_thread;
 		}
-
-		void __cfaabi_dbg_record_thrd($thread & this, bool park, const char prev_name[]) {
-			if(park) {
-				this.park_caller   = prev_name;
-				this.park_stale    = false;
-			}
-			else {
-				this.unpark_caller = prev_name;
-				this.unpark_stale  = false;
-			}
-		}
 	}
 )
Index: libcfa/src/concurrency/kernel/fwd.hfa
===================================================================
--- libcfa/src/concurrency/kernel/fwd.hfa	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ libcfa/src/concurrency/kernel/fwd.hfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -118,6 +118,6 @@
 
 	extern "Cforall" {
-		extern void park( __cfaabi_dbg_ctx_param );
-		extern void unpark( struct $thread * this __cfaabi_dbg_ctx_param2 );
+		extern void park( void );
+		extern void unpark( struct $thread * this );
 		static inline struct $thread * active_thread () { return TL_GET( this_thread ); }
 
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -451,4 +451,7 @@
 	link.next = 0p;
 	link.prev = 0p;
+	#if defined( __CFA_WITH_VERIFY__ )
+		canary = 0x0D15EA5E0D15EA5E;
+	#endif
 
 	node.next = 0p;
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -64,5 +64,5 @@
 
 // KERNEL ONLY unpark with out disabling interrupts
-void __unpark( struct __processor_id_t *, $thread * thrd __cfaabi_dbg_ctx_param2 );
+void __unpark( struct __processor_id_t *, $thread * thrd );
 
 static inline bool __post(single_sem & this, struct __processor_id_t * id) {
@@ -77,5 +77,5 @@
 		else {
 			if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-				__unpark( id, expected __cfaabi_dbg_ctx2 );
+				__unpark( id, expected );
 				return true;
 			}
Index: libcfa/src/concurrency/locks.cfa
===================================================================
--- libcfa/src/concurrency/locks.cfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
+++ libcfa/src/concurrency/locks.cfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -0,0 +1,428 @@
+#include "locks.hfa"
+#include "kernel_private.hfa"
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <kernel.hfa>
+#include <stdlib.hfa>
+#include <thread.hfa>
+
+///////////////////////////////////////////////////////////////////
+//// info_thread
+///////////////////////////////////////////////////////////////////
+forall(dtype L | is_blocking_lock(L)) {
+	void ?{}( info_thread(L) & this, $thread * t ) { // NOTE(review): this.info is not assigned here - confirm it is never read for nodes built this way
+		this.t = t;
+		this.lock = 0p;
+	}
+
+	void ?{}( info_thread(L) & this, $thread * t, uintptr_t info ) { // same as above, additionally storing `info`
+		this.t = t;
+		this.info = info;
+		this.lock = 0p;
+	}
+
+	void ^?{}( info_thread(L) & this ){
+		// default
+	}
+
+	info_thread(L) *& get_next( info_thread(L) & this ) { // returns the `next` link by reference
+		return this.next;
+	}
+}
+///////////////////////////////////////////////////////////////////
+//// Blocking Locks
+///////////////////////////////////////////////////////////////////
+
+void ?{}( blocking_lock & this, bool multi_acquisition, bool strict_owner ) { // starts with no owner, no waiters and a zero recursion count
+	this.lock{}; // internal spin lock taken by every operation below
+	this.blocked_threads{};
+	this.wait_count = 0;
+	this.multi_acquisition = multi_acquisition; // when true, lock() by the owner increments recursion_count instead of failing
+	this.strict_owner = strict_owner; // when true, unlock()/remove_() reject callers other than the owner
+	this.owner = 0p;
+	this.recursion_count = 0;
+}
+
+void ^?{}( blocking_lock & this ) {
+	// default
+}
+
+void ?{}( mutex_lock & this ) {
+	((blocking_lock &)this){ false, false }; // multi_acquisition = false, strict_owner = false
+}
+
+void ^?{}( mutex_lock & this ) {
+	// default
+}
+
+void ?{}( owner_lock & this ) {
+	((blocking_lock &)this){ true, true }; // multi_acquisition = true, strict_owner = true
+}
+
+void ^?{}( owner_lock & this ) {
+	// default
+}
+
+void ?{}( recursive_mutex_lock & this ) {
+	((blocking_lock &)this){ true, false }; // multi_acquisition = true, strict_owner = false
+}
+
+void ^?{}( recursive_mutex_lock & this ) {
+	// default
+}
+
+void lock( blocking_lock & this ) with( this ) { // acquires the lock, parking the caller while another thread owns it
+	lock( lock __cfaabi_dbg_ctx2 ); // internal spin lock protecting the fields below
+	if ( owner == kernelTLS.this_thread && !multi_acquisition) {
+		fprintf(stderr, "A single acquisition lock holder attempted to reacquire the lock resulting in a deadlock."); // Possibly throw instead
+		exit(EXIT_FAILURE);
+	} else if ( owner != 0p && owner != kernelTLS.this_thread ) {
+		append( blocked_threads, kernelTLS.this_thread );
+		wait_count++;
+		unlock( lock ); // release the spin lock before blocking
+		park(); // park() is declared as park( void ); the releasing thread sets `owner` before unparking us
+	} else if ( owner == kernelTLS.this_thread && multi_acquisition ) {
+		recursion_count++;
+		unlock( lock );
+	} else {
+		owner = kernelTLS.this_thread;
+		recursion_count = 1;
+		unlock( lock );
+	}
+}
+
+bool try_lock( blocking_lock & this ) with( this ) { // non-blocking acquire; returns true when the caller now holds the lock
+	bool ret = false;
+	lock( lock __cfaabi_dbg_ctx2 );
+	if ( owner == 0p ) {
+		owner = kernelTLS.this_thread;
+		recursion_count = 1; // matches lock(): unlock() always decrements and only releases at 0
+		ret = true;
+	} else if ( owner == kernelTLS.this_thread && multi_acquisition ) {
+		recursion_count++;
+		ret = true;
+	}
+	unlock( lock );
+	return ret;
+}
+
+void unlock( blocking_lock & this ) with( this ) { // releases one level of ownership; at zero, hands the lock to the next waiter if any
+	lock( lock __cfaabi_dbg_ctx2 );
+	if ( owner == 0p ){ // no owner implies lock isn't held
+		fprintf( stderr, "There was an attempt to release a lock that isn't held" );
+		unlock( lock ); return; // do not leave the spin lock held on the error path
+	} else if ( strict_owner && owner != kernelTLS.this_thread ) {
+		fprintf( stderr, "A thread other than the owner attempted to release an owner lock" );
+		unlock( lock ); return; // do not leave the spin lock held on the error path
+	}
+	recursion_count--;
+	if ( recursion_count == 0 ) {
+		$thread * thrd = pop_head( blocked_threads ); // the code below treats a null result as "nobody waiting"
+		owner = thrd;
+		recursion_count = ( thrd ? 1 : 0 ); // the new owner holds one level, whatever the lock flavour
+		if ( thrd ) wait_count--; // only count down when a waiter was actually dequeued
+		unpark( thrd ); // unpark() ignores a null thread
+	}
+	unlock( lock );
+}
+
+size_t wait_count( blocking_lock & this ) with( this ) { // reads the waiter counter without taking the internal spin lock
+	return wait_count;
+}
+
+
+void set_recursion_count( blocking_lock & this, size_t recursion ) with( this ) { // overwrites the count; the internal spin lock is not taken
+	recursion_count = recursion;
+}
+
+size_t get_recursion_count( blocking_lock & this ) with( this ) { // reads the count; the internal spin lock is not taken
+	return recursion_count;
+}
+
+void add_( blocking_lock & this, $thread * t ) with( this ) { // on behalf of thread t: queue it behind the owner, or make it the owner and wake it
+	lock( lock __cfaabi_dbg_ctx2 );
+	if ( owner != 0p ) {
+		append( blocked_threads, t );
+		wait_count++;
+		unlock( lock );
+	} else {
+		owner = t;
+		recursion_count = 1; // unlock() always decrements, so ownership must start at one level
+		unpark( t ); // unpark() no longer takes a debug-context argument
+		unlock( lock );
+	}
+}
+
+void remove_( blocking_lock & this ) with( this ) { // gives up ownership entirely (any recursion depth) and hands the lock to the next waiter
+	lock( lock __cfaabi_dbg_ctx2 );
+	if ( owner == 0p ){ // no owner implies lock isn't held
+		fprintf( stderr, "A lock that is not held was passed to a synchronization lock" );
+	} else if ( strict_owner && owner != kernelTLS.this_thread ) {
+		fprintf( stderr, "A thread other than the owner of a lock passed it to a synchronization lock" );
+	} else {
+		$thread * thrd = pop_head( blocked_threads ); // the code below treats a null result as "nobody waiting"
+		owner = thrd;
+		recursion_count = ( thrd ? 1 : 0 ); // the new owner holds one level, whatever the lock flavour
+		if ( thrd ) wait_count--; // only count down when a waiter was actually dequeued
+		unpark( thrd ); // unpark() ignores a null thread
+	}
+	unlock( lock );
+}
+
+///////////////////////////////////////////////////////////////////
+//// Overloaded routines for traits
+///////////////////////////////////////////////////////////////////
+
+// In an ideal world this may not be necessary
+// Is it possible for nominal inheritance to inherit traits??
+// If that occurs we would avoid all this extra code
+
+void lock( mutex_lock & this ){ // each routine here forwards to the blocking_lock overload through a cast
+	lock( (blocking_lock &)this );
+}
+
+void unlock( mutex_lock & this ){
+	unlock( (blocking_lock &)this );
+}
+
+void add_( mutex_lock & this, struct $thread * t ){
+	add_( (blocking_lock &)this, t );
+}
+
+void remove_( mutex_lock & this ){
+	remove_( (blocking_lock &)this );
+}
+
+void set_recursion_count( mutex_lock & this, size_t recursion ){
+	set_recursion_count( (blocking_lock &)this, recursion );
+}
+
+size_t get_recursion_count( mutex_lock & this ){ // forwards to the blocking_lock overload
+	return get_recursion_count( (blocking_lock &)this ); // the result must be returned, not discarded
+}
+
+void lock( recursive_mutex_lock & this ){ // each routine here forwards to the blocking_lock overload through a cast
+	lock( (blocking_lock &)this );
+}
+
+void unlock( recursive_mutex_lock & this ){
+	unlock( (blocking_lock &)this );
+}
+
+void add_( recursive_mutex_lock & this, struct $thread * t ){
+	add_( (blocking_lock &)this, t );
+}
+
+void remove_( recursive_mutex_lock & this ){
+	remove_( (blocking_lock &)this );
+}
+
+void set_recursion_count( recursive_mutex_lock & this, size_t recursion ){
+	set_recursion_count( (blocking_lock &)this, recursion );
+}
+
+size_t get_recursion_count( recursive_mutex_lock & this ){ // forwards to the embedded blocking_lock
+	return get_recursion_count( (blocking_lock &)this ); // the result must be returned, not discarded
+}
+
+///////////////////////////////////////////////////////////////////
+//// Synchronization Locks
+///////////////////////////////////////////////////////////////////
+
+forall(dtype L | is_blocking_lock(L)) {
+	void ?{}( synchronization_lock(L) & this, bool reacquire_after_signal ){ // base constructor shared by condition_variable and thread_queue
+		this.lock{}; // default-construct the internal spin lock
+		this.blocked_threads{}; // start with an empty waiter queue
+		this.count = 0;
+		this.reacquire_after_signal = reacquire_after_signal; // read by notify_one/notify_all to choose add_ vs unpark
+	}
+
+	void ^?{}( synchronization_lock(L) & this ){
+		// default: body intentionally empty
+	}
+
+	void ?{}( condition_variable(L) & this ){
+		((synchronization_lock(L) &)this){ true }; // condition_variable: signalled threads are passed to their lock via add_
+	}
+
+	void ^?{}( condition_variable(L) & this ){
+		// default: body intentionally empty
+	}
+
+	void ?{}( thread_queue(L) & this ){
+		((synchronization_lock(L) &)this){ false }; // thread_queue: signalled threads are unparked directly
+	}
+
+	void ^?{}( thread_queue(L) & this ){
+		// default: body intentionally empty
+	}
+
+	// Wake the thread at the head of the queue, if any; returns true iff the queue was non-empty on entry.
+	bool notify_one( synchronization_lock(L) & this ) with( this ) {
+		lock( lock __cfaabi_dbg_ctx2 );
+		bool ret = !!blocked_threads;
+		info_thread(L) * popped = pop_head( blocked_threads );
+		if(popped != 0p) {
+			count--; // keep count equal to the number of threads still queued here
+			if( reacquire_after_signal ){
+				add_(*popped->lock, popped->t); // pass the thread to the lock recorded in its info_thread
+			} else {
+				unpark( popped->t ); // unpark is declared without the debug-context argument
+			}
+		}
+		unlock( lock );
+		return ret;
+	}
+
+	// Wake every queued thread; returns true iff at least one thread was queued on entry.
+	bool notify_all( synchronization_lock(L) & this ) with(this) {
+		lock( lock __cfaabi_dbg_ctx2 );
+		bool ret = blocked_threads ? true : false;
+		while( blocked_threads ) {
+			info_thread(L) * popped = pop_head( blocked_threads );
+			if(popped != 0p){
+				count--; // keep count equal to the number of threads still queued here
+				if( reacquire_after_signal ){
+					add_(*popped->lock, popped->t); // pass the thread to the lock recorded in its info_thread
+				} else {
+					unpark( popped->t ); // unpark is declared without the debug-context argument
+				}
+			}
+		}
+		unlock( lock );
+		return ret;
+	}
+
+	uintptr_t front( synchronization_lock(L) & this ) {
+		return peek( this.blocked_threads )->info; // info of the head waiter; no locking and no empty-queue check
+	}
+
+	bool empty( synchronization_lock(L) & this ) {
+		return !this.blocked_threads; // true when no thread is queued
+	}
+
+	int counter( synchronization_lock(L) & this ) {
+		return this.count; // plain field read; the spin lock is not taken here
+	}
+
+	void queue_info_thread( synchronization_lock(L) & this, info_thread(L) & i ) with(this) { // enqueue i, then block the caller
+		lock( lock __cfaabi_dbg_ctx2 );
+		append( blocked_threads, &i );
+		count++;
+		unlock( lock ); // release the spin lock before blocking
+		park(); // park is declared without the debug-context argument
+	}
+
+
+	void wait( synchronization_lock(L) & this ) with(this) { // block the calling thread on this queue; no lock is released
+		info_thread( L ) i = { kernelTLS.this_thread }; // queue node is a local of the waiting thread
+		queue_info_thread( this, i ); // appends &i and parks
+	}
+
+	void wait( synchronization_lock(L) & this, uintptr_t info ) with(this) { // same, tagging the waiter with info (readable through front)
+		info_thread( L ) i = { kernelTLS.this_thread, info };
+		queue_info_thread( this, i ); // appends &i and parks
+	}
+	// Timed waits without a lock are not implemented yet: every overload below ends in wait( this, Time ), which returns false without blocking.
+	bool wait( synchronization_lock(L) & this, Duration duration ) with(this) {
+		timeval tv = { time(0) }; // current time in whole seconds
+		Time t = { tv };
+		return wait( this, t + duration ); // turn the relative duration into an absolute time
+	}
+
+	bool wait( synchronization_lock(L) & this, uintptr_t info, Duration duration ) with(this) {
+		// TODO: ADD INFO (the info argument is currently dropped)
+		return wait( this, duration );
+	}
+
+	bool wait( synchronization_lock(L) & this, Time time ) with(this) {
+		return false; // default: placeholder, does not block
+	}
+
+	bool wait( synchronization_lock(L) & this, uintptr_t info, Time time ) with(this) {
+		// TODO: ADD INFO (the info argument is currently dropped)
+		return wait( this, time );
+	}
+
+	// Queue i on this synchronization lock, release l via remove_, and block the caller until it is woken.
+	void queue_info_thread_unlock( synchronization_lock(L) & this, L & l, info_thread(L) & i ) with(this) {
+		lock( lock __cfaabi_dbg_ctx2 );
+		append( this.blocked_threads, &i );
+		count++;
+		i.lock = &l; // recorded so notify_one/notify_all can pass the thread back to l via add_
+		size_t recursion_count = get_recursion_count(l); // saved before remove_ overwrites it
+		remove_( l );
+		unlock( lock );
+		park(); // blocks here; park is declared without the debug-context argument
+		set_recursion_count(l, recursion_count); // resets recursion count here after waking
+	}
+
+	void wait( synchronization_lock(L) & this, L & l ) with(this) { // release l and block the calling thread on this queue
+		info_thread(L) i = { kernelTLS.this_thread }; // queue node is a local of the waiting thread
+		queue_info_thread_unlock( this, l, i ); // appends &i, calls remove_( l ) and parks
+	}
+
+	void wait( synchronization_lock(L) & this, L & l, uintptr_t info ) with(this) { // same, tagging the waiter with info
+		info_thread(L) i = { kernelTLS.this_thread, info };
+		queue_info_thread_unlock( this, l, i ); // appends &i, calls remove_( l ) and parks
+	}
+
+	bool wait( synchronization_lock(L) & this, L & l, Duration duration ) with(this) { // timed waits with a lock are placeholders, see wait( this, l, Time )
+		timeval tv = { time(0) }; // current time in whole seconds
+		Time t = { tv };
+		return wait( this, l, t + duration ); // turn the relative duration into an absolute time
+	}
+
+	bool wait( synchronization_lock(L) & this, L & l, uintptr_t info, Duration duration ) with(this) {
+		// TODO: ADD INFO (the info argument is currently dropped)
+		return wait( this, l, duration );
+	}
+
+	bool wait( synchronization_lock(L) & this, L & l, Time time ) with(this) {
+		return false; // default: placeholder, neither releases l nor blocks
+	}
+
+	bool wait( synchronization_lock(L) & this, L & l, uintptr_t info, Time time ) with(this) {
+		// TODO: ADD INFO (the info argument is currently dropped)
+		return wait( this, l, time );
+	}
+}
+
+///////////////////////////////////////////////////////////////////
+//// condition lock alternative approach
+///////////////////////////////////////////////////////////////////
+
+// the solution below is less efficient but does not require the lock to have a specific add/remove routine
+
+///////////////////////////////////////////////////////////////////
+//// is_simple_lock
+///////////////////////////////////////////////////////////////////
+
+forall(dtype L | is_simple_lock(L)) { // condition_lock: condition variable usable with any lock offering lock/unlock
+	void ?{}( condition_lock(L) & this ){
+		// default: body intentionally empty
+	}
+
+	void ^?{}( condition_lock(L) & this ){
+		// default: body intentionally empty
+	}
+
+	bool notify_one( condition_lock(L) & this ) with(this) {
+		return notify_one( c_var ); // delegate to the internal condition variable
+	}
+
+	bool notify_all( condition_lock(L) & this ) with(this) {
+		return notify_all( c_var ); // delegate to the internal condition variable
+	}
+
+	void wait( condition_lock(L) & this, L & l ) with(this) { // NOTE(review): l is re-locked while m_lock is held, yet entry takes m_lock while l is held — confirm this ordering cannot deadlock
+		lock( m_lock ); // take the internal mutex before giving up the user's lock
+		size_t recursion = get_recursion_count( l ); // saved so it can be restored after re-locking l
+		unlock( l );
+		wait( c_var, m_lock ); // blocks on the internal condition variable, passing m_lock as the lock to release
+		lock( l ); // take the user's lock again with the plain lock routine
+		set_recursion_count( l , recursion );
+		unlock( m_lock );
+	}
+}
Index: libcfa/src/concurrency/locks.hfa
===================================================================
--- libcfa/src/concurrency/locks.hfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
+++ libcfa/src/concurrency/locks.hfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -0,0 +1,211 @@
+#include <stdbool.h>
+
+#include "bits/algorithm.hfa"
+#include "bits/locks.hfa"
+#include "bits/containers.hfa"
+
+#include "invoke.h"
+
+#include "time_t.hfa"
+#include "time.hfa"
+#include <sys/time.h>
+
+///////////////////////////////////////////////////////////////////
+//// is_blocking_lock
+///////////////////////////////////////////////////////////////////
+
+trait is_blocking_lock(dtype L | sized(L)) { // operations a lock must provide to be used with synchronization_lock
+	void add_( L &, struct $thread * );		// called by notify_one/notify_all to pass a signalled thread to the lock
+	void remove_( L & );    // called by wait to release the lock on behalf of the waiting thread
+	size_t get_recursion_count( L & ); // read by wait before releasing, so the count can be restored after waking
+	void set_recursion_count( L &, size_t recursion ); // called by wait after waking to restore the saved count
+};
+
+///////////////////////////////////////////////////////////////////
+//// info_thread
+///////////////////////////////////////////////////////////////////
+
+forall(dtype L | is_blocking_lock(L)) {
+	struct info_thread { // queue node describing one thread waiting on a synchronization_lock
+		struct $thread * t; // the waiting thread
+		uintptr_t info; // caller-supplied value, returned by front for the head waiter
+		info_thread(L) * next; // link to the next queued node
+		L * lock; // lock the thread is passed to on notify when reacquire_after_signal is set
+	};
+
+
+	void ?{}( info_thread(L) & this, $thread * t );
+	void ?{}( info_thread(L) & this, $thread * t, uintptr_t info );
+	void ^?{}( info_thread(L) & this );
+
+	info_thread(L) *& get_next( info_thread(L) & this ); // presumably the intrusive-link accessor used by __queue_t — confirm
+}
+
+///////////////////////////////////////////////////////////////////
+//// Blocking Locks
+///////////////////////////////////////////////////////////////////
+struct blocking_lock { // common state embedded by mutex_lock, owner_lock and recursive_mutex_lock
+	// Spin lock used for mutual exclusion
+	__spinlock_t lock;
+
+	// List of blocked threads
+	__queue_t( struct $thread ) blocked_threads;
+
+	// Count of current blocked threads
+	size_t wait_count;
+
+	// Flag if the lock allows multiple acquisition
+	bool multi_acquisition;
+
+	// Flag if only the owning thread may release the lock (remove_ rejects a non-owner when set)
+	bool strict_owner;
+
+	// Current thread owning the lock (0p when the lock is free)
+	struct $thread * owner;
+
+	// Number of recursion level
+	size_t recursion_count;
+};
+
+struct mutex_lock { // lock type that embeds blocking_lock; its routines cast to blocking_lock & and forward
+	inline blocking_lock;
+};
+
+struct owner_lock { // lock type that embeds blocking_lock
+	inline blocking_lock;
+};
+
+struct recursive_mutex_lock { // lock type that embeds blocking_lock; its routines cast to blocking_lock & and forward
+	inline blocking_lock;
+};
+
+void ?{}( blocking_lock & this, bool multi_acquisition, bool strict_owner ); // flags are stored in the fields of the same name
+void ^?{}( blocking_lock & this );
+
+void ?{}( mutex_lock & this );
+void ^?{}( mutex_lock & this );
+
+void ?{}( owner_lock & this );
+void ^?{}( owner_lock & this );
+
+void ?{}( recursive_mutex_lock & this );
+void ^?{}( recursive_mutex_lock & this );
+
+void lock( blocking_lock & this );
+bool try_lock( blocking_lock & this );
+void unlock( blocking_lock & this );
+void add_( blocking_lock & this, struct $thread * t ); // queue t, or make it owner and unpark it if the lock is free
+void remove_( blocking_lock & this ); // release on behalf of a synchronization lock, passing ownership to the next waiter
+size_t wait_count( blocking_lock & this );
+void set_recursion_count( blocking_lock & this, size_t recursion );
+size_t get_recursion_count( blocking_lock & this );
+
+void lock( mutex_lock & this ); // the mutex_lock routines forward to the blocking_lock versions
+void unlock( mutex_lock & this );
+void add_( mutex_lock & this, struct $thread * t );
+void remove_( mutex_lock & this );
+void set_recursion_count( mutex_lock & this, size_t recursion );
+size_t get_recursion_count( mutex_lock & this );
+
+void lock( recursive_mutex_lock & this ); // the recursive_mutex_lock routines forward to the blocking_lock versions
+void unlock( recursive_mutex_lock & this );
+void add_( recursive_mutex_lock & this, struct $thread * t );
+void remove_( recursive_mutex_lock & this );
+void set_recursion_count( recursive_mutex_lock & this, size_t recursion );
+size_t get_recursion_count( recursive_mutex_lock & this );
+
+///////////////////////////////////////////////////////////////////
+//// Synchronization Locks
+///////////////////////////////////////////////////////////////////
+forall(dtype L | is_blocking_lock(L)) {
+	struct synchronization_lock { // queue of waiting threads shared by condition_variable and thread_queue
+		// Spin lock used for mutual exclusion
+		__spinlock_t lock;
+
+		// List of blocked threads
+		__queue_t( info_thread(L) ) blocked_threads;
+
+		// Count of current blocked threads
+		int count;
+
+		// If true, notify passes each signalled thread to the lock it waited with (add_) instead of unparking it directly
+		bool reacquire_after_signal;
+	};
+
+	struct condition_variable { // synchronization_lock constructed with reacquire_after_signal = true
+		inline synchronization_lock(L);
+	};
+
+	struct thread_queue { // synchronization_lock constructed with reacquire_after_signal = false
+		inline synchronization_lock(L);
+	};
+
+
+	void ?{}( synchronization_lock(L) & this, bool reacquire_after_signal ); // matches the definition: one flag selecting add_ vs unpark on notify
+	void ^?{}( synchronization_lock(L) & this );
+
+	void ?{}( condition_variable(L) & this ); // constructs the base with reacquire_after_signal = true
+	void ^?{}( condition_variable(L) & this );
+
+	void ?{}( thread_queue(L) & this ); // constructs the base with reacquire_after_signal = false
+	void ^?{}( thread_queue(L) & this );
+
+	bool notify_one( synchronization_lock(L) & this ); // wake the head waiter; true iff a thread was waiting
+	bool notify_all( synchronization_lock(L) & this ); // wake every waiter; true iff a thread was waiting
+
+	uintptr_t front( synchronization_lock(L) & this ); // info value of the head waiter
+
+	bool empty( synchronization_lock(L) & this );
+	int counter( synchronization_lock(L) & this );
+
+	// wait functions that are not passed a mutex lock
+	void wait( synchronization_lock(L) & this );
+	void wait( synchronization_lock(L) & this, uintptr_t info );
+	bool wait( synchronization_lock(L) & this, Duration duration );
+	bool wait( synchronization_lock(L) & this, uintptr_t info, Duration duration );
+	bool wait( synchronization_lock(L) & this, Time time );
+	bool wait( synchronization_lock(L) & this, uintptr_t info, Time time );
+
+	// notify and wait functions that are passed a lock
+	bool notify_one( synchronization_lock(L) & this, L & l ); // NOTE(review): no matching definition is visible alongside the other routines — confirm it exists
+	bool notify_all( synchronization_lock(L) & this, L & l ); // NOTE(review): no matching definition is visible alongside the other routines — confirm it exists
+
+	void wait( synchronization_lock(L) & this, L & l );
+	void wait( synchronization_lock(L) & this, L & l, uintptr_t info );
+	bool wait( synchronization_lock(L) & this, L & l, Duration duration );
+	bool wait( synchronization_lock(L) & this, L & l, uintptr_t info, Duration duration );
+	bool wait( synchronization_lock(L) & this, L & l, Time time );
+	bool wait( synchronization_lock(L) & this, L & l, uintptr_t info, Time time );
+}
+
+///////////////////////////////////////////////////////////////////
+//// condition lock alternative approach
+///////////////////////////////////////////////////////////////////
+
+
+///////////////////////////////////////////////////////////////////
+//// is_simple_lock
+///////////////////////////////////////////////////////////////////
+
+trait is_simple_lock(dtype L | sized(L)) { // operations a lock must provide to be used with condition_lock
+	void lock( L & );		// called by condition_lock wait to re-acquire the lock after waking
+	void unlock( L & );    // called by condition_lock wait to release the lock before blocking
+	size_t get_recursion_count( L & ); // read by wait before releasing, so the count can be restored after waking
+	void set_recursion_count( L &, size_t recursion ); // called by wait after re-acquiring to restore the saved count
+};
+
+forall(dtype L | is_simple_lock(L)) {
+	struct condition_lock {
+		// Internal mutex_lock (a blocking lock, not a spin lock) held across wait and passed to c_var
+		mutex_lock m_lock;
+		// Condition variable the waiters actually block on; it is paired with m_lock, not with L
+		condition_variable( mutex_lock ) c_var;
+	};
+
+	void ?{}( condition_lock(L) & this );
+	void ^?{}( condition_lock(L) & this );
+
+	bool notify_one( condition_lock(L) & this ); // forwards to notify_one on c_var
+	bool notify_all( condition_lock(L) & this ); // forwards to notify_all on c_var
+	void wait( condition_lock(L) & this, L & l ); // unlocks l, blocks on c_var, then locks l again and restores its recursion count
+}
Index: libcfa/src/concurrency/monitor.cfa
===================================================================
--- libcfa/src/concurrency/monitor.cfa	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ libcfa/src/concurrency/monitor.cfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -122,5 +122,5 @@
 
 		unlock( this->lock );
-		park( __cfaabi_dbg_ctx );
+		park();
 
 		__cfaabi_dbg_print_safe( "Kernel : %10p Entered  mon %p\n", thrd, this);
@@ -201,8 +201,8 @@
 		// Release the next thread
 		/* paranoid */ verifyf( urgent->owner->waiting_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
-		unpark( urgent->owner->waiting_thread __cfaabi_dbg_ctx2 );
+		unpark( urgent->owner->waiting_thread );
 
 		// Park current thread waiting
-		park( __cfaabi_dbg_ctx );
+		park();
 
 		// Some one was waiting for us, enter
@@ -222,5 +222,5 @@
 
 		// Park current thread waiting
-		park( __cfaabi_dbg_ctx );
+		park();
 
 		/* paranoid */ verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
@@ -264,5 +264,5 @@
 	//We need to wake-up the thread
 	/* paranoid */ verifyf( !new_owner || new_owner == this->owner, "Expected owner to be %p, got %p (m: %p)", new_owner, this->owner, this );
-	unpark( new_owner __cfaabi_dbg_ctx2 );
+	unpark( new_owner );
 }
 
@@ -493,9 +493,9 @@
 	// Wake the threads
 	for(int i = 0; i < thread_count; i++) {
-		unpark( threads[i] __cfaabi_dbg_ctx2 );
+		unpark( threads[i] );
 	}
 
 	// Everything is ready to go to sleep
-	park( __cfaabi_dbg_ctx );
+	park();
 
 	// We are back, restore the owners and recursions
@@ -575,8 +575,8 @@
 
 	// unpark the thread we signalled
-	unpark( signallee __cfaabi_dbg_ctx2 );
+	unpark( signallee );
 
 	//Everything is ready to go to sleep
-	park( __cfaabi_dbg_ctx );
+	park();
 
 
@@ -679,8 +679,8 @@
 
 				// unpark the thread we signalled
-				unpark( next __cfaabi_dbg_ctx2 );
+				unpark( next );
 
 				//Everything is ready to go to sleep
-				park( __cfaabi_dbg_ctx );
+				park();
 
 				// We are back, restore the owners and recursions
@@ -724,5 +724,5 @@
 
 	//Everything is ready to go to sleep
-	park( __cfaabi_dbg_ctx );
+	park();
 
 
Index: libcfa/src/concurrency/mutex.cfa
===================================================================
--- libcfa/src/concurrency/mutex.cfa	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ libcfa/src/concurrency/mutex.cfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -42,5 +42,5 @@
 		append( blocked_threads, kernelTLS.this_thread );
 		unlock( lock );
-		park( __cfaabi_dbg_ctx );
+		park();
 	}
 	else {
@@ -65,5 +65,5 @@
 	this.is_locked = (this.blocked_threads != 0);
 	unpark(
-		pop_head( this.blocked_threads ) __cfaabi_dbg_ctx2
+		pop_head( this.blocked_threads )
 	);
 	unlock( this.lock );
@@ -97,5 +97,5 @@
 		append( blocked_threads, kernelTLS.this_thread );
 		unlock( lock );
-		park( __cfaabi_dbg_ctx );
+		park();
 	}
 }
@@ -124,5 +124,5 @@
 		owner = thrd;
 		recursion_count = (thrd ? 1 : 0);
-		unpark( thrd __cfaabi_dbg_ctx2 );
+		unpark( thrd );
 	}
 	unlock( lock );
@@ -142,5 +142,5 @@
 	lock( lock __cfaabi_dbg_ctx2 );
 	unpark(
-		pop_head( this.blocked_threads ) __cfaabi_dbg_ctx2
+		pop_head( this.blocked_threads )
 	);
 	unlock( lock );
@@ -151,5 +151,5 @@
 	while(this.blocked_threads) {
 		unpark(
-			pop_head( this.blocked_threads ) __cfaabi_dbg_ctx2
+			pop_head( this.blocked_threads )
 		);
 	}
@@ -161,5 +161,5 @@
 	append( this.blocked_threads, kernelTLS.this_thread );
 	unlock( this.lock );
-	park( __cfaabi_dbg_ctx );
+	park();
 }
 
@@ -170,5 +170,5 @@
 	unlock(l);
 	unlock(this.lock);
-	park( __cfaabi_dbg_ctx );
+	park();
 	lock(l);
 }
Index: libcfa/src/concurrency/preemption.cfa
===================================================================
--- libcfa/src/concurrency/preemption.cfa	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ libcfa/src/concurrency/preemption.cfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -274,5 +274,5 @@
 		kernelTLS.this_stats = this->curr_cluster->stats;
 	#endif
-	__unpark( id, this __cfaabi_dbg_ctx2 );
+	__unpark( id, this );
 }
 
Index: libcfa/src/concurrency/thread.cfa
===================================================================
--- libcfa/src/concurrency/thread.cfa	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ libcfa/src/concurrency/thread.cfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -39,4 +39,7 @@
 	link.prev = 0p;
 	link.preferred = -1;
+	#if defined( __CFA_WITH_VERIFY__ )
+		canary = 0x0D15EA5E0D15EA5E;
+	#endif
 
 	node.next = 0p;
@@ -48,4 +51,7 @@
 
 void ^?{}($thread& this) with( this ) {
+	#if defined( __CFA_WITH_VERIFY__ )
+		canary = 0xDEADDEADDEADDEAD;
+	#endif
 	unregister(curr_cluster, this);
 	^self_cor{};
Index: libcfa/src/concurrency/thread.hfa
===================================================================
--- libcfa/src/concurrency/thread.hfa	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ libcfa/src/concurrency/thread.hfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -88,13 +88,13 @@
 //----------
 // Park thread: block until corresponding call to unpark, won't block if unpark is already called
-void park( __cfaabi_dbg_ctx_param );
+void park( void );
 
 //----------
 // Unpark a thread, if the thread is already blocked, schedule it
 //                  if the thread is not yet block, signal that it should rerun immediately
-void unpark( $thread * this __cfaabi_dbg_ctx_param2 );
+void unpark( $thread * this );
 
 forall( dtype T | is_thread(T) )
-static inline void unpark( T & this __cfaabi_dbg_ctx_param2 ) { if(!&this) return; unpark( get_thread( this ) __cfaabi_dbg_ctx_fwd2 );}
+static inline void unpark( T & this ) { if(!&this) return; unpark( get_thread( this ) );}
 
 //----------
Index: src/Parser/lex.ll
===================================================================
--- src/Parser/lex.ll	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ src/Parser/lex.ll	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -10,6 +10,6 @@
  * Created On       : Sat Sep 22 08:58:10 2001
  * Last Modified By : Peter A. Buhr
- * Last Modified On : Sat Feb 15 11:05:50 2020
- * Update Count     : 737
+ * Last Modified On : Tue Oct  6 18:15:41 2020
+ * Update Count     : 743
  */
 
@@ -62,5 +62,5 @@
 #define IDENTIFIER_RETURN()	RETURN_VAL( typedefTable.isKind( yytext ) )
 
-#ifdef HAVE_KEYWORDS_FLOATXX								// GCC >= 7 => keyword, otherwise typedef
+#ifdef HAVE_KEYWORDS_FLOATXX							// GCC >= 7 => keyword, otherwise typedef
 #define FLOATXX(v) KEYWORD_RETURN(v);
 #else
@@ -292,5 +292,5 @@
 __restrict__	{ KEYWORD_RETURN(RESTRICT); }			// GCC
 return			{ KEYWORD_RETURN(RETURN); }
-	/* resume			{ KEYWORD_RETURN(RESUME); }				// CFA */
+ /* resume			{ KEYWORD_RETURN(RESUME); }				// CFA */
 short			{ KEYWORD_RETURN(SHORT); }
 signed			{ KEYWORD_RETURN(SIGNED); }
Index: src/Parser/parser.yy
===================================================================
--- src/Parser/parser.yy	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ src/Parser/parser.yy	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -10,6 +10,6 @@
 // Created On       : Sat Sep  1 20:22:55 2001
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Thu May 28 12:11:45 2020
-// Update Count     : 4500
+// Last Modified On : Tue Oct  6 18:24:18 2020
+// Update Count     : 4610
 //
 
@@ -278,6 +278,6 @@
 %token OTYPE FTYPE DTYPE TTYPE TRAIT					// CFA
 %token SIZEOF OFFSETOF
-// %token RESUME									// CFA
-%token SUSPEND									// CFA
+// %token RESUME											// CFA
+%token SUSPEND											// CFA
 %token ATTRIBUTE EXTENSION								// GCC
 %token IF ELSE SWITCH CASE DEFAULT DO WHILE FOR BREAK CONTINUE GOTO RETURN
@@ -329,5 +329,5 @@
 %type<en> conditional_expression		constant_expression			assignment_expression		assignment_expression_opt
 %type<en> comma_expression				comma_expression_opt
-%type<en> argument_expression_list_opt		argument_expression			default_initialize_opt
+%type<en> argument_expression_list_opt	argument_expression			default_initialize_opt
 %type<ifctl> if_control_expression
 %type<fctl> for_control_expression		for_control_expression_list
@@ -370,5 +370,5 @@
 %type<decl> assertion assertion_list assertion_list_opt
 
-%type<en>   bit_subrange_size_opt bit_subrange_size
+%type<en> bit_subrange_size_opt bit_subrange_size
 
 %type<decl> basic_declaration_specifier basic_type_name basic_type_specifier direct_type indirect_type
@@ -793,5 +793,4 @@
 	| '(' aggregate_control '&' ')' cast_expression		// CFA
 		{ $$ = new ExpressionNode( build_keyword_cast( $2, $5 ) ); }
-		// VIRTUAL cannot be opt because of look ahead issues
 	| '(' VIRTUAL ')' cast_expression					// CFA
 		{ $$ = new ExpressionNode( new VirtualCastExpr( maybeMoveBuild< Expression >( $4 ), maybeMoveBuildType( nullptr ) ) ); }
@@ -920,9 +919,9 @@
 	| unary_expression assignment_operator assignment_expression
 		{
-			if ( $2 == OperKinds::AtAssn ) {
-				SemanticError( yylloc, "C @= assignment is currently unimplemented." ); $$ = nullptr;
-			} else {
+//			if ( $2 == OperKinds::AtAssn ) {
+//				SemanticError( yylloc, "C @= assignment is currently unimplemented." ); $$ = nullptr;
+//			} else {
 				$$ = new ExpressionNode( build_binary_val( $2, $1, $3 ) );
-			} // if
+//			} // if
 		}
 	| unary_expression '=' '{' initializer_list_opt comma_opt '}'
@@ -1676,38 +1675,14 @@
 
 typedef_expression:
-		// GCC, naming expression type: typedef name = exp; gives a name to the type of an expression
+		// deprecated GCC, naming expression type: typedef name = exp; gives a name to the type of an expression
 	TYPEDEF identifier '=' assignment_expression
 		{
-			// $$ = DeclarationNode::newName( 0 );			// unimplemented
-			SemanticError( yylloc, "Typedef expression is currently unimplemented." ); $$ = nullptr;
+			SemanticError( yylloc, "Typedef expression is deprecated, use typeof(...) instead." ); $$ = nullptr;
 		}
 	| typedef_expression pop ',' push identifier '=' assignment_expression
 		{
-			// $$ = DeclarationNode::newName( 0 );			// unimplemented
-			SemanticError( yylloc, "Typedef expression is currently unimplemented." ); $$ = nullptr;
-		}
-	;
-
-//c_declaration:
-//	declaring_list pop ';'
-//	| typedef_declaration pop ';'
-//	| typedef_expression pop ';'						// GCC, naming expression type
-//	| sue_declaration_specifier pop ';'
-//	;
-//
-//declaring_list:
-//		// A semantic check is required to ensure asm_name only appears on declarations with implicit or explicit static
-//		// storage-class
-//	 declarator asm_name_opt initializer_opt
-//		{
-//			typedefTable.addToEnclosingScope( IDENTIFIER );
-//			$$ = ( $2->addType( $1 ))->addAsmName( $3 )->addInitializer( $4 );
-//		}
-//	| declaring_list ',' attribute_list_opt declarator asm_name_opt initializer_opt
-//		{
-//			typedefTable.addToEnclosingScope( IDENTIFIER );
-//			$$ = $1->appendList( $1->cloneBaseType( $4->addAsmName( $5 )->addInitializer( $6 ) ) );
-//		}
-//	;
+			SemanticError( yylloc, "Typedef expression is deprecated, use typeof(...) instead." ); $$ = nullptr;
+		}
+	;
 
 c_declaration:
@@ -1715,5 +1690,5 @@
 		{ $$ = distAttr( $1, $2 ); }
 	| typedef_declaration
-	| typedef_expression								// GCC, naming expression type
+	| typedef_expression								// deprecated GCC, naming expression type
 	| sue_declaration_specifier
 	;
@@ -2094,5 +2069,6 @@
 		{ yyy = true; $$ = AggregateDecl::Union; }
 	| EXCEPTION											// CFA
-		{ yyy = true; $$ = AggregateDecl::Exception; }
+		// { yyy = true; $$ = AggregateDecl::Exception; }
+		{ SemanticError( yylloc, "exception aggregate is currently unimplemented." ); $$ = AggregateDecl::NoAggregate; }
 	;
 
Index: tests/.expect/array.txt
===================================================================
--- tests/.expect/array.txt	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ tests/.expect/array.txt	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -1,2 +1,2 @@
 array.cfa: In function '_X4mainFi___1':
-array.cfa:54:9: note: #pragma message: Compiled
+array.cfa:55:9: note: #pragma message: Compiled
Index: tests/.expect/expression.txt
===================================================================
--- tests/.expect/expression.txt	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ tests/.expect/expression.txt	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -1,2 +1,2 @@
 expression.cfa: In function '_X4mainFi___1':
-expression.cfa:88:9: note: #pragma message: Compiled
+expression.cfa:89:9: note: #pragma message: Compiled
Index: tests/.expect/poly-cycle.txt
===================================================================
--- tests/.expect/poly-cycle.txt	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ 	(revision )
@@ -1,1 +1,0 @@
-Success!
Index: tests/.expect/poly-d-cycle.txt
===================================================================
--- tests/.expect/poly-d-cycle.txt	(revision 41b8ea40e8b670142d317601e53338340fee08be)
+++ tests/.expect/poly-d-cycle.txt	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -0,0 +1,1 @@
+Success!
Index: tests/.expect/poly-o-cycle.txt
===================================================================
--- tests/.expect/poly-o-cycle.txt	(revision 41b8ea40e8b670142d317601e53338340fee08be)
+++ tests/.expect/poly-o-cycle.txt	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -0,0 +1,1 @@
+Success!
Index: tests/array.cfa
===================================================================
--- tests/array.cfa	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ tests/array.cfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -1,11 +1,11 @@
-//                               -*- Mode: C -*- 
-// 
+//                               -*- Mode: C -*-
+//
 // Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
 //
 // The contents of this file are covered under the licence agreement in the
 // file "LICENCE" distributed with Cforall.
-// 
+//
 // array.cfa -- test array declarations
-// 
+//
 // Author           : Peter A. Buhr
 // Created On       : Tue Feb 19 21:18:06 2019
@@ -13,5 +13,5 @@
 // Last Modified On : Sun Sep 27 09:05:40 2020
 // Update Count     : 4
-// 
+//
 
 int a1[0];
@@ -50,5 +50,7 @@
 
 int main() {
-	#pragma message( "Compiled" )						// force non-empty .expect file
+	#if !defined(NO_COMPILED_PRAGMA)
+		#pragma message( "Compiled" )	// force non-empty .expect file
+	#endif
 }
 
Index: tests/concurrent/park/contention.cfa
===================================================================
--- tests/concurrent/park/contention.cfa	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ tests/concurrent/park/contention.cfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -21,9 +21,9 @@
 		if(blocked[idx]) {
 			Thread * thrd = __atomic_exchange_n(&blocked[idx], 0p, __ATOMIC_SEQ_CST);
-			unpark( *thrd __cfaabi_dbg_ctx2 );
+			unpark( *thrd );
 		} else {
 			Thread * thrd = __atomic_exchange_n(&blocked[idx], &this, __ATOMIC_SEQ_CST);
-			unpark( *thrd __cfaabi_dbg_ctx2 );
-			park( __cfaabi_dbg_ctx );
+			unpark( *thrd );
+			park();
 		}
 	}
@@ -41,5 +41,5 @@
 			int idx = myrand() % blocked_size;
 			Thread * thrd = __atomic_exchange_n(&blocked[idx], 0p, __ATOMIC_SEQ_CST);
-			unpark( *thrd __cfaabi_dbg_ctx2 );
+			unpark( *thrd );
 			yield( myrand() % 20 );
 		}
Index: tests/concurrent/park/force_preempt.cfa
===================================================================
--- tests/concurrent/park/force_preempt.cfa	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ tests/concurrent/park/force_preempt.cfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -30,5 +30,5 @@
 
 		// Unpark this thread, don't force a yield
-		unpark( this __cfaabi_dbg_ctx2 );
+		unpark( this );
 		assert(mask == 0xCAFEBABA);
 
@@ -43,5 +43,5 @@
 		// Park this thread,
 		assert(mask == (id_hash ^ 0xCAFEBABA));
-		park( __cfaabi_dbg_ctx );
+		park();
 		assert(mask == (id_hash ^ 0xCAFEBABA));
 
Index: tests/concurrent/park/start_parked.cfa
===================================================================
--- tests/concurrent/park/start_parked.cfa	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ tests/concurrent/park/start_parked.cfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -3,5 +3,5 @@
 thread Parker {};
 void main( Parker & ) {
-	park( __cfaabi_dbg_ctx );
+	park();
 }
 
@@ -9,5 +9,5 @@
 	for(1000) {
 		Parker parker;
-		unpark( parker __cfaabi_dbg_ctx2 );
+		unpark( parker );
 	}
 	printf( "done\n" );									// non-empty .expect file
Index: tests/expression.cfa
===================================================================
--- tests/expression.cfa	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ tests/expression.cfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -84,4 +84,6 @@
 	(S)@{2}`mary;
 
-	#pragma message( "Compiled" )			// force non-empty .expect file
+	#if !defined(NO_COMPILED_PRAGMA)
+		#pragma message( "Compiled" )	// force non-empty .expect file
+	#endif
 } // main
Index: tests/poly-cycle.cfa
===================================================================
--- tests/poly-cycle.cfa	(revision 2fb35df396c012cc3908fbb4d946ba80cad0ce88)
+++ 	(revision )
@@ -1,28 +1,0 @@
-// Check that a cycle of polymorphic data structures can be instancated.
-
-#include <stdio.h>
-
-forall(otype T)
-struct func_table;
-
-forall(otype U)
-struct object {
-	func_table(U) * virtual_table;
-};
-
-forall(otype T)
-struct func_table {
-	void (*object_func)(object(T) *);
-};
-
-void func(object(int) *) {
-	printf("Success!\n");
-}
-
-func_table(int) an_instance = { func };
-
-int main(int argc, char * argv[]) {
-	object(int) x = { 0p };
-	an_instance.object_func( &x );
-	return 0;
-}
Index: tests/poly-d-cycle.cfa
===================================================================
--- tests/poly-d-cycle.cfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
+++ tests/poly-d-cycle.cfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -0,0 +1,28 @@
+// Check that a cycle of polymorphic dtype structures can be instantiated.
+
+#include <stdio.h>
+
+forall(dtype T)
+struct func_table; // forward declaration so object can refer to func_table
+
+forall(dtype U)
+struct object {
+	func_table(U) * virtual_table; // object -> func_table half of the cycle
+};
+
+forall(dtype T)
+struct func_table {
+	void (*object_func)(object(T) *); // func_table -> object half of the cycle
+};
+
+void func(object(int) *) {
+	printf("Success!\n"); // the only output; matches the .expect file
+}
+
+func_table(int) an_instance = { func };
+
+int main(int argc, char * argv[]) {
+	object(int) x = { 0p };
+	an_instance.object_func( &x ); // call through the instantiated table
+	return 0;
+}
Index: tests/poly-o-cycle.cfa
===================================================================
--- tests/poly-o-cycle.cfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
+++ tests/poly-o-cycle.cfa	(revision 41b8ea40e8b670142d317601e53338340fee08be)
@@ -0,0 +1,28 @@
+// Check that a cycle of polymorphic otype structures can be instantiated.
+
+#include <stdio.h>
+
+forall(otype T)
+struct func_table; // forward declaration so object can refer to func_table
+
+forall(otype U)
+struct object {
+	func_table(U) * virtual_table; // object -> func_table half of the cycle
+};
+
+forall(otype T)
+struct func_table {
+	void (*object_func)(object(T) *); // func_table -> object half of the cycle
+};
+
+void func(object(int) *) {
+	printf("Success!\n"); // the only output; matches the .expect file
+}
+
+func_table(int) an_instance = { func };
+
+int main(int argc, char * argv[]) {
+	object(int) x = { 0p };
+	an_instance.object_func( &x ); // call through the instantiated table
+	return 0;
+}
