Index: doc/papers/concurrency/Paper.tex
===================================================================
--- doc/papers/concurrency/Paper.tex	(revision 9e0a3604fe0045465504142faaf73f8d1a3815b0)
+++ doc/papers/concurrency/Paper.tex	(revision 397edf7a638bd273ce80fb1fb107f1eaabf93eec)
@@ -2705,39 +2705,7 @@
 \label{results}
 
-To verify the implementation of the \CFA runtime, a series of microbenchmarks are performed comparing \CFA with Java OpenJDK-9, Go 1.9.2 and \uC 7.0.0.
-The benchmark computer is an AMD Opteron\texttrademark\ 6380 NUMA 64-core, 8 socket, 2.5 GHz processor, running Ubuntu 16.04.6 LTS, and \uC/\CFA are compiled with gcc 6.5.
-
-\begin{comment}
-\begin{table}
-\centering
-\caption{Experiment environment}
-\label{t:machine}
-
-\begin{tabular}{|l|r||l|r|}
-\hline
-Architecture		& x86\_64 				& NUMA node(s) 	& 8 \\
-\hline
-CPU op-mode(s)		& 32-bit, 64-bit 		& Model name 	& AMD Opteron\texttrademark\ Processor 6380 \\
-\hline
-Byte Order			& Little Endian 		& CPU Freq 		& 2.5 GHz \\
-\hline
-CPU(s)				& 64 					& L1d cache 	& 16 KiB \\
-\hline
-Thread(s) per core	& 2 					& L1i cache 	& 64 KiB \\
-\hline
-Core(s) per socket	& 8 					& L2 cache 		& 2048 KiB \\
-\hline
-Socket(s)			& 4 					& L3 cache 		& 6144 KiB \\
-\hline
-\hline
-Operating system	& Ubuntu 16.04.3 LTS	& Kernel		& Linux 4.4-97-generic \\
-\hline
-gcc					& 6.3	 				& \CFA			& 1.0.0 \\
-\hline
-Java				& OpenJDK-9 			& Go			& 1.9.2 \\
-\hline
-\end{tabular}
-\end{table}
-\end{comment}
+To verify the implementation of the \CFA runtime, a series of microbenchmarks are performed comparing \CFA with pthreads, Java OpenJDK-9, Go 1.12.6 and \uC 7.0.0.
+For comparison, the package must be multi-processor (M:N), which excludes libdill/libmill~\cite{libdill} (M:1), and use a shared-memory programming model, \eg not message passing.
+The benchmark computer is an AMD Opteron\texttrademark\ 6380 NUMA 64-core, 8 socket, 2.5 GHz processor, running Ubuntu 16.04.6 LTS, and \CFA/\uC are compiled with gcc 6.5.
 
 All benchmarks are run using the following harness.
@@ -2749,4 +2717,5 @@
 Each benchmark is performed @N@ times, where @N@ varies depending on the benchmark;
 the total time is divided by @N@ to obtain the average time for a benchmark.
+Each benchmark experiment is run 31 times.
 All omitted tests for other languages are functionally identical to the \CFA tests and available online~\cite{CforallBenchMarks}.
 
@@ -2779,12 +2748,115 @@
 \begin{tabular}[t]{@{}r*{3}{D{.}{.}{5.2}}@{}}
 \multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} & \multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
-Pthreads				& 28091		& 28073.39	& 163.1		\\
-\CFA Coroutine Lazy		& 6			& 6.07		& 0.26		\\
-\CFA Coroutine Eager	& 520		& 520.61	& 2.04		\\
-\CFA Thread				& 2032		& 2016.29	& 112.07	\\
-\uC Coroutine			& 106		& 107.36	& 1.47		\\
-\uC Thread				& 536.5		& 537.07	& 4.64		\\
-Goroutine				& 3103		& 3086.29	& 90.25		\\
-Java Thread				& 103416.5	& 103732.29	& 1137		\\
+\CFA Coroutine Lazy		& 14.3		& 14.3		& 0.32		\\
+\CFA Coroutine Eager	& 2203.7	& 2205.6	& 26.03		\\
+\CFA Thread				& 1257.8	& 1291.2	& 86.19		\\
+\uC Coroutine			& 92.2		& 91.4		& 1.58		\\
+\uC Thread				& 499.5		& 500.1		& 5.67		\\
+Goroutine				& 4397.0	& 4362.8	& 390.77	\\
+Java Thread				& 107405.0	& 107794.8	& 1601.33	\\
+% Qthreads				& 159.9		& 159.6		& 0.73		\\
+Pthreads				& 32920.9	& 32882.7	& 213.55
+\end{tabular}
+\end{multicols}
+
+
+\paragraph{Internal Scheduling}
+
+Internal scheduling is measured using a cycle of two threads signalling and waiting.
+Figure~\ref{f:int-sched} shows the code for \CFA, with results in Table~\ref{tab:int-sched}.
+Note the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects.
+Java scheduling is significantly greater because the benchmark explicitly creates multiple threads in order to prevent the JIT from making the program sequential, \ie removing all locking.
+
+\begin{multicols}{2}
+\lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}}
+\begin{cfa}
+volatile int go = 0;
+@monitor@ M { @condition c;@ } m;
+void __attribute__((noinline))
+do_call( M & @mutex@ a1 ) { @signal( c );@ }
+thread T {};
+void main( T & this ) {
+	while ( go == 0 ) { yield(); }
+	while ( go == 1 ) { do_call( m ); }
+}
+int  __attribute__((noinline))
+do_wait( M & mutex m ) with(m) {
+	go = 1;	// continue other thread
+	BENCH( for ( N ) { @wait( c );@ } );
+	go = 0;	// stop other thread
+	sout | result`ns;
+}
+int main() {
+	T t;
+	do_wait( m );
+}
+\end{cfa}
+\captionof{figure}{\CFA internal-scheduling benchmark}
+\label{f:int-sched}
+
+\columnbreak
+
+\vspace*{-16pt}
+\captionof{table}{Internal-scheduling comparison (nanoseconds)}
+\label{tab:int-sched}
+\bigskip
+
+\begin{tabular}{@{}r*{3}{D{.}{.}{5.2}}@{}}
+\multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} & \multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
+\CFA @signal@, 1 @monitor@	& 367.0		& 371.5		& 17.34		\\
+\CFA @signal@, 2 @monitor@	& 477.2		& 478.6		& 8.31		\\
+\CFA @signal@, 4 @monitor@	& 725.8		& 734.0		& 17.98		\\
+\uC @signal@				& 322.8		& 323.0 	& 3.64		\\
+Java @notify@				& 16520.0	& 20096.7	& 9378.53	\\
+Pthreads Cond. Variable		& 4931.3	& 5057.0 	& 326.80
+\end{tabular}
+\end{multicols}
+
+
+\paragraph{External Scheduling}
+
+External scheduling is measured using a cycle of two threads calling and accepting the call using the @waitfor@ statement.
+Figure~\ref{f:ext-sched} shows the code for \CFA, with results in Table~\ref{tab:ext-sched}.
+Note the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects.
+
+\begin{multicols}{2}
+\lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}}
+\vspace*{-16pt}
+\begin{cfa}
+volatile int go = 0;
+@monitor@ M {} m;
+thread T {};
+void __attribute__((noinline))
+do_call( M & @mutex@ ) {}
+void main( T & ) {
+	while ( go == 0 ) { yield(); }
+	while ( go == 1 ) { do_call( m ); }
+}
+int __attribute__((noinline))
+do_wait( M & @mutex@ m ) {
+	go = 1;	// continue other thread
+	BENCH( for ( N ) { @waitfor( do_call, m );@ } )
+	go = 0;	// stop other thread
+	sout | result`ns;
+}
+int main() {
+	T t;
+	do_wait( m );
+}
+\end{cfa}
+\captionof{figure}{\CFA external-scheduling benchmark}
+\label{f:ext-sched}
+
+\columnbreak
+
+\vspace*{-16pt}
+\captionof{table}{External-scheduling comparison (nanoseconds)}
+\label{tab:ext-sched}
+\begin{tabular}{@{}r*{3}{D{.}{.}{3.2}}@{}}
+\multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
+\CFA @waitfor@, 1 @monitor@	& 366.7		& 369.5	& 7.52	\\
+\CFA @waitfor@, 2 @monitor@	& 453.6		& 455.8	& 12.38	\\
+\CFA @waitfor@, 4 @monitor@	& 671.6		& 672.4	& 14.16	\\
+\uC @_Accept@				& 336.0		& 335.8		& 3.22
 \end{tabular}
 \end{multicols}
@@ -2825,13 +2897,14 @@
 \begin{tabular}{@{}r*{3}{D{.}{.}{3.2}}@{}}
 \multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
-C function		& 2			& 2		& 0		\\
-\CFA generator	& 2			& 2		& 0		\\
-\CFA Coroutine	& 49	& 48.68		& 0.47	\\
-\CFA Thread		& 105	& 105.57	& 1.37	\\
-\uC Coroutine	& 44	& 44		& 0		\\
-\uC Thread		& 100	& 99.29		& 0.96	\\
-Goroutine		& 145	& 147.25	& 4.15	\\
-Java Thread		& 373.5	& 375.14	& 8.72	\\
-Pthreads Thread	& 333.5	& 332.96	& 4.1
+C function		& 1.8		& 1.8	& 0		\\
+\CFA generator	& 2.7		& 2.4	& 0.27	\\
+\CFA Coroutine	& 37.8		& 37.7	& 0.22	\\
+\CFA Thread		& 93.6		& 93.8	& 1.46	\\
+\uC Coroutine	& 52.7		& 52.8	& 0.28	\\
+\uC Thread		& 93.4		& 93.7	& 1.04	\\
+Goroutine		& 140.0		& 139.7	& 2.93	\\
+Java Thread		& 374.0		& 375.8	& 10.38	\\
+% Qthreads Thread	& 159.5		& 159.3	& 0.71	\\
+Pthreads Thread	& 334.4		& 335.0	& 1.95
 \end{tabular}
 \end{multicols}
@@ -2869,113 +2942,11 @@
 \begin{tabular}{@{}r*{3}{D{.}{.}{3.2}}@{}}
 \multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
-test and test-and-test lock		& 26		& 26	& 0		\\
-Pthreads Mutex Lock				& 31		& 31.71	& 0.97	\\
-\uC @monitor@ member rtn.		& 31		& 31	& 0		\\
-\CFA @mutex@ function, 1 arg.	& 46		& 46.68	& 0.93	\\
-\CFA @mutex@ function, 2 arg.	& 84		& 85.36	& 1.99	\\
-\CFA @mutex@ function, 4 arg.	& 158		& 161	& 4.22	\\
-Java synchronized method		& 27.5		& 29.79	& 2.93
-\end{tabular}
-\end{multicols}
-
-
-\paragraph{Internal Scheduling}
-
-Internal scheduling is measured using a cycle of two threads signalling and waiting.
-Figure~\ref{f:int-sched} shows the code for \CFA, with results in Table~\ref{tab:int-sched}.
-Note, the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects.
-Java scheduling is significantly greater because the benchmark explicitly creates multiple thread in order to prevent the JIT from making the program sequential, \ie removing all locking.
-
-\begin{multicols}{2}
-\lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}}
-\begin{cfa}
-volatile int go = 0;
-@monitor@ M { @condition c;@ } m;
-void __attribute__((noinline))
-do_call( M & @mutex@ a1 ) { @signal( c );@ }
-thread T {};
-void main( T & this ) {
-	while ( go == 0 ) { yield(); }
-	while ( go == 1 ) { do_call( m ); }
-}
-int  __attribute__((noinline))
-do_wait( M & mutex m ) with(m) {
-	go = 1;	// continue other thread
-	BENCH( for ( N ) { @wait( c );@ } );
-	go = 0;	// stop other thread
-	sout | result`ns;
-}
-int main() {
-	T t;
-	do_wait( m );
-}
-\end{cfa}
-\captionof{figure}{\CFA Internal-scheduling benchmark}
-\label{f:int-sched}
-
-\columnbreak
-
-\vspace*{-16pt}
-\captionof{table}{Internal-scheduling comparison (nanoseconds)}
-\label{tab:int-sched}
-\bigskip
-
-\begin{tabular}{@{}r*{3}{D{.}{.}{5.2}}@{}}
-\multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} & \multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
-Pthreads Cond. Variable		& 6005		& 5681.43 	& 835.45	\\
-\uC @signal@				& 324		& 325.54 	& 3.02		\\
-\CFA @signal@, 1 @monitor@	& 368.5		& 370.61	& 4.77		\\
-\CFA @signal@, 2 @monitor@	& 467		& 470.5		& 6.79		\\
-\CFA @signal@, 4 @monitor@	& 700.5		& 702.46	& 7.23		\\
-Java @notify@				& 15471		& 172511	& 5689
-\end{tabular}
-\end{multicols}
-
-
-\paragraph{External Scheduling}
-
-External scheduling is measured using a cycle of two threads calling and accepting the call using the @waitfor@ statement.
-Figure~\ref{f:ext-sched} shows the code for \CFA, with results in Table~\ref{tab:ext-sched}.
-Note, the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects.
-
-\begin{multicols}{2}
-\lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}}
-\vspace*{-16pt}
-\begin{cfa}
-volatile int go = 0;
-@monitor@ M {} m;
-thread T {};
-void __attribute__((noinline))
-do_call( M & @mutex@ ) {}
-void main( T & ) {
-	while ( go == 0 ) { yield(); }
-	while ( go == 1 ) { do_call( m ); }
-}
-int __attribute__((noinline))
-do_wait( M & @mutex@ m ) {
-	go = 1;	// continue other thread
-	BENCH( for ( N ) { @waitfor( do_call, m );@ } )
-	go = 0;	// stop other thread
-	sout | result`ns;
-}
-int main() {
-	T t;
-	do_wait( m );
-}
-\end{cfa}
-\captionof{figure}{\CFA external-scheduling benchmark}
-\label{f:ext-sched}
-
-\columnbreak
-
-\vspace*{-16pt}
-\captionof{table}{External-scheduling comparison (nanoseconds)}
-\label{tab:ext-sched}
-\begin{tabular}{@{}r*{3}{D{.}{.}{3.2}}@{}}
-\multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
-\uC @_Accept@				& 358		& 359.11	& 2.53		\\
-\CFA @waitfor@, 1 @monitor@	& 359		& 360.93	& 4.07		\\
-\CFA @waitfor@, 2 @monitor@	& 450		& 449.39	& 6.62		\\
-\CFA @waitfor@, 4 @monitor@	& 652		& 655.64	& 7.73
+test and test-and-test lock		& 19.1	& 19.0	& 0.36	\\
+\CFA @mutex@ function, 1 arg.	& 46.6	& 46.8	& 0.86	\\
+\CFA @mutex@ function, 2 arg.	& 84.1	& 85.3	& 1.86	\\
+\CFA @mutex@ function, 4 arg.	& 158.6	& 160.7	& 3.07	\\
+\uC @monitor@ member rtn.		& 54.0	& 53.7	& 0.83	\\
+Java synchronized method		& 27.0	& 27.1	& 0.25	\\
+Pthreads Mutex Lock				& 33.6	& 32.7	& 1.12
 \end{tabular}
 \end{multicols}