Index: doc/papers/concurrency/Paper.tex
===================================================================
--- doc/papers/concurrency/Paper.tex	(revision b199e5464466762e6443182fb8b497b19366f31a)
+++ doc/papers/concurrency/Paper.tex	(revision 64188cc968edc37fc38282342a8e2bac5ad63a79)
@@ -22,4 +22,5 @@
 \captionsetup{justification=raggedright,singlelinecheck=false}
 \usepackage{dcolumn}						% align decimal points in tables
+\usepackage{capt-of}
 
 \hypersetup{breaklinks=true}
@@ -2158,5 +2159,5 @@
 \begin{cfa}
 unsigned int N = 10_000_000;
-#define BENCH( run, result ) Time before = getTimeNsec(); run; result = (getTimeNsec() - before) / N;
+#define BENCH( run ) Time before = getTimeNsec(); run; Duration result = (getTimeNsec() - before) / N;
 \end{cfa}
 The method used to get time is @clock_gettime( CLOCK_REALTIME )@.
@@ -2184,9 +2185,6 @@
 void main( C & ) { for ( ;; ) { @suspend();@ } }
 int main() {
-	Duration result;
 	BENCH(
-		for ( size_t i = 0; i < N; i += 1 ) { @resume( c );@ },
-		result
-	)
+		for ( size_t i = 0; i < N; i += 1 ) { @resume( c );@ } )
 	sout | result`ns | endl;
 }
@@ -2200,9 +2198,6 @@
 
 int main() {
-	Duration result;
 	BENCH(
-		for ( size_t i = 0; i < N; i += 1 ) { @yield();@ },
-		result
-	)
+		for ( size_t i = 0; i < N; i += 1 ) { @yield();@ } )
 	sout | result`ns | endl;
 }
@@ -2213,13 +2208,12 @@
 \quad
 \subfloat[Thread]{\label{f:ExternalState}\usebox\myboxB}
-\caption{\CFA Context-switch benchmark}
+\captionof{figure}{\CFA context-switch benchmark}
 \label{f:ctx-switch}
-\end{figure}
-
-\begin{table}
+
 \centering
-\caption{Context Switch comparison (nanoseconds)}
+
+\captionof{table}{Context-switch comparison (nanoseconds)}
 \label{tab:ctx-switch}
-
+\bigskip
 \begin{tabular}{|r|*{3}{D{.}{.}{3.2}|}}
 \cline{2-4}
@@ -2235,17 +2229,8 @@
 \hline
 \end{tabular}
-\end{table}
-
-
-\paragraph{Mutual-Exclusion}
-
-Mutual exclusion is measured by entering/leaving a critical section.
-For monitors, entering and leaving a monitor routine is measured.
-Figure~\ref{f:mutex} shows the code for \CFA with all results in Table~\ref{tab:mutex}.
-To put the results in context, the cost of entering a non-inline routine and the cost of acquiring and releasing a @pthread_mutex@ lock is also measured.
-Note, the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects.
-
-\begin{samepage}
-\begin{figure}[!p]
+
+\bigskip
+\bigskip
+
 \lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}}
 \begin{cfa}
@@ -2253,17 +2238,16 @@
 void __attribute__((noinline)) do_call( M & mutex m/*, m2, m3, m4*/ ) {}
 int main() {
-	Duration result;
-	BENCH( for( size_t i = 0; i < N; i += 1 ) { @do_call( m1/*, m2, m3, m4*/ );@ }, result )
+	BENCH( for( size_t i = 0; i < N; i += 1 ) { @do_call( m1/*, m2, m3, m4*/ );@ } )
 	sout | result`ns | endl;
 }
 \end{cfa}
-\caption{\CFA benchmark code used to measure mutex routines.}
+\captionof{figure}{\CFA acquire/release mutex benchmark}
 \label{f:mutex}
-\end{figure}
-
-\begin{table}[!p]
+
 \centering
-\caption{Mutex routine comparison (nanoseconds)}
+
+\captionof{table}{Mutex comparison (nanoseconds)}
 \label{tab:mutex}
+\bigskip
 
 \begin{tabular}{|r|*{3}{D{.}{.}{3.2}|}}
@@ -2281,6 +2265,14 @@
 \hline
 \end{tabular}
-\end{table}
-\end{samepage}
+\end{figure}
+
+
+\paragraph{Mutual-Exclusion}
+
+Mutual exclusion is measured by entering/leaving a critical section.
+For monitors, entering and leaving a monitor routine is measured.
+Figure~\ref{f:mutex} shows the code for \CFA with all results in Table~\ref{tab:mutex}.
+To put the results in context, the cost of entering a non-inline routine and the cost of acquiring and releasing a @pthread_mutex@ lock are also measured.
+Note the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects.
 
 
@@ -2291,6 +2283,5 @@
 Note, the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects.
 
-\begin{samepage}
-\begin{figure}[!p]
+\begin{figure}
 \lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}}
 \begin{cfa}
@@ -2305,7 +2296,6 @@
 }
 int  __attribute__((noinline)) do_wait( M & mutex m ) {
-	Duration result;
 	go = 1;	// continue other thread
-	BENCH( for ( size_t i = 0; i < N; i += 1 ) { @wait( c );@ }, result );
+	BENCH( for ( size_t i = 0; i < N; i += 1 ) { @wait( c );@ } );
 	go = 0;	// stop other thread
 	sout | result`ns | endl;
@@ -2316,12 +2306,12 @@
 }
 \end{cfa}
-\caption{Internal scheduling benchmark}
+\captionof{figure}{\CFA internal scheduling benchmark}
 \label{f:int-sched}
-\end{figure}
-
-\begin{table}[!p]
+
 \centering
-\caption{Internal scheduling comparison (nanoseconds)}
+\captionof{table}{Internal scheduling comparison (nanoseconds)}
 \label{tab:int-sched}
+\bigskip
+
 \begin{tabular}{|r|*{3}{D{.}{.}{5.2}|}}
 \cline{2-4}
@@ -2336,6 +2326,5 @@
 \hline
 \end{tabular}
-\end{table}
-\end{samepage}
+\end{figure}
 
 
@@ -2346,5 +2335,4 @@
 Note, the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects.
 
-\begin{samepage}
 \begin{figure}
 \lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}}
@@ -2359,6 +2347,7 @@
 }
 int __attribute__((noinline)) do_wait( M & mutex m ) {
-	Duration result;
-	go = 1; BENCH( for ( size_t i = 0; i < N; i += 1 ) { @waitfor( do_call, m );@ }, result ) go = 0;
+	go = 1;	// continue other thread
+	BENCH( for ( size_t i = 0; i < N; i += 1 ) { @waitfor( do_call, m );@ } )
+	go = 0;	// stop other thread
 	sout | result`ns | endl;
 }
@@ -2368,17 +2357,17 @@
 }
 \end{cfa}
-\caption{Benchmark code for external scheduling}
+\captionof{figure}{\CFA external scheduling benchmark}
 \label{f:ext-sched}
-\end{figure}
-
-\begin{table}
+
 \centering
-\caption{External scheduling comparison (nanoseconds)}
+
+\captionof{table}{External scheduling comparison (nanoseconds)}
 \label{tab:ext-sched}
+\bigskip
 \begin{tabular}{|r|*{3}{D{.}{.}{3.2}|}}
 \cline{2-4}
 \multicolumn{1}{c|}{} & \multicolumn{1}{c|}{Median} &\multicolumn{1}{c|}{Average} & \multicolumn{1}{c|}{Std Dev} \\
 \hline
-\uC @Accept@				& 350		& 350.61	& 3.11  \\
+\uC @_Accept@				& 350		& 350.61	& 3.11  \\
 \CFA @waitfor@, 1 @monitor@	& 358.5		& 358.36	& 3.82  \\
 \CFA @waitfor@, 2 @monitor@	& 422		& 426.79	& 7.95  \\
@@ -2386,16 +2375,8 @@
 \hline
 \end{tabular}
-\end{table}
-\end{samepage}
-
-
-\paragraph{Object Creation}
-
-Object creation is measured by creating/deleting the specific kind of concurrent object.
-Figure~\ref{f:creation} shows the code for \CFA, with results in Table~\ref{tab:creation}.
-The only note here is that the call stacks of \CFA coroutines are lazily created, therefore without priming the coroutine to force stack creation, the creation cost is artificially low.
-
-\begin{figure}
-\centering
+
+\bigskip
+\medskip
+
 \lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}}
 \begin{cfa}
@@ -2403,17 +2384,17 @@
 void main( MyThread & ) {}
 int main() {
-	Duration result;
-	BENCH( for ( size_t i = 0; i < N; i += 1 ) { @MyThread m;@ }, result )
+	BENCH( for ( size_t i = 0; i < N; i += 1 ) { @MyThread m;@ } )
 	sout | result`ns | endl;
 }
 \end{cfa}
-\caption{Benchmark code for \CFA object creation}
+\captionof{figure}{\CFA object creation benchmark}
 \label{f:creation}
-\end{figure}
-
-\begin{table}
+
 \centering
-\caption{Creation comparison (nanoseconds)}
+
+\captionof{table}{Creation comparison (nanoseconds)}
 \label{tab:creation}
+\bigskip
+
 \begin{tabular}{|r|*{3}{D{.}{.}{5.2}|}}
 \cline{2-4}
@@ -2430,5 +2411,12 @@
 \hline
 \end{tabular}
-\end{table}
+\end{figure}
+
+
+\paragraph{Object Creation}
+
+Object creation is measured by creating/deleting the specific kind of concurrent object.
+Figure~\ref{f:creation} shows the code for \CFA, with results in Table~\ref{tab:creation}.
+The only note here is that the call stacks of \CFA coroutines are lazily created; therefore, without priming the coroutine to force stack creation, the creation cost is artificially low.