Index: doc/papers/concurrency/Paper.tex
===================================================================
--- doc/papers/concurrency/Paper.tex	(revision aac71977640c99334881f777535dfd1070e0a264)
+++ doc/papers/concurrency/Paper.tex	(revision bb37870b4f40d34fb70d1cfcb9ab7b91908aa4bb)
@@ -484,5 +484,5 @@
 One of the important features that are missing in C is threading\footnote{While the C11 standard defines a ``threads.h'' header, it is minimal and defined as optional.
 As such, library support for threading is far from widespread.
-At the time of writing the paper, neither \texttt{gcc} nor \texttt{clang} support ``threads.h'' in their respective standard libraries.}.
+At the time of writing the paper, neither \protect\lstinline|gcc| nor \protect\lstinline|clang| support ``threads.h'' in their standard libraries.}.
 On modern architectures, a lack of threading is unacceptable~\cite{Sutter05, Sutter05b}, and therefore modern programming languages must have the proper tools to allow users to write efficient concurrent programs to take advantage of parallelism.
 As an extension of C, \CFA needs to express these concepts in a way that is as natural as possible to programmers familiar with imperative languages.
@@ -500,10 +500,9 @@
 \begin{figure}
 \begin{center}
-\begin{tabular}{c|c|c}
-\begin{cfa}
-// Using callback
-void fibonacci_func(
-	int n,
-	void (* callback)( int )
+\begin{tabular}{@{}lll@{}}
+\multicolumn{1}{c}{\textbf{callback}} & \multicolumn{1}{c}{\textbf{output array}} & \multicolumn{1}{c}{\textbf{external state}} \\
+\begin{cfa}
+void fib_func(
+	int n, void (* callback)( int )
 ) {
 	int fn, f1 = 0, f2 = 1;
@@ -518,5 +517,5 @@
 		printf( "%d\n", n );
 	}
-	fibonacci_func( 10, print_fib );
+	fib_func( 10, print_fib );
 }
 
@@ -524,8 +523,6 @@
 &
 \begin{cfa}
-// Using output array
-void fibonacci_array(
-	int n,
-	int * array
+void fib_array(
+	int n, int * array
 ) {
 	int fn, f1 = 0, f2 = 1;
@@ -538,5 +535,5 @@
 int main() {
 	int a[10];
-	fibonacci_array( 10, a );
+	fib_array( 10, a );
 	for ( int i = 0; i < 10; i++ ) {
 		printf( "%d\n", a[i] );
@@ -546,21 +543,19 @@
 &
 \begin{cfa}
-// Using external state
-typedef struct {
-	int f1, f2;
-} Iterator_t;
-int fibonacci_state(
-	Iterator_t * it
+
+typedef struct { int f1, f2; } Fib;
+int fib_state(
+	Fib * fib
 ) {
-	int ret = it->f1;
-	int fn = it->f1 + it->f2;
-	it->f2 = it->f1; it->f1 = fn;
+	int ret = fib->f1;
+	int fn = fib->f1 + fib->f2;
+	fib->f2 = fib->f1; fib->f1 = fn;
 	return ret;
 }
 int main() {
-	Iterator_t it = { 0, 1 };
+	Fib fib = { 0, 1 };
+
 	for ( int i = 0; i < 10; i++ ) {
-		printf( "%d\n",
-			fibonacci_state( &it ) );
+		printf( "%d\n", fib_state( &fib ) );
 	}
 }
@@ -568,6 +563,6 @@
 \end{tabular}
 \end{center}
-\caption{C Fibonacci Implementations}
-\label{lst:fibonacci-c}
+\caption{Fibonacci Implementations in C}
+\label{lst:fib-c}
 \end{figure}
 
@@ -605,5 +600,5 @@
 int main() {
 	Fibonacci f1, f2;
-	for ( int i = 1; i <= 10; i += 1 ) {
+	for ( int i = 1; i <= 10; i++ ) {
 		sout | next( f1 ) | next( f2 ) | endl;
 	}
@@ -761,9 +756,9 @@
 symmetric_coroutine<>::yield_type
 \end{cfa}
-Often, the canonical threading paradigm in languages is based on function pointers, \texttt{pthread} being one of the most well-known examples.
+Often, the canonical threading paradigm in languages is based on function pointers, @pthread@ being one of the most well-known examples.
 The main problem of this approach is that the thread usage is limited to a generic handle that must otherwise be wrapped in a custom type.
 Since the custom type is simple to write in \CFA and solves several issues, added support for routine/lambda based coroutines adds very little.
 
-A variation of this would be to use a simple function pointer in the same way \texttt{pthread} does for threads:
+A variation of this would be to use a simple function pointer in the same way @pthread@ does for threads:
 \begin{cfa}
 void foo( coroutine_t cid, void* arg ) {
@@ -1439,5 +1434,5 @@
 For the purpose of translating the given cfa-code into \CFA-code, any method of introducing a monitor is acceptable, e.g., @mutex@ parameters, global variables, pointer parameters, or using locals with the @mutex@ statement.
 
-\begin{figure}[!t]
+\begin{figure}
 \begin{multicols}{2}
 Waiting thread
@@ -1664,6 +1659,5 @@
 \hline
 \begin{cfa}[tabsize=3]
-monitor DatingService
-{
+monitor DatingService {
 	// compatibility codes
 	enum{ CCodes = 20 };
@@ -1677,42 +1671,25 @@
 condition exchange;
 
-int girl(int phoneNo, int cfa)
-{
+int girl(int phoneNo, int cfa) {
 	// no compatible boy ?
-	if(empty(boys[cfa]))
-	{
-		// wait for boy
-		wait(girls[cfa]);
-
-		// make phone number available
-		girlPhoneNo = phoneNo;
-
-		// wake boy from chair
-		signal(exchange);
-	}
-	else
-	{
-		// make phone number available
-		girlPhoneNo = phoneNo;
-
-		// wake boy
-		signal(boys[cfa]);
-
-		// sit in chair
-		wait(exchange);
+	if(empty(boys[cfa])) {
+		wait(girls[cfa]);		// wait for boy
+		girlPhoneNo = phoneNo;		// make phone number available
+		signal(exchange);		// wake boy from chair
+	} else {
+		girlPhoneNo = phoneNo;		// make phone number available
+		signal(boys[cfa]);		// wake boy
+		wait(exchange);		// sit in chair
 	}
 	return boyPhoneNo;
 }
-
-int boy(int phoneNo, int cfa)
-{
+int boy(int phoneNo, int cfa) {
 	// same as above
 	// with boy/girl interchanged
 }
 \end{cfa}&\begin{cfa}[tabsize=3]
-monitor DatingService
-{
-	// compatibility codes
-	enum{ CCodes = 20 };
+monitor DatingService {
+
+	enum{ CCodes = 20 }; 	// compatibility codes
 
 	int girlPhoneNo;
@@ -1724,25 +1701,13 @@
 // exchange is not needed
 
-int girl(int phoneNo, int cfa)
-{
+int girl(int phoneNo, int cfa) {
 	// no compatible boy ?
-	if(empty(boys[cfa]))
-	{
-		// wait for boy
-		wait(girls[cfa]);
-
-		// make phone number available
-		girlPhoneNo = phoneNo;
-
-		// wake boy from chair
-		signal(exchange);
-	}
-	else
-	{
-		// make phone number available
-		girlPhoneNo = phoneNo;
-
-		// wake boy
-		signal_block(boys[cfa]);
+	if(empty(boys[cfa])) {
+		wait(girls[cfa]);		// wait for boy
+		girlPhoneNo = phoneNo;		// make phone number available
+		signal(exchange);		// wake boy from chair
+	} else {
+		girlPhoneNo = phoneNo;		// make phone number available
+		signal_block(boys[cfa]); 		// wake boy
 
 		// second handshake unnecessary
@@ -1752,6 +1717,5 @@
 }
 
-int boy(int phoneNo, int cfa)
-{
+int boy(int phoneNo, int cfa) {
 	// same as above
 	// with boy/girl interchanged
@@ -2071,5 +2035,6 @@
 
 \begin{figure}
-\begin{cfa}[caption={Various correct and incorrect uses of the or, else, and timeout clause around a waitfor statement},label={lst:waitfor2}]
+\lstset{language=CFA,deletedelim=**[is][]{`}{`}}
+\begin{cfa}
 monitor A{};
 
@@ -2078,42 +2043,30 @@
 
 void foo( A & mutex a, bool b, int t ) {
-	// Correct : blocking case
-	waitfor(f1, a);
-
-	// Correct : block with statement
-	waitfor(f1, a) {
+	waitfor(f1, a);							$\C{// Correct : blocking case}$
+
+	waitfor(f1, a) {						$\C{// Correct : block with statement}$
 		sout | "f1" | endl;
 	}
-
-	// Correct : block waiting for f1 or f2
-	waitfor(f1, a) {
+	waitfor(f1, a) {						$\C{// Correct : block waiting for f1 or f2}$
 		sout | "f1" | endl;
 	} or waitfor(f2, a) {
 		sout | "f2" | endl;
 	}
-
-	// Correct : non-blocking case
-	waitfor(f1, a); or else;
-
-	// Correct : non-blocking case
-	waitfor(f1, a) {
+	waitfor(f1, a); or else;				$\C{// Correct : non-blocking case}$
+
+	waitfor(f1, a) {						$\C{// Correct : non-blocking case}$
 		sout | "blocked" | endl;
 	} or else {
 		sout | "didn't block" | endl;
 	}
-
-	// Correct : block at most 10 seconds
-	waitfor(f1, a) {
+	waitfor(f1, a) {						$\C{// Correct : block at most 10 seconds}$
 		sout | "blocked" | endl;
 	} or timeout( 10`s) {
 		sout | "didn't block" | endl;
 	}
-
-	// Correct : block only if b == true
-	// if b == false, don't even make the call
+	// Correct : block only if b == true if b == false, don't even make the call
 	when(b) waitfor(f1, a);
 
-	// Correct : block only if b == true
-	// if b == false, make non-blocking call
+	// Correct : block only if b == true if b == false, make non-blocking call
 	waitfor(f1, a); or when(!b) else;
 
@@ -2124,9 +2077,10 @@
 	waitfor(f1, a); or timeout(t); or else;
 
-	// Incorrect : order must be
-	// waitfor [or waitfor... [or timeout] [or else]]
+	// Incorrect : order must be waitfor [or waitfor... [or timeout] [or else]]
 	timeout(t); or waitfor(f1, a); or else;
 }
 \end{cfa}
+\caption{Correct and incorrect uses of the or, else, and timeout clause around a waitfor statement}
+\label{lst:waitfor2}
 \end{figure}
 
@@ -2304,5 +2258,5 @@
 It is important to present the difference between the two acquiring options: \textbf{callsite-locking} and entry-point locking, i.e., acquiring the monitors before making a mutex routine-call or as the first operation of the mutex routine-call.
 For example:
-\begin{table}[H]
+\begin{table}
 \begin{center}
 \begin{tabular}{|c|c|c|}
@@ -2394,5 +2348,5 @@
 
 \subsection{Processors}
-Parallelism in \CFA is built around using processors to specify how much parallelism is desired. \CFA processors are object wrappers around kernel threads, specifically \texttt{pthread}s in the current implementation of \CFA.
+Parallelism in \CFA is built around using processors to specify how much parallelism is desired. \CFA processors are object wrappers around kernel threads, specifically @pthread@s in the current implementation of \CFA.
 Indeed, any parallelism must go through operating-system libraries.
 However, \textbf{uthread} are still the main source of concurrency, processors are simply the underlying source of parallelism.
@@ -2403,5 +2357,5 @@
 \subsection{Stack Management}
 One of the challenges of this system is to reduce the footprint as much as possible.
-Specifically, all \texttt{pthread}s created also have a stack created with them, which should be used as much as possible.
+Specifically, all @pthread@s created also have a stack created with them, which should be used as much as possible.
 Normally, coroutines also create their own stack to run on, however, in the case of the coroutines used for processors, these coroutines run directly on the \textbf{kthread} stack, effectively stealing the processor stack.
 The exception to this rule is the Main Processor, i.e., the initial \textbf{kthread} that is given to any program.
@@ -2468,5 +2422,5 @@
 The following figure is the traditional illustration of a monitor (repeated from page~\pageref{fig:ClassicalMonitor} for convenience):
 
-\begin{figure}[H]
+\begin{figure}
 \begin{center}
 {\resizebox{0.4\textwidth}{!}{\input{monitor}}}
@@ -2483,5 +2437,5 @@
 Secondly, the thread waiting on the condition has to be separated across multiple monitors, seen in figure \ref{fig:monitor_cfa}.
 
-\begin{figure}[H]
+\begin{figure}
 \begin{center}
 {\resizebox{0.8\textwidth}{!}{\input{int_monitor}}}
@@ -2497,5 +2451,5 @@
 For a specific signalling operation every monitor needs a piece of thread on its AS-stack.
 
-\begin{figure}[b]
+\begin{figure}
 \begin{multicols}{2}
 Entry
@@ -2533,5 +2487,5 @@
 The data structures used for the AS-stack are reused extensively for external scheduling, but in the case of internal scheduling, the data is allocated using variable-length arrays on the call stack of the @wait@ and @signal_block@ routines.
 
-\begin{figure}[H]
+\begin{figure}
 \begin{center}
 {\resizebox{0.8\textwidth}{!}{\input{monitor_structs.pstex_t}}}
@@ -2682,5 +2636,5 @@
 As it was subtly alluded in section \ref{threads}, @thread@s in \CFA are in fact monitors, which means that all monitor features are available when using threads.
 For example, here is a very simple two thread pipeline that could be used for a simulator of a game engine:
-\begin{figure}[H]
+\begin{figure}
 \begin{cfa}[caption={Toy simulator using \protect\lstinline|thread|s and \protect\lstinline|monitor|s.},label={lst:engine-v1}]
 // Visualization declaration
@@ -2714,5 +2668,5 @@
 One of the obvious complaints of the previous code snippet (other than its toy-like simplicity) is that it does not handle exit conditions and just goes on forever.
 Luckily, the monitor semantics can also be used to clearly enforce a shutdown order in a concise manner:
-\begin{figure}[H]
+\begin{figure}
 \begin{cfa}[caption={Same toy simulator with proper termination condition.},label={lst:engine-v2}]
 // Visualization declaration
@@ -2767,4 +2721,5 @@
 However, once clusters are fully implemented, it will be possible to create fibers and \textbf{uthread} in the same system, as in listing \ref{lst:fiber-uthread}
 \begin{figure}
+\lstset{language=CFA,deletedelim=**[is][]{`}{`}}
 \begin{cfa}[caption={Using fibers and \textbf{uthread} side-by-side in \CFA},label={lst:fiber-uthread}]
 // Cluster forward declaration
@@ -2826,5 +2781,5 @@
 Table \ref{tab:machine} shows the characteristics of the machine used to run the benchmarks.
 All tests were made on this machine.
-\begin{table}[H]
+\begin{table}
 \begin{center}
 \begin{tabular}{| l | r | l | r |}
@@ -3108,5 +3063,5 @@
 \subsection{Object Creation}
 Finally, the last benchmark measures the cost of creation for concurrent objects.
-Listing \ref{lst:creation} shows the code for \texttt{pthread}s and \CFA threads, with results shown in table \ref{tab:creation}.
+Listing \ref{lst:creation} shows the code for @pthread@s and \CFA threads, with results shown in table \ref{tab:creation}.
 As with all other benchmarks, all omitted tests are functionally identical to one of these tests.
 The only note here is that the call stacks of \CFA coroutines are lazily created, therefore without priming the coroutine, the creation cost is very low.
@@ -3114,5 +3069,5 @@
 \begin{figure}
 \begin{center}
-\texttt{pthread}
+@pthread@
 \begin{cfa}
 int main() {
@@ -3151,6 +3106,6 @@
 \end{cfa}
 \end{center}
-\begin{cfa}[caption={Benchmark code for \texttt{pthread}s and \CFA to measure object creation},label={lst:creation}]
-\end{cfa}
+\caption{Benchmark code for \protect\lstinline|pthread|s and \CFA to measure object creation}
+\label{lst:creation}
 \end{figure}