Index: doc/theses/colby_parsons_MMAth/text/waituntil.tex
===================================================================
--- doc/theses/colby_parsons_MMAth/text/waituntil.tex	(revision 2e7a299ad51edb6d34971ddaf98d68e555e25fc4)
+++ doc/theses/colby_parsons_MMAth/text/waituntil.tex	(revision afb3d68419e40020edeb24ec6f52f94c972fccb1)
@@ -123,6 +123,6 @@
 
 \begin{figure}
-\begin{lstlisting}[language=ada,literate=]
-task type buffer is -- thread
+\begin{lstlisting}[language=ada,literate=,{moredelim={**[is][\color{red}]{@}{@}}}]
+task type buffer is -- thread type
 	... -- buffer declarations
 	count : integer := 0;
@@ -130,6 +130,6 @@
 	loop
 		select
-			when count < Size => -- guard
-			accept insert( elem : in ElemType ) do  -- method
+			@when count < Size@ => -- guard
+			@accept insert( elem : in ElemType )@ do  -- method
 				... -- add to buffer
 				count := count + 1;
@@ -137,12 +137,12 @@
 			-- executed if this accept called
 		or
-			when count > 0 => -- guard
-			accept remove( elem : out ElemType ) do  -- method
+			@when count > 0@ => -- guard
+			@accept remove( elem : out ElemType )@ do  -- method
 				... --remove and return from buffer via parameter
 				count := count - 1;
 			end;
 			-- executed if this accept called
-		or delay 10.0;  -- unblock after 10 seconds without call
-		or else -- do not block, cannot appear with delay
+		or @delay 10.0@;  -- unblock after 10 seconds without call
+		or @else@ -- do not block, cannot appear with delay
 		end select;
 	end loop;
@@ -174,8 +174,8 @@
 
 \begin{lrbox}{\myboxA}
-\begin{lstlisting}[language=go,literate=]
+\begin{lstlisting}[language=go,literate=,{moredelim={**[is][\color{red}]{@}{@}}}]
 func main() {
-	insert := make( chan int, Size )
-	remove := make( chan int, Size )
+	@insert := make( chan int, Size )@
+	@remove := make( chan int, Size )@
 	term := make( chan string )
 	finish := make( chan string )
@@ -184,8 +184,8 @@
 		L: for {
 			select { // wait for message
-			  case i = <- buffer:
+			  @case i = <- buffer:@
 			  case <- term: break L
 			}
-			remove <- i;
+			@remove <- i;@
 		}
 		finish <- "STOP" // completion
@@ -201,14 +201,14 @@
 
 \begin{lrbox}{\myboxB}
-\begin{lstlisting}[language=uC++=]
+\begin{lstlisting}[language=uC++,{moredelim={**[is][\color{red}]{@}{@}}}]
 _Task BoundedBuffer {
 	... // buffer declarations
 	int count = 0;
   public:
-	void insert( int elem ) {
+	@void insert( int elem )@ {
 		... // add to buffer
 		count += 1;
 	}
-	int remove() {
+	@int remove()@ {
 		... // remove and return from buffer
 		count -= 1;
@@ -218,6 +218,6 @@
 		for ( ;; ) {
 			_Accept( ~buffer ) break;
-			or _When ( count < Size ) _Accept( insert );
-			or _When ( count > 0 ) _Accept( remove );
+			@or _When ( count < Size ) _Accept( insert )@;
+			@or _When ( count > 0 ) _Accept( remove )@;
 		}
 	}
@@ -241,5 +241,5 @@
 Both @_Accept@ and @_Select@ statements provide guards for multiplexing clauses, as well as, timeout, and @else@ clauses.
 
-There are other languages that provide \gls{synch_multiplex}, including Rust's @select!@ over futures~\cite{rust:select}, OCaml's @select@ over channels~\cite{ocaml:channel}, and C++14's @when_any@ over futures~\cite{cpp:whenany}.
+There are other languages that provide \gls{synch_multiplex}, including Rust's @select!@ over futures~\cite{rust:select}, Java's @allof@/@anyof@ over futures~\cite{java:allof:anyof}, OCaml's @select@ over channels~\cite{ocaml:channel}, and C++14's @when_any@ over futures~\cite{cpp:whenany}.
 Note that while C++14 and Rust provide \gls{synch_multiplex}, the implementations leave much to be desired as both rely on polling to wait on multiple resources.
 
@@ -250,7 +250,7 @@
 Similar, actor systems circumvent the \gls{synch_multiplex} problem as actors only block when waiting for the next message never in a behaviour.
 While these approaches solve the \gls{synch_multiplex} problem, they introduce other issues.
-Consider the case where a thread has a single source of communication and it wants a set of @N@ resources.
-It must sequentially request the @N@ resources and wait for each response.
-During the receives for the @N@ resources, it can receive other communication, and has to save and postpone these communications, or discard them.
+Consider the case where a thread has a single source of communication and it wants a set of $N$ resources.
+It must sequentially request the $N$ resources and wait for each response.
+During the receives for the $N$ resources, it can receive other communication, and has to save and postpone these communications, or discard them.
 % If the requests for the other resources need to be retracted, the burden falls on the programmer to determine how to synchronize appropriately to ensure that only one resource is delivered.
 
@@ -294,8 +294,8 @@
 int i = 0;
 
-waituntil( Lock ) { ... }
-or when( i == 0 ) waituntil( i << Channel ) { ... }
-and waituntil( Future ) { ... }
-or waituntil( timeout( 1`s ) ) { ... }
+waituntil( @Lock@ ) { ... }
+or when( i == 0 ) waituntil( i << @Channel@ ) { ... }
+and waituntil( @Future@ ) { ... }
+or waituntil( @timeout( 1`s )@ ) { ... }
 // else { ... }
 \end{cfa}
@@ -316,16 +316,17 @@
 \begin{cfa}
 future(int) bar, foo;
-waituntil( foo ) { ... } or waituntil( bar ) { ... }
-\end{cfa}
-The reason for this semantics is that prioritizing resources can be useful in certain problems.
-In the rare case where there is a starvation problem with the ordering, it possible to follow a @waituntil@ with its reverse form:
+waituntil( foo ) { ... } or waituntil( bar ) { ... } // prioritize foo
+\end{cfa}
+The reason for this semantics is that prioritizing resources can be useful in certain problems, such as shutdown.
+In the rare case where there is a starvation problem with the ordering, it possible to follow a @waituntil@ with its reverse form, alternating which resource has the highest priority:
 \begin{cfa}
 waituntil( foo ) { ... } or waituntil( bar ) { ... } // prioritize foo
 waituntil( bar ) { ... } or waituntil( foo ) { ... } // prioritize bar
 \end{cfa}
+While this approach is not general for many resources, it handles many basic cases.
 
 The \CFA @and@ semantics match the @and@ semantics of \uC \lstinline[language=uC++]{_Select}.
 When multiple clauses are joined by @and@, the @waituntil@ makes a thread wait for all to be available, but still runs the corresponding code blocks \emph{as they become available}.
-When an @and@ clause becomes available, the waiting thread unblocks and runs that clause's code-block, and then the thread waits again for the next available clause or the @waituntil@ statement is now true.
+When an @and@ clause becomes available, the waiting thread unblocks and runs that clause's code-block, and then the thread waits again for the next available clause or the @waituntil@ statement is now satisfied.
 This semantics allows work to be done in parallel while synchronizing over a set of resources, and furthermore, gives a good reason to use the @and@ operator.
 If the @and@ operator waited for all clauses to be available before running, it is the same as just acquiring those resources consecutively by a sequence of @waituntil@ statements.
@@ -389,8 +390,8 @@
 \section{\lstinline{waituntil} Implementation}
 
-The @waituntil@ statement is not inherently complex, and Figure~\ref{f:WU_Impl} only shows the basic outline of the @waituntil@ algorithm.
-The complexity comes from the consideration of race conditions and synchronization needed when supporting various primitives.
-The following sections then use examples to fill in details missing in Figure~\ref{f:WU_Impl}.
-The full pseudocode for the @waituntil@ algorithm is presented in Figure~\ref{f:WU_Full_Impl}.
+The @waituntil@ statement is not inherently complex, and Figure~\ref{f:WU_Impl} shows the basic outline of the @waituntil@ algorithm.
+Complexity comes from the consideration of race conditions and synchronization needed when supporting various primitives.
+The following sections use examples to fill in complexity details missing in Figure~\ref{f:WU_Impl}.
+After which, the full pseudocode for the @waituntil@ algorithm is presented in Figure~\ref{f:WU_Full_Impl}.
 
 \begin{figure}
@@ -451,10 +452,10 @@
 When a @select_node@ reaches the front of the lock's queue and gains ownership, the thread blocked on the @waituntil@ is unblocked.
 Now, the lock is held by the @waituntil@ thread until the code block is executed, and then the node is unregistered, during which the lock is released.
-Immediately releasing the lock prevents the waiting thread from holding multiple locks and potentially introducing a deadlock.
+Immediately releasing the lock after the code block prevents the waiting thread from holding multiple locks and potentially introducing a deadlock.
 As such, the only unregistered nodes associated with locks are the ones that have not run.
 
 \subsection{Timeouts}
 
-A timeout for the @waituntil@ statement is a duration passed to \lstinline[deletekeywords={timeout}]{timeout}, \eg:
+A timeout for the @waituntil@ statement is a duration passed to routine \lstinline[deletekeywords={timeout}]{timeout}\footnote{\lstinline{timeout} is a quasi-keyword in \CFA, allowing it to be used an identifier.}, \eg:
 \begin{cquote}
 \begin{tabular}{@{}l|l@{}}
@@ -482,10 +483,10 @@
 Here, the multiple timeouts are useful because the durations can change during execution and the separate clauses provide different code blocks if a timeout triggers.
 Multiple timeouts can also be used with @and@ to provide a minimal delay before proceeding.
-In following example, either channel @C1@ or @C2@ must be satisfied but nothing can be done for at least 1 or 3 seconds after the channel read, respctively.
+In following example, either channel @C1@ or @C2@ must be satisfied but nothing can be done for at least 1 or 3 seconds after the channel read, respectively.
 \begin{cfa}[deletekeywords={timeout}]
 waituntil( i << C1 ); and waituntil( timeout( 1`s ) );
 or waituntil( i << C2 ); and waituntil( timeout( 3`s ) );
 \end{cfa}
-If only @C2@ is satisfied, \emph{both} timeout code-blocks trigger because 1 second ocurs before 3 seconds.
+If only @C2@ is satisfied, \emph{both} timeout code-blocks trigger because 1 second occurs before 3 seconds.
 Note, the \CFA @waitfor@ statement only provides a single @timeout@ clause because it only supports @or@ semantics.
 
@@ -497,7 +498,7 @@
 
 Channels require more complexity to allow synchronous multiplexing.
-For locks, when an outside thread releases a lock and unblocks the waituntil thread (WUT), the lock's MX property is passed to the WUT (no spinning locks).
+For locks, when an outside thread releases a lock and unblocks the @waituntil@ thread (WUT), the lock's MX property is passed to the WUT (no spinning locks).
 For futures, the outside thread deliveries a value to the future and unblocks any waiting threads, including WUTs.
-In either case, after the WUT unblocks it is safe to execute its the corresponding code block knowing access to the resource is protected by the lock or the read-only state of the future.
+In either case, after the WUT unblocks, it is safe to execute its corresponding code block knowing access to the resource is protected by the lock or the read-only state of the future.
 Similarly, for channels, when an outside thread inserts a value into a channel, it must unblock the WUT.
 However, for channels, there is a race condition that poses an issue.
@@ -510,16 +511,20 @@
 Furthermore, if both @and@ and @or@ operators are used, the @or@ operations stop behaving like exclusive-or due to the race among channel operations, \eg:
 \begin{cfa}
+int i;
 waituntil( i << A ) {} and waituntil( i << B ) {}
 or waituntil( i << C ) {} and waituntil( i << D ) {}
 \end{cfa}
 If exclusive-or semantics are followed, only the code blocks for @A@ and @B@ are run, or the code blocks for @C@ and @D@.
-However, four outside threads can simultaneously put values into @i@ and attempt to unblock the WUT to run the four code-blocks.
+However, four outside threads inserting into each channel can simultaneously put values into @i@ and attempt to unblock the WUT to run the four code-blocks.
 This case introduces a race with complexity that increases with the size of the @waituntil@ statement.
 However, due to TOCTOU issues, it is impossible to know if all resources are available without acquiring all the internal locks of channels in the subtree of the @waituntil@ clauses.
 This approach is a poor solution for two reasons.
-It is possible that once all the locks are acquired the subtree is not satisfied and the locks must be released.
+It is possible that once all the locks are acquired, the subtree is not satisfied and the locks must be released.
 This work incurs a high cost for signalling threads and heavily increase contention on internal channel locks.
 Furthermore, the @waituntil@ statement is polymorphic and can support resources that do not have internal locks, which also makes this approach infeasible.
 As such, the exclusive-or semantics are lost when using both @and@ and @or@ operators since it cannot be supported without significant complexity and significantly affects @waituntil@ performance.
+Therefore, the example of reusing variable @i@ by multiple output channels is considered a user error without exclusive-or semantics.
+Given aliasing in C, it is impossible to even warn of the potential race.
+In the future, it would be interesting to support Go-like syntax, \lstinline[language=Go]{case i := <- ...}, defining a new scoped @i@ variable for each clause.
 
 It was deemed important that exclusive-or semantics are maintained when only @or@ operators are used, so this situation has been special-cased, and is handled by having all clauses race to set a value \emph{before} operating on the channel.
@@ -527,5 +532,5 @@
 \begin{cquote}
 \begin{tabular}{@{}l|l|l@{}}
-\multicolumn{3}{@{}l@{}}{\lstinline{channel A, B; // zero size channels}} \\
+\multicolumn{3}{@{}l@{}}{\lstinline{channel(int) A, B; // zero size channels}} \\
 thread 1 & thread 2 & thread 3 \\
 \begin{cfa}
@@ -556,9 +561,9 @@
 Channels introduce another interesting implementation issue.
 Supporting both reading and writing to a channel in a @waituntil@ means that one @waituntil@ clause may be the notifier of another @waituntil@ clause.
-This poses a problem when dealing with the special-cased @or@ where the clauses need to win a race to operate on a channel.
+This conjunction poses a problem when dealing with the special-cased @or@ where the clauses need to win a race to operate on a channel.
 Consider the following example, alongside a described ordering of events to highlight the race.
 \begin{cquote}
 \begin{tabular}{@{}l|l@{}}
-\multicolumn{2}{@{}l@{}}{\lstinline{channel A, B; // zero size channels}} \\
+\multicolumn{2}{@{}l@{}}{\lstinline{channel(int) A, B; // zero size channels}} \\
 thread 1 & thread 2 \\
 \begin{cfa}[moredelim={**[is][\color{blue}]{\#}{\#}}]
@@ -573,14 +578,22 @@
 \end{tabular}
 \end{cquote}
-Assume thread 1 executes first, registers with channel @A@ and proceeds, since it is empty, and then is interrupted before registering with @B@.
-Thread 2 similarly registers with channel @B@, and proceeds, since it does not have space to insert, and then is interrupted before registering with @A@.
+Assume thread 1 executes first, registers with channel @A@ and proceeds in the @waituntil@.
+Since @A@ is empty, thread 1 cannot remove, and then thread 1 is interrupted before registering with @B@.
+Thread 2 similarly registers with channel @B@, and proceeds in the @waituntil@.
+Since @B@ is zero size there is no space to insert, and then thread 2 is interrupted before registering with @A@.
 At this point, thread 1 and 2 resume execution.
-There is now a race that must be dealt with on two fronts.
-If thread 1 and 2 only race to the \gls{cas}, \ie a clause in their own @waituntil@, thread 1 can think that it successfully removed from @B@, and thread 2 may think it successfully inserted into @A@, when only one of these operations occurs.
+Remember from above, each exclusive-or @waituntil@ holds a race to set the winning clause of the statement.
+The issue that arises is that these two @waituntil@ statements must have matching winning clauses (both @A@ clauses or both @B@ clauses) to preserve the exclusive-or semantics, since a zero-sized channel needs an insert/remove pair for an operation to occur.
+If threads 1 and 2 race to set a winner only in their own @waituntil@, thread 1 can think it successfully removed from @B@, and thread 2 can think it successfully inserted into @A@, which is an error.
+Hence, there is a race on two fronts.
+If thread 1 wins the race and sees that @B@ has a waiting insertion, then thread 2 must execute the first clause of its @waituntil@ and thread 1 execute its second.
+Correspondingly, if thread 2 wins the race and sees that @A@ has a waiting removal, then thread 1 must execute the first clause of its @waituntil@ and thread 2 execute its second.
+Any other execution scenario is incorrect for exclusive-or semantics.
+Note, that priority execution of multiple satisfied @waituntil@ causes (\ie top to bottom) is not violated because, in this scenario, there is only one satisfied clause for either thread.
 
 The Go @select@ solves this problem by acquiring all the internal locks of the channels before registering the @select@ on the channels.
 This approach eliminates the race shown above since thread 1 and 2 cannot both be registering at the same time.
 However, this approach cannot be used in \CFA, since the @waituntil@ is polymorphic.
-Not all types in a @waituntil@ have an internal lock, and when using non-channel types acquiring all the locks incurs extra unneeded overhead.
+Not all types in a @waituntil@ have an internal lock, and when using non-channel types, acquiring all the locks incurs extra unneeded overhead.
 Instead, this race is consolidated in \CFA in two phases by having an intermediate pending status value for the race.
 This race case is detectable, and if detected, each thread first races to set its own @waituntil@ race pointer to be pending.
@@ -600,5 +613,5 @@
 
     // Try to set other status, if we succeed break and return true
-    while( !CAS( other.clause_status, &cmp_status, SAT ) ) {
+    while( ! CAS( other.clause_status, &cmp_status, SAT ) ) {
         if ( cmp_status == SAT )
             return false; // If other status is SAT we lost so return false
@@ -611,5 +624,5 @@
 
         // Attempt to set own status flag back to PENDING to retry
-        if ( !CAS( mine.clause_status, &cmp_status, PENDING ) )
+        if ( ! CAS( mine.clause_status, &cmp_status, PENDING ) )
             return false; // If we fail then we lost so return false
         
@@ -627,4 +640,5 @@
 These exception mechanisms are supported through the @on_selected@ routine.
 This routine is needed by channels to detect if they are closed after unblocking in a @waituntil@ statement, to ensure the appropriate behaviour is taken and an exception is thrown.
+Hence, the channel close-down mechanism is handled correctly.
 
 \subsection{Guards and Statement Predicate}\label{s:wu_guards}
@@ -632,17 +646,5 @@
 It is trivial to check when a synchronous multiplexing utility is done for the or/xor relationship, since any resource becoming available means that the blocked thread can proceed and the @waituntil@ statement is finished.
 In \uC and \CFA, the \gls{synch_multiplex} mechanism have both an and/or relationship, which along with guards, make the problem of checking for completion of the statement difficult.
-Consider the @waituntil@ in Figure~\ref{f:WU_ComplexPredicate}.
-When the @waituntil@ thread wakes up, checking if the statement is complete is non-trivial.
-The predicate that will return if the statement in Figure~\ref{f:WU_ComplexPredicate} is satisfied is the following.
-\begin{cfa}
-A && B || C || !GA && B || !GB && A || !GA && !GB && !GC
-\end{cfa}
-Which simplifies to:
-\begin{cfa}
-( A || !GA ) && ( B || !GB ) || C || !GA && !GB && !GC
-\end{cfa}
-Checking a predicate this large with each iteration is expensive so \uC and \CFA both take steps to simplify checking statement completion.
-
-\begin{figure}
+Consider the following @waituntil@.
 \begin{cfa}
 when( GA ) waituntil( A ) {}
@@ -650,15 +652,21 @@
 or when( GC ) waituntil( C ) {}
 \end{cfa}
-\caption{\lstinline{waituntil} with a non-trivial predicate}
-\label{f:WU_ComplexPredicate}
-\end{figure}
+When the @waituntil@ thread wakes up, the following predicate represents satisfaction:
+\begin{cfa}
+A && B || C || ! GA && B || ! GB && A || ! GA && ! GB && ! GC
+\end{cfa}
+which can be simplified to:
+\begin{cfa}
+( A || ! GA ) && ( B || ! GB ) || C || ! GA && ! GB && ! GC
+\end{cfa}
+Checking the complete predicate on each iteration of the pending @waituntil@ is expensive so \uC and \CFA both take steps to simplify checking for statement completion.
 
 In the \uC @_Select@ statement, this problem is solved by constructing a tree of the resources, where the internal nodes are operators and the leaves are booleans storing the state of each resource.
-The internal nodes also store the statuses of the two subtrees beneath them.
+A diagram of the tree for the complete predicate above is shown in Figure~\ref{f:uC_select_tree}, alongside the modification of the tree that occurs when @GA@ is @false@.
+Each internal node stores the statuses of the two subtrees beneath it.
 When resources become available, their corresponding leaf node status is modified, which percolates up the tree to update the state of the statement.
 Once the root of the tree has both subtrees marked as @true@ then the statement is complete.
 As an optimization, when the internal nodes are updated, the subtrees marked as @true@ are pruned and not examined again.
 To support statement guards in \uC, the tree is modified to remove an internal node if a guard is false to maintain the appropriate predicate representation.
-An diagram of the tree for the statement in Figure~\ref{f:WU_ComplexPredicate} is shown in Figure~\ref{f:uC_select_tree}, alongside the modification of the tree that occurs when @GA@ is @false@.
 
 \begin{figure}
@@ -666,15 +674,15 @@
 \input{diagrams/uCpp_select_tree.tikz}
 \end{center}
-\caption{\uC select tree modification}
+\caption{\uC \lstinline[language=uC++]{select} tree modification}
 \label{f:uC_select_tree}
 \end{figure}
 
-The \CFA @waituntil@ statement blocks a thread until a set of resources have become available that satisfy the underlying predicate.
-The waiting condition of the @waituntil@ statement can be represented as a predicate over the resources, joined by the @waituntil@ operators, where a resource is @true@ if it is available, and @false@ otherwise.
-In \CFA, this representation is used as the mechanism to check if a thread is done waiting on the @waituntil@.
+The \CFA @waituntil@ statement blocks a thread until a set of resources have become available that satisfy the complete predicate of a @waituntil@.
+The waiting condition of the @waituntil@ statement is implemented as the complete predicate over the resources, joined by the @waituntil@ operators, where a resource is @true@ if it is available, and @false@ otherwise.
+This complete predicate is used as the mechanism to check if a thread is done waiting on a @waituntil@.
 Leveraging the compiler, a predicate routine is generated per @waituntil@ that when passes the statuses of the resources, returns @true@ when the @waituntil@ is done, and false otherwise.
 To support guards on the \CFA @waituntil@ statement, the status of a resource disabled by a guard is set to a boolean value that ensures that the predicate function behaves as if that resource is no longer part of the predicate.
 The generated code allows the predicate that is checked with each iteration to be simplified to not check guard values.
-For example, the following would be generated for the @waituntil@ shown in Figure~\ref{f:WU_ComplexPredicate}.
+For example, the following is generated for the complete predicate above:
 \begin{cfa}
 // statement completion predicate
@@ -682,14 +690,10 @@
     return nodes[0].status && nodes[1].status || nodes[2].status;
 }
-
-// skip statement if all guards false
-if ( GA || GB || GC ) {
+if ( GA || GB || GC ) {				$\C{// skip statement if all guards false}$
     select_node nodes[3];
-    nodes[0].status = !GA && GB; // A's status
-    nodes[1].status = !GB && GA; // B's status
-    nodes[2].status = !GC;       // C's status
-
+    nodes[0].status = ! GA && GB;	$\C{// A's status}$
+    nodes[1].status = ! GB && GA;	$\C{// B's status}$
+    nodes[2].status = ! GC;			$\C{// C's status}$
     // ... rest of waituntil codegen ...
-
 }
 \end{cfa}
@@ -697,5 +701,4 @@
 \uC's @_Select@, supports operators both inside and outside of the \lstinline[language=uC++]{_Select} clauses.
 In the following example, the code blocks run once their corresponding predicate inside the round braces is satisfied.
-
 \begin{lstlisting}[language=uC++,{moredelim=**[is][\color{red}]{@}{@}}]
 Future_ISM<int> A, B, C, D;
@@ -703,5 +706,5 @@
 and _Select( @D && E@ ) { ... }
 \end{lstlisting}
-This is more expressive that the @waituntil@ statement in \CFA, allowing the code block for @&&@ to only run after both resources are available.
+This feature is more expressive than the @waituntil@ statement in \CFA, allowing the code block for @&&@ to only run after \emph{both} resources are available.
 
 In \CFA, since the @waituntil@ statement supports more resources than just futures, implementing operators inside clauses is avoided for a few reasons.
@@ -712,5 +715,5 @@
 or waituntil( C && D ) { ... }
 \end{cfa}
-If the @waituntil@ acquires each lock as it becomes available, there is a possible deadlock since it is in a hold and wait situation.
+If the @waituntil@ acquires each lock as it becomes available, there is a possible deadlock since it is in a hold-and-wait situation.
 Other semantics are needed to ensure this operation is safe.
 One possibility is to use \CC's @scoped_lock@ approach described in Section~\ref{s:DeadlockAvoidance};
@@ -718,13 +721,14 @@
 Another possibility is to use resource ordering similar to \CFA's @mutex@ statement, but that alone is insufficient, if the resource ordering is not used universally.
 One other way this could be implemented is to wait until all resources for a given clause are available before proceeding to acquire them, but this also quickly becomes a poor approach.
-This approach does not work due to \gls{toctou} issues;
-it is impossible to ensure that the full set of resources are available without holding them all first.
-Operators inside clauses in \CFA could potentially be implemented with careful circumvention of the problems involved, but it was not deemed an important feature when taking into account the runtime cost need paid to handle this situation.
-The problem of operators inside clauses also becomes a difficult issue to handle when supporting channels.
-It would require some way to ensure channels used with internal operators are modified, if and only if, the corresponding code block is run, but that is not feasible due to reasons described in the exclusive-or portion of Section~\ref{s:wu_chans}.
+This approach does not work due to \gls{toctou} issues, \ie it is impossible to ensure that the full set of resources are available without holding them all first.
+Operators inside clauses in \CFA could potentially be implemented with careful circumvention of the problems.
+Finally, the problem of operators inside clauses is also a difficult to handle when supporting channels.
+It would require some way to ensure channels used with internal operators are modified, if and only if, the corresponding code block is run.
+However, that is not feasible due to reasons described in the exclusive-or portion of Section~\ref{s:wu_chans}.
+Ultimately, I did not deemed this feature to be crucial enough when taking into account the complexity and runtime cost.
 
 \subsection{The full \lstinline{waituntil} picture}
 
-Now the details have been discussed, the full pseudocode of the @waituntil@ is presented in Figure~\ref{f:WU_Full_Impl}.
+Given all the complex details handled by the @waituntil@, its full pseudocode is presented in Figure \ref{f:WU_Full_Impl}.
 Some things to note are as follows.
 The @finally@ blocks provide exception-safe \gls{raii} unregistering of nodes, and in particular, the @finally@ inside the innermost loop performs the immediate unregistering required for deadlock-freedom mentioned in Section~\ref{s:wu_locks}.
@@ -751,5 +755,5 @@
             register_select( resource, node );
 
-    while ( !check_completion( nodes ) ) {	$\C{// check predicate}$
+    while ( ! check_completion( nodes ) ) {	$\C{// check predicate}$
         // block
         for ( resource in waituntil statement ) {	$\C{// run true code blocks}$
@@ -776,10 +780,10 @@
 \end{figure}
 
-\section{Waituntil Performance}
+\section{\lstinline{waituntil} Performance}
 
 Similar facilities to @waituntil@ are discussed in Section~\ref{s:History} covering C, Ada, Rust, \CC, and OCaml.
 However, these facilities are either not meaningful or feasible to benchmark against.
 The UNIX @select@ and related utilities are not comparable since they are system calls that go into the kernel and operate on file descriptors, whereas the @waituntil@ exists solely in user space.
-Ada's \lstinline[language=Ada]{select} and \uC's \lstinline[language=uC++]{_Accept} only operates on method calls, which is done in \CFA via the @waitfor@ statement, so it is not meaningful to benchmark against the @waituntil@, which cannot wait on this resource.
+Ada's \lstinline[language=Ada]{select} and \uC's \lstinline[language=uC++]{_Accept} only operate on method calls, which is done in \CFA via the @waitfor@ statement.
 Rust and \CC only offer a busy-wait approach, which is not comparable to a blocking approach.
 OCaml's @select@ waits on channels that are not comparable with \CFA and Go channels, so OCaml @select@ is not benchmarked against Go's @select@ and \CFA's @waituntil@.
@@ -793,5 +797,5 @@
 The channel multiplexing benchmarks compare \CFA's @waituntil@ and Go's \lstinline[language=Go]{select}, where the resource being waited on is a set of channels.
 The basic structure of the benchmark has the number of cores split evenly between producer and consumer threads, \ie, with 8 cores there are 4 producer and 4 consumer threads.
-The number of resource clauses $C$ is also varied across 2, 4, and 8 clauses, where each clause has a different channel that is waits on.
+The number of resource clauses $C$ is also varied across 2, 4, and 8 clauses, where each clause has a different channel that it waits on.
 Each producer and consumer repeatedly waits to either produce or consume from one of the $C$ clauses and respective channels.
 For example, in \CFA syntax, the work loop in the consumer main with $C = 4$ clauses is:
@@ -872,5 +876,5 @@
 Go then is holding a lock for every channel, resulting in worse performance as the number of channels increase.
 In \CFA, since races are consolidated without holding all locks, it scales much better both with cores and clauses since more work can occur in parallel.
-This scalability difference is more significant on the Intel machine than the AMD machine since the Intel machine has lower cache contention costs.
+This scalability difference is more significant on the Intel machine than the AMD machine since the Intel has lower cache-contention costs.
 
 The Go approach of holding all internal channel-locks in the \lstinline[language=Go]{select} has additional drawbacks.
@@ -904,8 +908,8 @@
 In Go, this setup results in significantly worse performance since @P2@ and @C2@ cannot operate in parallel with @P1@ and @C1@ due to all locks being acquired.
 Interesting, this case may not be as pathological as it seems.
-If the set of channels belonging to a select have channels that overlap with the set of another select, these statements lose the ability to operate in parallel.
+If the set of channels belonging to a \lstinline[language=Go]{select} have channels that overlap with the set of another \lstinline[language=Go]{select}, these statements lose the ability to operate in parallel.
 The implementation in \CFA only holds a single lock at a time, resulting in better locking granularity, and hence, more parallelism.
 Comparison of this pathological case is shown in Table~\ref{t:pathGo}.
-The AMD results highlight the worst case scenario for Go since contention is more costly on this machine than the Intel machine.
+The AMD results highlight the worst-case scenario for Go since contention is more costly on this machine than the Intel machine.
 
 \begin{table}[t]
@@ -938,19 +942,4 @@
 The @waituntil@ statement checks for statement completion using a predicate function, whereas the \lstinline[language=uC++]{_Select} statement maintains a tree that represents the state of the internal predicate.
 
-\begin{figure}
-	\centering
-	\subfloat[AMD Future Synchronization Benchmark]{
-		\resizebox{0.5\textwidth}{!}{\input{figures/nasus_Future.pgf}}
-		\label{f:futureAMD}
-	}
-	\subfloat[Intel Future Synchronization Benchmark]{
-		\resizebox{0.5\textwidth}{!}{\input{figures/pyke_Future.pgf}}
-		\label{f:futureIntel}
-	}
-	\caption{\CFA \lstinline{waituntil} and \uC \lstinline{_Select} statement throughput synchronizing on a set of futures with varying wait predicates (higher is better).}
-	\caption{}
-	\label{f:futurePerf}
-\end{figure}
-
 This benchmark aims to indirectly measure the impact of various predicates on the performance of the @waituntil@ and \lstinline[language=uC++]{_Select} statements.
 The benchmark is indirect since the performance of futures in \CFA and \uC differ by a significant margin.
@@ -991,5 +980,5 @@
 
 Results of this benchmark are shown in Figure~\ref{f:futurePerf}.
-Each pair of bars is marked with the predicate name for that experiment and the value at the top of each bar is the standard deviation..
+Each pair of bars is marked with the predicate name for that experiment and the value at the top of each bar is the standard deviation.
 In detail, \uC results are lower in all cases due to the performance difference between futures and the more complex \gls{synch_multiplex} implementation.
 However, the bars for both systems have similar height patterns across the experiments.
@@ -998,2 +987,18 @@
 Interestingly, \CFA has lower variation across predicates on the AMD (excluding the special OR case), whereas \uC has lower variation on the Intel.
 Given the differences in semantics and implementation between \uC and \CFA, this test only illustrates the overall costs among the different kinds of predicates.
+
+\begin{figure}
+	\centering
+	\subfloat[AMD Future Synchronization Benchmark]{
+		\resizebox{0.5\textwidth}{!}{\input{figures/nasus_Future.pgf}}
+		\label{f:futureAMD}
+	}
+	\subfloat[Intel Future Synchronization Benchmark]{
+		\resizebox{0.5\textwidth}{!}{\input{figures/pyke_Future.pgf}}
+		\label{f:futureIntel}
+	}
+	\caption{\CFA \lstinline{waituntil} and \uC \lstinline{_Select} statement throughput synchronizing on a set of futures with varying wait predicates (higher is better).}
+	\label{f:futurePerf}
+%%%%%%%%%%%%%%%
+\vspace*{5.5in}
+\end{figure}