Index: doc/papers/concurrency/examples/DatingServiceThread.cfa
===================================================================
--- doc/papers/concurrency/examples/DatingServiceThread.cfa	(revision b0795bebc0e389f1524fddfc7d31695fbea4c2e5)
+++ doc/papers/concurrency/examples/DatingServiceThread.cfa	(revision b0795bebc0e389f1524fddfc7d31695fbea4c2e5)
@@ -0,0 +1,110 @@
+#include <stdlib.hfa>									// random
+#include <fstream.hfa>
+#include <kernel.hfa>
+#include <thread.hfa>
+#include <unistd.h>										// getpid
+
+enum { CompCodes = 20 };								// number of compatibility codes
+
+thread DatingService {
+	condition Girls[CompCodes], Boys[CompCodes];
+	unsigned int girlPhoneNo, boyPhoneNo, ccode;
+}; // DatingService
+
+unsigned int girl( DatingService & mutex ds, unsigned int phoneno, unsigned int code ) with( ds ) {
+	girlPhoneNo = phoneno;  ccode = code;
+	wait( Girls[ccode] );								// wait for boy
+	girlPhoneNo = phoneno;
+	sout | "Girl:" | girlPhoneNo | "is dating Boy at" | boyPhoneNo | "with ccode" | ccode;
+	return boyPhoneNo;
+} // DatingService girl
+
+unsigned int boy( DatingService & mutex ds, unsigned int phoneno, unsigned int code ) with( ds ) {
+	boyPhoneNo = phoneno;  ccode = code;
+	wait( Boys[ccode] );								// wait for girl
+	boyPhoneNo = phoneno;
+	sout | " Boy:" | boyPhoneNo | "is dating Girl" | girlPhoneNo | "with ccode" | ccode;
+	return girlPhoneNo;
+} // DatingService boy
+
+void main( DatingService & ds ) with( ds ) {			// thread starts
+	for () {
+		waitfor( ^?{} : ds ) {
+			break;
+		} or waitfor( girl : ds ) {
+			if ( ! is_empty( Boys[ccode] ) ) {			// no compatible boy ?
+				signal_block( Boys[ccode] );			// restart boy to set phone number
+				signal_block( Girls[ccode] );			// restart girl to set phone number
+			} // if
+		} or waitfor( boy : ds ) {
+			if ( ! is_empty( Girls[ccode] ) ) {			// no compatible girl ?
+				signal_block( Girls[ccode] );			// restart girl to set phone number
+				signal_block( Boys[ccode] );			// restart boy to set phone number
+			} // if
+		}
+	}
+} // DatingService main
+
+unsigned int girlck[CompCodes];
+unsigned int boyck[CompCodes];
+
+thread Girl {
+	DatingService & TheExchange;
+	unsigned int id, ccode;
+}; // Girl
+
+void main( Girl & g ) with( g ) {
+	yield( random( 100 ) );								// do not start at the same time
+	unsigned int partner = girl( TheExchange, id, ccode );
+	girlck[id] = partner;
+} // Girl main
+
+void ?{}( Girl & g, DatingService * TheExchange, unsigned int id, unsigned int ccode ) {
+	&g.TheExchange = TheExchange;
+	g.id = id;
+	g.ccode = ccode;
+} // Girl ?{}
+
+thread Boy {
+	DatingService & TheExchange;
+	unsigned int id, ccode;
+}; // Boy
+
+void main( Boy & b ) with( b ) {
+	yield( random( 100 ) );								// don't all start at the same time
+	unsigned int partner = boy( TheExchange, id, ccode );
+	boyck[id] = partner;
+} // Boy main
+
+void ?{}( Boy & b, DatingService * TheExchange, unsigned int id, unsigned int ccode ) {
+	&b.TheExchange = TheExchange;
+	b.id = id;
+	b.ccode = ccode;
+} // Boy ?{}
+
+int main() {
+	DatingService TheExchange;
+	Girl * girls[CompCodes];
+	Boy  * boys[CompCodes];
+
+	srandom( /*getpid()*/ 103 );
+
+	for ( i; (unsigned int)CompCodes ) {
+		girls[i] = new( &TheExchange, i, i );			// TheExchange constructor needs unsigned int
+		boys[i]  = new( &TheExchange, i, CompCodes - ( i + 1 ) );
+	} // for
+
+	for ( i; CompCodes ) {
+		delete( boys[i] );
+		delete( girls[i] );
+	} // for
+
+	for ( i; CompCodes ) {
+		if ( girlck[ boyck[i] ] != boyck[ girlck[i] ] ) abort();
+	} // for
+} // main
+
+// Local Variables: //
+// tab-width: 4 //
+// compile-command: "cfa DatingServiceThread.cfa" //
+// End: //
Index: doc/papers/concurrency/examples/Fib.js
===================================================================
--- doc/papers/concurrency/examples/Fib.js	(revision b0795bebc0e389f1524fddfc7d31695fbea4c2e5)
+++ doc/papers/concurrency/examples/Fib.js	(revision b0795bebc0e389f1524fddfc7d31695fbea4c2e5)
@@ -0,0 +1,18 @@
+function * fib() {
+	var fn1 = 1, fn = 0;
+	while ( true ) {
+		var ret = fn; fn = fn1; fn1 = fn + ret;
+		yield ret;
+	} // while
+}
+
+f1 = fib();
+f2 = fib();
+for ( var i = fib.length; i < 10; i += 1 ) {
+	console.log( f1.next().value, f2.next().value );
+}
+
+// Local Variables: //
+// tab-width: 4 //
+// compile-command: "node Fib.js" //
+// End: //
Index: doc/papers/concurrency/examples/Format.js
===================================================================
--- doc/papers/concurrency/examples/Format.js	(revision b0795bebc0e389f1524fddfc7d31695fbea4c2e5)
+++ doc/papers/concurrency/examples/Format.js	(revision b0795bebc0e389f1524fddfc7d31695fbea4c2e5)
@@ -0,0 +1,33 @@
+function * Format() {
+	var g, b;
+	fini: while ( true ) {
+		for ( g = 0; g < 5; g += 1 ) {					// groups of 5 blocks
+			for ( b = 0; b < 4; b += 1 ) {				// blocks of 4 characters
+				while ( true ) {
+					ch = (yield)						// receive from send
+					if ( ch == '\0' ) break fini;
+					if ( '\n' != ch ) break
+				}
+				process.stdout.write( ch )				// receive from send
+			}
+			process.stdout.write( '  ' )				// block separator
+		}
+		process.stdout.write( '\n' )					// group separator
+	}
+	if ( g != 0 || b != 0 ) process.stdout.write( '\n' )
+}
+
+var input = "abcdefghijklmnop\nqrstuvwx\nyzxxxxxxxxxxxxx"
+
+fmt = Format()
+fmt.next()												// prime generator
+for ( var i = 0; i < input.length; i += 1 ) {
+	fmt.next( input[i] );								// send to yield
+}
+fmt.next( '\0' );										// EOF
+
+// Local Variables: //
+// comment-column: 56 //
+// tab-width: 4 //
+// compile-command: "node Format.js" //
+// End: //
Index: doc/papers/concurrency/examples/RWMonitorEXT.cfa
===================================================================
--- doc/papers/concurrency/examples/RWMonitorEXT.cfa	(revision b0795bebc0e389f1524fddfc7d31695fbea4c2e5)
+++ doc/papers/concurrency/examples/RWMonitorEXT.cfa	(revision b0795bebc0e389f1524fddfc7d31695fbea4c2e5)
@@ -0,0 +1,66 @@
+#include <fstream.hfa>
+#include <thread.hfa>
+
+volatile int SharedRW = 0;								// shared variable to test readers and writers
+
+monitor ReadersWriter {
+	int rcnt, wcnt;										// number of readers/writer using resource
+};
+
+void ?{}( ReadersWriter & rw ) with(rw) { rcnt = wcnt = 0; }
+void EndRead( ReadersWriter & mutex rw ) with(rw) { rcnt -= 1; }
+void EndWrite( ReadersWriter & mutex rw ) with(rw) { wcnt = 0; }
+void StartRead( ReadersWriter & mutex rw ) with(rw) {
+	if ( wcnt > 0 ) waitfor( EndWrite : rw );
+	rcnt += 1;
+}
+void StartWrite( ReadersWriter & mutex rw ) with(rw) {
+	if ( wcnt > 0 ) waitfor( EndWrite : rw );
+	else while ( rcnt > 0 ) waitfor( EndRead : rw );
+	wcnt = 1;
+}
+int readers( ReadersWriter & rw ) { return rw.rcnt; }
+
+void Read( ReadersWriter & rw ) {
+	StartRead( rw );
+	sout | "Reader:" | active_thread() | ", shared:" | SharedRW | " with:" | readers( rw ) | " readers";
+	yield( 3 );
+	EndRead( rw );
+}
+void Write( ReadersWriter & rw ) {
+	StartWrite( rw );
+
+	SharedRW += 1;
+	sout | "Writer:" | active_thread() | ",  wrote:" | SharedRW;
+	yield( 1 );
+	EndWrite( rw );
+}
+
+thread Worker {
+	ReadersWriter &rw;
+};
+void ?{}( Worker & w, ReadersWriter * rw ) { &w.rw = rw; }
+void main( Worker & w ) with(w) {
+	for ( 10 ) {
+		if ( rand() % 100 < 70 ) {					// decide to be a reader or writer
+			Read( rw );
+		} else {
+			Write( rw );
+		} // if
+	} // for
+}
+
+int main() {
+	enum { MaxTask = 5 };
+	ReadersWriter rw;
+	Worker *workers[MaxTask];
+
+	for ( i; MaxTask ) workers[i] = new( &rw );
+	for ( i; MaxTask ) delete( workers[i] );
+	sout | "successful completion";
+} // main
+
+// Local Variables: //
+// tab-width: 4 //
+// compile-command: "cfa -O2 RWMonitorEXT.cfa" //
+// End: //
Index: doc/papers/concurrency/examples/RWMonitorINT.cfa
===================================================================
--- doc/papers/concurrency/examples/RWMonitorINT.cfa	(revision b0795bebc0e389f1524fddfc7d31695fbea4c2e5)
+++ doc/papers/concurrency/examples/RWMonitorINT.cfa	(revision b0795bebc0e389f1524fddfc7d31695fbea4c2e5)
@@ -0,0 +1,74 @@
+#include <fstream.hfa>
+#include <thread.hfa>
+
+volatile int SharedRW = 0;								// shared variable to test readers and writers
+
+enum RW { READER, WRITER };
+monitor ReadersWriter {
+	int rcnt, wcnt;										// number of readers/writer using resource
+	condition RWers;
+};
+
+void ?{}( ReadersWriter & rw ) with(rw) { rcnt = wcnt = 0; }
+void StartRead( ReadersWriter & mutex rw ) with(rw) {
+	if ( wcnt !=0 || ! is_empty( RWers ) ) wait( RWers, READER );
+	rcnt += 1;
+	if ( ! is_empty( RWers ) && front( RWers ) == READER ) signal( RWers );
+}
+void EndRead( ReadersWriter & mutex rw ) with(rw) {
+	rcnt -= 1;
+	if ( rcnt == 0 ) signal( RWers );
+}
+void StartWrite( ReadersWriter & mutex rw ) with(rw) {
+	if ( wcnt != 0 || rcnt != 0 ) wait( RWers, WRITER );
+	wcnt = 1;
+}
+void EndWrite( ReadersWriter & mutex rw ) with(rw) {
+	wcnt = 0;
+	signal( RWers );
+}
+int readers( ReadersWriter & rw ) { return rw.rcnt; }
+
+void Read( ReadersWriter & rw ) {
+	StartRead( rw );
+	sout | "Reader:" | active_thread() | ", shared:" | SharedRW | " with:" | readers( rw ) | " readers";
+	yield( 3 );
+	EndRead( rw );
+}
+void Write( ReadersWriter & rw ) {
+	StartWrite( rw );
+
+	SharedRW += 1;
+	sout | "Writer:" | active_thread() | ",  wrote:" | SharedRW;
+	yield( 1 );
+	EndWrite( rw );
+}
+
+thread Worker {
+	ReadersWriter &rw;
+};
+void ?{}( Worker & w, ReadersWriter * rw ) { &w.rw = rw; }
+void main( Worker & w ) with(w) {
+	for ( 10 ) {
+		if ( rand() % 100 < 70 ) {					// decide to be a reader or writer
+			Read( rw );
+		} else {
+			Write( rw );
+		} // if
+	} // for
+}
+
+int main() {
+	enum { MaxTask = 5 };
+	ReadersWriter rw;
+	Worker *workers[MaxTask];
+
+	for ( i; MaxTask ) workers[i] = new( &rw );
+	for ( i; MaxTask ) delete( workers[i] );
+	sout | "successful completion";
+} // main
+
+// Local Variables: //
+// tab-width: 4 //
+// compile-command: "cfa -O2 RWMonitorINT.cfa" //
+// End: //
Index: doc/papers/concurrency/examples/channels.go
===================================================================
--- doc/papers/concurrency/examples/channels.go	(revision b0795bebc0e389f1524fddfc7d31695fbea4c2e5)
+++ doc/papers/concurrency/examples/channels.go	(revision b0795bebc0e389f1524fddfc7d31695fbea4c2e5)
@@ -0,0 +1,35 @@
+package main
+import "fmt"
+func main() {
+	type Msg struct{ i, j int }
+
+	ch1 := make( chan int )
+	ch2 := make( chan float32 )
+	ch3 := make( chan Msg )
+	hand := make( chan string )
+	shake := make( chan string )
+	gortn := func() { // thread starts
+		var i int;  var f float32;  var m Msg
+		L: for {
+			select { // wait for message
+			  case i = <- ch1: fmt.Println( i )
+			  case f = <- ch2: fmt.Println( f )
+			  case m = <- ch3: fmt.Println( m )
+			  case <- hand: break L // sentinel
+			}
+		}
+		shake <- "SHAKE" // completion
+	}
+
+	go gortn() // start thread
+	ch1 <- 0 // different messages
+	ch2 <- 2.5
+	ch3 <- Msg{1, 2}
+	hand <- "HAND" // sentinel value
+	<- shake // wait for completion
+}
+
+// Local Variables: //
+// tab-width: 4 //
+// compile-command: "go run channels.go" //
+// End: //
Index: doc/papers/concurrency/examples/channels.rs
===================================================================
--- doc/papers/concurrency/examples/channels.rs	(revision b0795bebc0e389f1524fddfc7d31695fbea4c2e5)
+++ doc/papers/concurrency/examples/channels.rs	(revision b0795bebc0e389f1524fddfc7d31695fbea4c2e5)
@@ -0,0 +1,34 @@
+#![feature(async_await)]
+
+use std::thread;
+use std::sync::mpsc;
+
+fn main() {
+	let (tx1, rx1) = mpsc::channel();
+	let (tx2, rx2) = mpsc::channel();
+	let (tx3, rx3) = mpsc::channel();
+	let (tx4, rx4) = mpsc::channel();
+	struct Msg { i : i64,  j : i64 }
+	let th = thread::spawn( || {
+		let i : i64; let f : f64; let m : Msg;
+		loop {
+			select! {
+				i = rx1.recv() => println( i );
+				f = rx2.recv() => println( f );
+				m = rx3.recv() => println( m );
+				_ = rx4.recv() => break;
+			}
+		}
+	});
+
+	tx1.send( 0 ); // different messages
+	tx2.send( 2.5 );
+	tx3.send( Msg { i:1, j:2 } );
+	tx4.send( "done" );
+	th.join().unwrap();
+}
+
+// Local Variables: //
+// tab-width: 4 //
+// compile-command: "rustc -C opt-level=3 channels.rs" //
+// End: //
Index: doc/papers/concurrency/examples/future.rs
===================================================================
--- doc/papers/concurrency/examples/future.rs	(revision b0795bebc0e389f1524fddfc7d31695fbea4c2e5)
+++ doc/papers/concurrency/examples/future.rs	(revision b0795bebc0e389f1524fddfc7d31695fbea4c2e5)
@@ -0,0 +1,10 @@
+use futures::executor::block_on;
+
+async fn hello_world() {
+    println!("hello, world!");
+}
+
+fn main() {
+    let future = hello_world(); // Nothing is printed
+    block_on(future); // `future` is run and "hello, world!" is printed
+}
Index: doc/papers/concurrency/response
===================================================================
--- doc/papers/concurrency/response	(revision b0795bebc0e389f1524fddfc7d31695fbea4c2e5)
+++ doc/papers/concurrency/response	(revision b0795bebc0e389f1524fddfc7d31695fbea4c2e5)
@@ -0,0 +1,984 @@
+    A revised version of your manuscript that takes into account the comments
+    of the referees will be reconsidered for publication.
+
+We have attempted to address all the referees' comments in the revised version
+of the paper, with notes below for each comment.
+
+=============================================================================
+
+    Reviewing: 1
+
+    As far as I can tell, the article contains three main ideas: an
+    asynchronous execution / threading model; a model for monitors to provide
+    mutual exclusion; and an implementation.  The first two ideas are drawn
+    together in Table 1: unfortunately this is on page 25 of 30 pages of
+    text. Implementation choices and descriptions are scattered throughout the
+    paper - and the sectioning of the paper seems almost arbitrary.
+
+Fixed, Table 1 is moved to the start and explained in detail.
+
+    The article is about its contributions.  Simply adding feature X to
+    language Y isn't by itself a contribution, (when feature X isn't already a
+    contribution).
+
+C++ (Y) added object-oriented programming (X) to C, where OO programming (X)
+was not a contribution.
+
+    For example: why support two kinds of generators as well as user-level
+    threads?  Why support both low and high level synchronization constructs?
+
+Fixed, as part of discussing Table 1.
+
+    Similarly I would have found the article easier to follow if it was written
+    top down, presenting the design principles, present the space of language
+    features, justify chosen language features (and rationale) and those
+    excluded, and then present implementation, and performance.
+
+Fixed, the paper is now restructured in this form.
+
+    Then the writing of the article is often hard to follow, to say the
+    least. Two examples: section 3 "stateful functions" - I've some idea
+    what that is (a function with Algol's "own" or C's "static" variables?
+    but in fact the paper has a rather more specific idea than that.
+
+Fixed, at the start of this section.
+
+    The top of page 3 throws a whole lot of definitions at the reader
+    "generator" "coroutine" "stackful" "stackless" "symmetric" "asymmetric"
+    without every stopping to define each one
+
+Hopefully fixed by moving Table 1 forward.
+
+    --- but then in footnote "C" takes the time to explain what C's "main"
+    function is? I cannot imagine a reader of this paper who doesn't know what
+    "main" is in C; especially if they understand the other concepts already
+    presented in the paper.
+
+Fixed by shortening.
+
+    The start of section 3 then does the same
+    thing: putting up a whole lot of definitions, making distinctions and
+    comparisons, even talking about some runtime details, but the critical
+    definition of a monitor doesn't appear until three pages later, at the
+    start of section 5 on p15, lines 29-34 are a good, clear, description
+    of what a monitor actually is.  That needs to come first, rather than
+    being buried again after two sections of comparisons, discussions,
+    implementations, and options that are ungrounded because they haven't
+    told the reader what they are actually talking about.  First tell the
+    reader what something is, then how they might use it (as programmers:
+    what are the rules and restrictions) and only then start comparison
+    with other things, other approaches, other languages, or
+    implementations.
+
+Hopefully fixed by moving Table 1 forward.
+
+    The description of the implementation is similarly lost in the trees
+    without ever really seeing the wood. Figure 19 is crucial here, but
+    it's pretty much at the end of the paper, and comments about
+    implementations are threaded throughout the paper without the context
+    (fig 19) to understand what's going on.
+
+We have to agree to disagree on the location of Fig 19. Early discussion about
+implementation for the various control structures are specific to that feature.
+Fig 19 shows the global runtime structure, which manages only the threading
+aspect of the control structures and their global organization.
+
+    The protocol for performance testing may just about suffice for C (although
+    is N constantly ten million, or does it vary for each benchmark)
+
+Fixed, the paper states N varies per language/benchmark so the benchmark runs
+long enough to get a good average per operation.
+
+    but such evaluation isn't appropriate for garbage-collected or JITTed
+    languages like Java or Go.
+
+Please explain. All the actions in the benchmarks occur independently of the
+storage-management scheme, e.g., acquiring a lock is an aspect of execution not
+storage. In fact, garbage-collected or JITTed languages cheat on benchmarks and
+we had to take great care to prevent cheating and measure the actual operation.
+
+    p1 only a subset of C-forall extensions?
+
+Fixed, removed.
+
+    p1 "has features often associated with object-oriented programming
+    languages, such as constructors, destructors, virtuals and simple
+    inheritance."  There's no need to quibble about this. Once a language has
+    inheritance, it's hard to claim it's not object-oriented.
+
+We have to agree to disagree. Object languages are defined by the notion of
+nested functions in an aggregate structure with a special receiver parameter
+"this", not by inheritance.  Inheritance is a polymorphic mechanism, e.g,
+Plan-9 C has simple inheritance but is not object-oriented. Because Cforall
+does not have a specific receiver, it is possible to have multiple function
+parameters as receivers, which introduces new concepts like bulk acquire for
+monitors.
+
+    p2 barging? signals-as-hints?
+
+Added a footnote for barging. We feel these terms are well known in the
+concurrency literature, especially in pthreads and Java, and both terms have
+citations with extensive explanations and further citations.
+
+    p3 start your discussion of generations with a simple example of a
+    C-forall generator.  Fig 1(b) might do: but put it inline instead of
+    the python example - and explain the key rules and restrictions on the
+    construct.  Then don't even start to compare with coroutines until
+    you've presented, described and explained your coroutines...
+    p3 I'd probably leave out the various "C" versions unless there are
+    key points to make you can't make in C-forall. All the alternatives
+    are just confusing.
+
+Hopefully fixed as this block of text has been rewritten.
+
+    p4 but what's that "with" in Fig 1(B)
+
+Footnote D explains the semantic of "with", which is like unqualified access
+for the receiver to the fields of a class from member routines, i.e., no
+"this->".
+
+    p5 start with the high level features of C-forall generators...
+
+Hopefully fixed by moving Table 1 forward.
+
+    p5 why is the paper explaining networking protocols?
+
+Fixed, added discussion on this point.
+
+    p7 lines 1-9 (transforming generator to coroutine - why would I do any of
+    this? Why would I want one instead of the other (do not use "stack" in your
+    answer!)
+
+As stated on line 1 because state declarations from the generator type can be
+moved out of the coroutine type into the coroutine main
+
+    p10 last para "A coroutine must retain its last resumer to suspend back
+    because the resumer is on a different stack. These reverse pointers allow
+    suspend to cycle backwards, " I've no idea what is going on here?  why
+    should I care?  Shouldn't I just be using threads instead?  why not?
+
+Hopefully fixed by moving Table 1 forward.
+
+    p16 for the same reasons - what reasons?
+
+Hopefully fixed by moving Table 1 forward.
+
+    p17 if the multiple-monitor entry procedure really is novel, write a paper
+    about that, and only about that.
+
+We do not believe this is a practical suggestion.
+
+    p23 "Loose Object Definitions" - no idea what that means.  in that
+    section: you can't leave out JS-style dynamic properties.  Even in
+    OOLs that (one way or another) allow separate definitions of methods
+    (like Objective-C, Swift, Ruby, C#) at any time a runtime class has a
+    fixed definition.  Quite why the detail about bit mask implementation
+    is here anyway, I've no idea.
+
+Fixed by rewriting the section.
+
+    p25 this cluster isn't a CLU cluster then?
+
+No. A CLU cluster is like a class in an object-oriented programming language.
+A CFA cluster is a runtime organizational mechanism.
+
+    * conclusion should conclude the paper, not the related.
+
+We do not understand this comment.
+
+=============================================================================
+
+    Reviewing: 2
+
+    There is much description of the system and its details, but nothing about
+    (non-artificial) uses of it. Although the microbenchmark data is
+    encouraging, arguably not enough practical experience with the system has
+    been reported here to say much about either its usability advantages or its
+    performance.
+
+We have a Catch-22 problem. Without publicity, there is no user community;
+without a user community, there are no publications for publicity.
+
+    p2: lines 4--9 are a little sloppy. It is not the languages but their
+    popular implementations which "adopt" the 1:1 kernel threading model.
+
+Fixed.
+
+    line 10: "medium work" -- "medium-sized work"?
+
+Fixed.
+
+    line 18: "is all sequential to the compiler" -- not true in modern
+    compilers, and in 2004 H-J Boehm wrote a tech report describing exactly why
+    ("Threads cannot be implemented as a library", HP Labs).
+
+We will have to disagree on this point. First, I am aware of Hans's 2004 paper
+because in that paper Hans cites my seminal work on this topic from 1995, which
+we cite in this paper.  Second, while modern memory-models have been added to
+languages like Java/C/C++ and new languages usually start with a memory model,
+it is still the programmer's responsibility to use them for racy code. Only
+when the programing language provides race-free constructs is the language
+aware of the concurrency; otherwise the code is sequential. Hans's paper "You
+Don't Know Jack About Shared Variables or Memory Models" talks about these
+issues, and is also cited in the paper.
+
+    line 20: "knows the optimization boundaries" -- I found this vague. What's
+    an example?
+
+Fixed.
+
+    line 31: this paragraph has made a lot of claims. Perhaps forward-reference
+    to the parts of the paper that discuss each one.
+
+Fixed by adding a road-map paragraph at the end of the introduction.
+
+    line 33: "so the reader can judge if" -- this reads rather
+    passive-aggressively. Perhaps better: "... to support our argument that..."
+
+Fixed.
+
+    line 41: "a dynamic partitioning mechanism" -- I couldn't tell what this
+    meant
+
+Fixed.
+
+    p3. Presenting concept of a "stateful function" as a new language feature
+    seems odd. In C, functions often have local state thanks to static local
+    variables (or globals, indeed). Of course, that has several
+    limitations. Can you perhaps present your contributions by enumerating
+    these limitations? See also my suggestion below about a possible framing
+    centred on a strawman.
+
+Fixed, at the start of this section.
+
+    line 2: "an old idea that is new again" -- this is too oblique
+
+Fixed, removed.
+
+    lines 2--15: I found this to be a word/concept soup. Stacks, closures,
+    generators, stackless stackful, coroutine, symmetric, asymmetric,
+    resume/suspend versus resume/resume... there needs to be a more gradual and
+    structured way to introduce all this, and ideally one that minimises
+    redundancy. Maybe present it as a series of "definitions" each with its own
+    heading, e.g. "A closure is stackless if its local state has statically
+    known fixed size"; "A generator simply means a stackless closure." And so
+    on. Perhaps also strongly introduce the word "activate" as a direct
+    contrast with resume and suspend. These are just a flavour of the sort of
+    changes that might make this paragraph into something readable.
+
+    Continuing the thought: I found it confusing that by these definitions, a
+    stackful closure is not a stack, even though logically the stack *is* a
+    kind of closure (it is a representation of the current thread's
+    continuation).
+
+Fixed. Rewrote paragraph and moved Table 1 forward.
+
+    lines 24--27: without explaining what the boost functor types mean, I don't
+    think the point here comes across.
+
+Replaced with uC++ example because boost appears to have dropped symmetric
+coroutines.
+
+    line 34: "semantically coupled" -- I wasn't sure what this meant
+
+Fixed.
+
+    p4: the point of Figure 1 (C) was not immediately clear. It seem to be
+    showing how one might "compile down" Figure 1 (B). Or is that Figure 1 (A)?
+
+Fixed. Rewrote sentence.
+
+    It's right that the incidental language features of the system are not
+    front-and-centre, but I'd appreciate some brief glossing of non-C languages
+    features as they appear. Examples are the square bracket notation, the pipe
+    notation and the constructor syntax. These explanations could go in the
+    caption of the figure which first uses them, perhaps. Overall I found the
+    figure captions to be terse, and a missed opportunity to explain clearly
+    what was going on.
+
+Fixed, added descriptive footnote about Cforall. We prefer to put text in the
+body of the paper and keep captions short.
+
+    p5 line 23: "This restriction is removed..." -- give us some up-front
+    summary of your contributions and the elements of the language design that
+    will be talked about, so that this isn't an aside. This will reduce the
+    "twisty passages" feeling that characterises much of the paper.
+
+Fixed, remove parenthesis.
+
+    line 40: "a killer asymmetric generator" -- this is stylistically odd, and
+    the sentence about failures doesn't convincingly argue that C\/ will help
+    with them. Have you any experience writing device drivers using C\/? Or any
+    argument that the kinds of failures can be traced to the "stack-ripping"
+    style that one is forced to use without coroutines ?
+
+Fixed, added new paragraph.
+
+    Also, a typo on line
+    41: "device drives". And saying "Windows/Linux" is sloppy... what does the
+    cited paper actually say?
+
+Fixed.
+
+    p6 lines 13--23: this paragraph is difficult to understand. It seems to be
+    talking about a control-flow pattern roughly equivalent to tail recursion.
+    What is the high-level point, other than that this is possible?
+
+Fixed, rewrote start of the paragraph.
+
+    line 34: "which they call coroutines" -- a better way to make this point is
+    presumably that the C++20 proposal only provides a specialised kind of
+    coroutine, namely generators, despite its use of the more general word.
+
+Fixed.
+
+    line 47: "... due to dynamic stack allocation, execution..." -- this
+    sentence doesn't scan. I suggest adding "and for" in the relevant places
+    where currently there are only commas.
+
+Fixed.
+
+    p8 / Figure 5 (B) -- the GNU C extension of unary "&&" needs to be
+    explained.
+
+Fixed, added explanation at first usage in Figure 1(C) and reference.
+
+    The whole figure needs a better explanation, in fact.
+
+Fixed, rewrote start of the paragraph.
+
+    p9, lines 1--10: I wasn't sure this stepping-through really added much
+    value. What are the truly important points to note about this code?
+
+Fixed, shortened and merged with previous paragraph.
+
+    p10: similarly, lines 3--27 again are somewhere between tedious and
+    confusing. I'm sure the motivation and details of "starter semantics" can
+    both be stated much more pithily.
+
+Fixed, shortened these paragraphs.
+
+    line 32: "a self-resume does not overwrite the last resumer" -- is this a
+    hack or a defensible principled decision?
+
+Fixed, removed but it is a defensible principled decision.
+
+    p11: "a common source of errors" -- among beginners or among production
+    code? Presumably the former.
+
+Forgetting is not specific to beginners.
+
+    line 23: "with builtin and library" -- not sure what this means
+
+Fixed.
+
+    lines 31--36: these can be much briefer. The only important point here
+    seems to be that coroutines cannot be copied.
+
+Fixed, shortened.
+
+    p12: line 1: what is a "task"? Does it matter?
+
+Fixed, "task" has been changed to "thread" throughout the paper.
+
+     line 7: calling it "heap stack" seems to be a recipe for
+     confusion. "Stack-and-heap" might be better, and contrast with
+     "stack-and-VLS" perhaps. When "VLS" is glossed, suggest actually expanding
+     its initials: say "length" not "size".
+
+Fixed, make correction and rewrote some of the text.
+
+     line 21: are you saying "cooperative threading" is the same as
+     "non-preemptive scheduling", or that one is a special case (kind) of the
+     other? Both are defensible, but be clear.
+
+Fixed, clarified the definitions.
+
+    line 27: "mutual exclusion and synchronization" -- the former is a kind of
+    the latter, so I suggest "and other forms of synchronization".
+
+We have to agree to disagree. Included a citation that explains the
+differences.
+
+    line 30: "can either be a stackless or stackful" -- stray "a", but also,
+    this seems to be switching from generic/background terminology to
+    C\/-specific terminology.
+
+Fixed, but the terms stackless or stackful are not specific to Cforall; they
+are well known in the literature.
+
+    An expositional idea occurs: start the paper with a strawman naive/limited
+    realisation of coroutines -- say, Simon Tatham's popular "Coroutines in C"
+    web page -- and identify point by point what the limitations are and how
+    C\/ overcomes them. Currently the presentation is often flat (lacking
+    motivating contrasts) and backwards (stating solutions before
+    problems). The foregoing approach might fix both of these.
+
+We prefer the current structure of our paper and believe the paper does
+explain basic coding limitations and how they are overcome by using high-level
+control-flow mechanisms.
+
+    page 13: line 23: it seems a distraction to mention the Python feature
+    here.
+
+Why? It is the first location in the paper where dynamic allocation and
+initialization are mentioned.
+
+    p14 line 5: it seems odd to describe these as "stateless" just because they
+    lack shared mutable state. It means the code itself is even more
+    stateful. Maybe the "stack ripping" argument could usefully be given here.
+
+Fixed, changed "stateless" to "non-shared".
+
+    line 16: "too restrictive" -- would be good to have a reference to justify
+    this, or at least give a sense of what the state-of-the-art performance in
+    transactional memory systems is (both software and hardware)
+
+Fixed, added 2 citations.
+
+    line 22: "simulate monitors" -- what about just *implementing* monitors?
+    isn't that what these systems do? or is the point more about refining them
+    somehow into something more specialised?
+
+Fixed, changed "simulate monitors" to "manually implement a monitor".
+
+    p15: sections 4.1 and 4.2 seem adrift and misplaced. Split them into basic
+    parts (which go earlier) and more advanced parts (e.g. barging, which can
+    be explained later).
+
+Fixed, removed them by shortening and merging with previous section.
+
+    line 31: "acquire/release" -- misses an opportunity to contrast the
+    monitor's "enter/exit" abstraction with the less structured acquire/release
+    of locks.
+
+Fixed, added "by call/return" in sentence.
+
+    p16 line 12: the "implicit" versus "explicit" point is unclear. Is it
+    perhaps about the contract between an opt-in *discipline* and a
+    language-enforced *guarantee*?
+
+Fixed.
+
+    line 28: no need to spend ages dithering about which one is default and
+    which one is the explicit qualifier. Tell us what you decided, briefly
+    justify it, and move on.
+
+Fixed, shortened paragraph.
+
+    p17: Figure 11: since the main point seems to be to highlight bulk acquire,
+    include a comment which identifies the line where this is happening.
+
+Fixed.
+
+    line 2: "impossible to statically..." -- or dynamically. Doing it
+    dynamically would be perfectly acceptable (locking is a dynamic operation
+    after all)
+
+Fixed, clarified the "statically" applied to the unknown-sized pointer types.
+
+    "guarantees acquisition order is consistent" -- assuming it's done in a
+    single bulk acquire.
+
+Fixed.
+
+    p18: section 5.3: the text here is a mess. The explanations of "internal"
+    versus "external" scheduling are unclear, and "signals as hints" is not
+    explained. "... can cause thread starvation" -- means including a while
+    loop, or not doing so? "There are three signalling mechanisms.." but the
+    text does not follow that by telling us what they are. My own scribbled
+    attempt at unpicking the internal/external thing: "threads already in the
+    monitor, albeit waiting, have priority over those trying to enter".
+
+Fixed, rewrote and shortened paragraphs.
+
+    p19: line 3: "empty condition" -- explain that condition variables don't
+    store anything. So being "empty" means that the queue of waiting threads
+    (threads waiting to be signalled that the condition has become true) is
+    empty.
+
+Fixed, changed condition variable to condition queue throughout the paper.
+
+    line 6: "... can be transformed into external scheduling..." -- OK, but
+    give some motivation.
+
+The paper states that it removes the condition queues and signal/wait. Changed
+"transform" to "simplified".
+
+    p20: line 6: "mechnaism"
+
+Fixed.
+
+    lines 16--20: this is dense and can probably only be made clear with an
+    example
+    
+Fixed, rewrote and added example.
+
+    p21 line 21: clarify that nested monitor deadlock was describe earlier (in
+    5.2). (Is the repetition necessary?)
+
+Fixed, put in a forward reference, and the point bears repeating because
+releasing a subset of acquired monitors is unique to Cforall concurrency.
+
+    line 27: "locks, and by extension monitors" -- this is true but the "by
+    extension" argument is faulty. It is perfectly possible to use locks as a
+    primitive and build a compositional mechanism out of them,
+    e.g. transactions.
+
+True, but that is not what we said. Locks are not composable, monitors are
+built using locks not transactions, so by extension monitors are not composable.
+
+    p22 line 2: should say "restructured"
+
+Fixed.
+
+    line 33: "Implementing a fast subset check..." -- make clear that the
+    following section explains how to do this. Restructuring the sections
+    themselves could do this, or noting in the text.
+
+Fixed, added a forward reference to the following sections.
+
+    p23: line 3: "dynamic member adding, eg, JavaScript" -- needs to say "as
+    permitted in JavaScript", and "dynamically adding members" is stylistically
+    better
+
+Fixed.
+
+    p23: line 18: "urgent stack" -- back-reference to where this was explained
+    before
+
+Fixed.
+
+    p24 line 7: I did not understand what was more "direct" about "direct
+    communication". Also, what is a "passive monitor" -- just a monitor, given
+    that monitors are passive by design?
+
+The back half of line 7 defines "direct". For example, Go, Java, pthread
+threads cannot directly call/communicate with one another, where they can in
+Ada, uC++, and Cforall threads. Figure 18 shows this exact difference.
+
+A monitor object is *passive* because it does not have a thread, while a Go,
+Java, Cforall "thread" object is *active* because it has a thread.
+
+    line 14 / section 5.9: this table was useful and it (or something like it)
+    could be used much earlier on to set the structure of the rest of the
+    paper.
+
+Fixed, Table 1 is moved to the start and explained in detail.
+
+    The explanation at present is too brief, e.g. I did not really understand
+    the point about cases 7 and 8. Table 1: what does "No / Yes" mean?
+
+Fixed, expanded the explanation.
+
+    p25 line 2: instead of casually dropping in a terse explanation for the
+    newly introduced term "virtual processor", introduce it
+    properly. Presumably the point is to give a less ambiguous meaning to
+    "thread" by reserving it only for C\/'s green threads.
+
+Fixed.
+
+    p26 line 15: "transforms user threads into fibres" -- a reference is needed
+    to explain what "fibres" means... guessing it's in the sense of Adya et al.
+
+Fixed. In a prior correction, the term fibre from Adya is defined.
+
+    line 20: "Microsoft runtime" -- means Windows?
+
+Fixed.
+
+    lines 21--26: don't say "interrupt" to mean "signal", especially not
+    without clear introduction. You can use "POSIX signal" to disambiguate from
+    condition variables' "signal".
+
+We have to agree to disagree on this terminology. Interrupt is the action of
+stopping the CPU while a signal is a specific kind of interrupt. The two terms
+seem to be well understood in the literature.
+
+    p27 line 3: "frequency is usually long" -- that's a "time period" or
+    "interval", not a frequency
+
+Fixed.
+
+    line 5: the lengthy quotation is not really necessary; just paraphrase the
+    first sentence and move on.
+
+Fixed.
+
+    line 20: "to verify the implementation" -- I don't think that means what is
+    intended
+
+Fixed, changed "verify" to "test".
+
+    Tables in section 7 -- too many significant figures. How many overall runs
+    are described? What is N in each case?
+
+Fixed. As stated, N=31.
+
+    p29 line 2: "to eliminate this cost" -- arguably confusing since nowadays
+    on commodity CPUs most of the benefits of inlining are not to do with call
+    overheads, but from later optimizations enabled as a consequence of the
+    inlining
+
+Fixed.
+
+    line 41: "a hierarchy" -- are they a hierarchy? If so, this could be
+    explained earlier. Also, to say these make up "an integrated set... of
+    control-flow features" verges on the tautologous.
+
+Fixed, rewrote sentence.
+
+    p30 line 15: "a common case being web servers and XaaS" -- that's two cases
+
+Fixed.
+
+============================================================================
+
+    Reviewing: 3
+
+    * Expand on the motivations for including both generator and coroutines, vs
+      trying to build one atop the other
+
+Fixed, Table 1 is moved to the start and explained in detail.
+
+    * Expand on the motivations for having both symmetric and asymmetric
+      coroutines?
+
+A coroutine is not marked as symmetric or asymmetric, it is a coroutine.
+Symmetric or asymmetric is a stylistic use of a coroutine. By analogy, a
+function is not marked as recursive or non-recursive. Recursion is a style of
+programming with a function. So there is no notion of motivation for having
+both symmetric and asymmetric as they follow from how a programmer uses suspend
+and resume.
+
+    * Comparison to async-await model adopted by other languages
+
+Fixed, added a new section on this topic.
+
+    * Consider performance comparisons against node.js and Rust frameworks
+
+Fixed.
+
+    * Discuss performance of monitors vs finer-grained memory models and atomic
+      operations found in other languages
+
+The paper never suggested high-level concurrency constructs can or should
+replace race programming or hardware atomics. The paper suggests programmers
+use high-level constructs when and where it is feasible because they are easier
+and safer to use. The monitor example of an atomic counter is just that, an
+example, not the way it should be done if maximal performance is required.  We
+have tried to make this point clear in the paper.
+
+    * Why both internal/external scheduling for synchronization?
+
+Some additional motivation has been added.
+
+    * Generators are not exposed as a "function" that returns a generator
+      object, but rather as a kind of struct, with communication happening via
+      mutable state instead of "return values".
+
+Yes, Cforall uses an object-style of coroutine, which allows multiple interface
+functions that pass and return values through a structure. This approach allows
+a generator function to have different kinds of return values and different
+kinds of parameters to produce those values. Our generators can provide this
+capability via multiple interface functions to the generator/coroutine state,
+which is discussed on page 5, lines 13-21.
+
+      That is, the generator must be manually resumed and (if I understood) it
+      is expected to store values that can then later be read (perhaps via
+      methods), instead of having a `yield <Expr>` statement that yields up a
+      value explicitly.
+
+All generators are manually resumed, e.g., Python/nodejs use "next" to resume a
+generator. Yes, yield <Expr> has a single interface with one input/return type,
+versus the Cforall approach allowing arbitrary number of interfaces of
+arbitrary types.
+
+    * Both "symmetric" and "asymmetric" generators are supported, instead of
+      only asymmetric.
+
+Yes, because they support different functionality as discussed in Chris
+Marlin's seminal work and both forms are implemented in Simula67. We did not
+invent symmetric and asymmetric generators/coroutines, we took them from the
+literature.
+
+    * Coroutines (multi-frame generators) are an explicit mechanism.
+
+    In most other languages, coroutines are rather built by layering
+    single-frame generators atop one another (e.g., using a mechanism like
+    async-await),
+
+We disagree. Node.js has async-await but has a separate coroutine feature.
+While there are claims that coroutines can be built from async-await and/or
+continuations, in actuality they cannot.
+
+    and symmetric coroutines are basically not supported. I'd like to see a bit
+    more justification for Cforall including all the above mechanisms -- it
+    seemed like symmetric coroutines were a useful building block for some of
+    the user-space threading and custom scheduler mechanisms that were briefly
+    mentioned later in the paper.
+
+Hopefully fixed by moving Table 1 forward.
+
+    In the discussion of coroutines, I would have expected a bit more of a
+    comparison to the async-await mechanism offered in other languages.
+
+We added a new section at the start to point out there is no comparison between
+coroutines and async-await.
+
+    Certainly the semantics of async-await in JavaScript implies
+    significantly more overhead (because each async fn is a distinct heap
+    object). [Rust's approach avoids this overhead][zc], however, and might be
+    worthy of a comparison (see the Performance section).
+
+We could not get Rust async-await to work, and when reading the description of
+rust async-await, it appears to be Java-style executors with futures (possibly
+fast futures).
+
+    There are several sections in the paper that compare against atomics -- for
+    example, on page 15, the paper shows a simple monitor that encapsulates an
+    integer and compares that to C++ atomics. Later, the paper compares the
+    simplicity of monitors against the `volatile` quantifier from Java. The
+    conclusion in section 8 also revisits this point.
+    While I agree that monitors are simpler, they are obviously also
+    significantly different from a performance perspective -- the paper doesn't
+    seem to address this at all. It's plausible that (e.g.) the `Aint` monitor
+    type described in the paper can be compiled and mapped to the specialized
+    instructions offered by hardware, but I didn't see any mention of how this
+    would be done.
+
+Fixed, see response above.
+
+    There is also no mention of the more nuanced memory ordering
+    relations offered by C++11 and how one might achieve similar performance
+    characteristics in Cforall (perhaps the answer is that one simply doesn't
+    need to; I think that's defensible, but worth stating explicitly).
+
+Cforall is built on C, and therefore has full access to all the gcc atomics,
+and automatically gets any gcc updates.  Furthermore, section 6.9 states that
+Cforall provides the full panoply of low-level locks, as does Java, Go, C++,
+for performance programming.
+
+    Cforall includes both internal and external scheduling; I found the
+    explanation for the external scheduling mechanism to be lacking in
+    justification. Why include both mechanisms when most languages seem to make
+    do with only internal scheduling? It would be useful to show some scenarios
+    where external scheduling is truly more powerful.
+
+Fixed. Pointed out external scheduling is simpler as part of rewriting in that
+section, and added additional examples.
+
+    I would have liked to see some more discussion of external scheduling and
+    how it interacts with software engineering best practices. It seems
+    somewhat similar to AOP in certain regards. It seems to add a bit of "extra
+    semantics" to monitor methods, in that any method may now also become a
+    kind of synchronization point.
+
+Fixed somewhat. Pointed out that external scheduling has been around for a long
+time (40 years) in Ada, so there is a body of the software-engineering
+experience using it. As well, I have been teaching it for 30 years in the
+concurrency course at Waterloo. We don't know what software engineering best
+practices you imagine it interacting with. Yes, monitor functions are
+synchronization points with external scheduling.
+
+    The "open-ended" nature of this feels like it could easily lead to subtle
+    bugs, particularly when code refactoring occurs (which may e.g. split an
+    existing method into two).
+
+Any time a public interface is refactored, it invalidates existing calls, so there
+is always an issue. For mutex routines and external scheduling, the waitfor
+statements may have to be updated, but that update is part of the refactoring.
+
+    This seems particularly true if external scheduling can occur across
+    compilation units -- the paper suggested that this is true, but I wasn't
+    entirely clear.
+
+Every aspect of Cforall allows separate compilation. The function prototypes
+necessary for separate compilation provide all the information necessary to
+compile any aspect of a program.
+
+    I would have also appreciated a few more details on how external scheduling
+    is implemented. It seems to me that there must be some sort of "hooks" on
+    mutex methods so that they can detect whether some other function is
+    waiting on them and awaken those blocked threads. I'm not sure how such
+    hooks are inserted, particularly across compilation units.
+
+Hooks are inserted by the Cforall translator, in the same way that Java
+inserted hooks into a "synchronized" member of a monitor. As for Java, as long
+as the type information is consistent across compilation units, the correct
+code is inserted.
+
+    The material in Section 5.6 didn't quite clarify the matter for me. For
+    example, it left me somewhat confused about whether the `f` and `g`
+    functions declared were meant to be local to a translation unit, or shared
+    with other unit.
+
+There are no restrictions with respect to static or external mutex functions.
+Cforall is C. Any form of access or separate compilation in C applies to
+Cforall. As in C, function prototypes carry all necessary information to
+compile the code.
+
+    To start, I did not realize that the `mutex_opt` notation was a keyword, I
+    thought it was a type annotation. I think this could be called out more
+    explicitly.
+
+Fixed, indicated "mutex" is a C-style parameter-only declaration type-qualifier.
+
+    Later, in section 5.2, the paper discusses `nomutex` annotations, which
+    initially threw me, as they had not been introduced (now I realize that
+    this paragraph is there to justify why there is no such keyword). The
+    paragraph might be rearranged to make that clearer, perhaps by leading with
+    the choice that Cforall made.
+
+Fixed, rewrote paragraph removing nomutex.
+
+    On page 17, the paper states that "acquiring multiple monitors is safe from
+    deadlock", but this could be stated a bit more precisely: acquiring
+    multiple monitors in a bulk-acquire is safe from deadlock (deadlock can
+    still result from nested acquires).
+
+Fixed.
+
+    On page 18, the paper states that wait states do not have to be enclosed in
+    loops, as there is no concern of barging. This seems true but there are
+    also other reasons to use loops (e.g., if there are multiple reasons to
+    notify on the same condition). Thus the statement initially surprised me,
+    as barging is only one of many reasons that I typically employ loops around
+    waits.
+
+Fixed. Rewrote the sentence. Note, for all non-barging cases where you employ a
+loop around a wait, the unblocking thread must change state before blocking
+again.  In the barging case, the unblocking thread blocks again without
+changing state.
+
+    I did not understand the diagram in Figure 12 for some time. Initially, I
+    thought that it was generic to all monitors, and I could not understand the
+    state space. It was only later that I realized it was specific to your
+    example. Updating the caption from "Monitor scheduling to "Monitor
+    scheduling in the example from Fig 13" might have helped me quite a bit.
+
+Fixed, updated text to clarify. Did not change the caption because the
+signal_block does not apply to Figure 13.
+
+    I spent quite some time reading the boy/girl dating example (\*) and I
+    admit I found it somewhat confusing. For example, I couldn't tell whether
+    there were supposed to be many "girl" threads executing at once, or if
+    there was only supposed to be one girl and one boy thread executing in a
+    loop.
+
+The paper states:
+
+  The dating service matches girl and boy threads with matching compatibility
+  codes so they can exchange phone numbers.
+
+so there are many girl/boy threads. There is nothing preventing an individual
+girl/boy from arranging multiple dates.
+
+    Are the girl/boy threads supposed to invoke the girl/boy methods or vice
+    versa?
+
+As long as the girls/boys are consistent in the calls, it does not matter. The
+goal is to find a partner and exchange phone numbers.
+
+    Surely there is some easier way to set this up?
+
+There are some other solutions using monitors but they all have a similar
+structure.
+
+    The paper offered a number of comparisons to Go, C#, Scala, and so forth,
+    but seems to have overlooked another recent language, Rust. In many ways,
+    Rust seems to be closest in philosophy to Cforall, so it seems like an odd
+    omission. I already mentioned above that Rust is in the process of shipping
+    [async-await syntax][aa], which is definitely an alternative to the
+    generator/coroutine approach in Cforall (though one with clear pros/cons).
+
+We cannot get rust async-await example programs to compile nor does the select!
+macro compile.
+
+  @plg2[1]% rustc --version
+  rustc 1.40.0 (73528e339 2019-12-16)
+  
+  @plg2[2]% cat future.rs 
+  use futures::executor::block_on;
+  
+  async fn hello_world() {
+      println!("hello, world!");
+  }
+  
+  fn main() {
+      let future = hello_world(); // Nothing is printed
+      block_on(future); // `future` is run and "hello, world!" is printed
+  }
+  
+  @plg2[3]% rustc -C opt-level=3 future.rs
+  error[E0670]: `async fn` is not permitted in the 2015 edition
+   --> future.rs:3:1
+    |
+  3 | async fn hello_world() {
+    | ^^^^^
+  
+  error[E0433]: failed to resolve: maybe a missing crate `futures`?
+   --> future.rs:1:5
+    |
+  1 | use futures::executor::block_on;
+    |     ^^^^^^^ maybe a missing crate `futures`?
+  
+  error[E0425]: cannot find function `block_on` in this scope
+   --> future.rs:9:5
+    |
+  9 |     block_on(future); // `future` is run and "hello, world!" is printed
+    |     ^^^^^^^^ not found in this scope
+  
+  error: aborting due to 3 previous errors
+  
+  Some errors have detailed explanations: E0425, E0433, E0670.
+  For more information about an error, try `rustc --explain E0425`.
+
+
+    In the performance section in particular, you might consider comparing
+    against some of the Rust web servers and threading systems.
+
+This paper is not about building web-servers. Nor are web-servers a reasonable
+benchmark for language concurrency. Web-servers are a benchmark for the
+efficiency of the non-blocking I/O libraries provided by the underlying
+operating system. Our prior work on web-server performance:
+
+@inproceedings{Pariag07,
+    author	= {David Pariag and Tim Brecht and Ashif Harji and Peter Buhr and Amol Shukla},
+    title	= {Comparing the Performance of Web Server Architectures},
+    booktitle	= {Proceedings of the 2007 Eurosys conference},
+    month	= mar,
+    year	= 2007,
+    pages	= {231--243},
+}
+
+@inproceedings{Harji12,
+    author	= {Ashif S. Harji and Peter A. Buhr and Tim Brecht},
+    title	= {Comparing High-Performance Multi-core Web-Server Architectures},
+    booktitle	= {Proceedings of the 5th Annual International Systems and Storage Conference},
+    series	= {SYSTOR '12},
+    publisher	= {ACM},
+    address	= {New York, NY, USA},
+    location	= {Haifa, Israel},
+    month	= jun,
+    year	= 2012,
+    articleno  	= 1,
+    pages	= {1:1--1:12},
+}
+
+shows the steps to build a high-performance web-server, which are largely
+independent of the server architecture and programming language.
+
+    It would seem worth trying to compare their "context switching" costs as
+    well -- I believe both actix and tokio have a notion of threads that could
+    be readily compared.
+
+Again, context-switching speed is largely irrelevant because the amount of code
+to process an http request is large enough to push any concurrency costs into
+the background.
+
+    Another addition that might be worth considering is to compare against
+    node.js promises, although I think the comparison to process creation is
+    not as clean.
+
+Done.
