Index: benchmark/io/http/Makefile.am
===================================================================
--- benchmark/io/http/Makefile.am	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ benchmark/io/http/Makefile.am	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -21,5 +21,5 @@
 include $(top_srcdir)/tools/build/cfa.make
 
-AM_CFLAGS = -O3 -Wall -Wextra -I$(srcdir) -lrt -pthread # -Werror
+AM_CFLAGS = -O3 -Wall -Wextra -I$(srcdir) -lrt -pthread -g # -Werror
 AM_CFAFLAGS = -quiet -nodebug
 AM_LDFLAGS = -quiet -nodebug
@@ -37,6 +37,10 @@
 	options.cfa \
 	options.hfa \
+	printer.cfa \
+	printer.hfa \
 	protocol.cfa \
 	protocol.hfa \
+	socket.cfa \
+	socket.hfa \
 	worker.cfa \
 	worker.hfa
Index: benchmark/io/http/main.cfa
===================================================================
--- benchmark/io/http/main.cfa	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ benchmark/io/http/main.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -2,4 +2,5 @@
 
 #include <errno.h>
+#include <signal.h>
 #include <stdio.h>
 #include <string.h>
@@ -8,4 +9,5 @@
 	#include <sched.h>
 	#include <signal.h>
+	#include <sys/eventfd.h>
 	#include <sys/socket.h>
 	#include <netinet/in.h>
@@ -14,4 +16,5 @@
 #include <fstream.hfa>
 #include <kernel.hfa>
+#include <locks.hfa>
 #include <iofwd.hfa>
 #include <stats.hfa>
@@ -21,4 +24,6 @@
 #include "filecache.hfa"
 #include "options.hfa"
+#include "socket.hfa"
+#include "printer.hfa"
 #include "worker.hfa"
 
@@ -30,83 +35,6 @@
 
 //=============================================================================================
-// Stats Printer
-//============================================================================================='
-
-thread StatsPrinter {
-	Worker * workers;
-	int worker_cnt;
-};
-
-void ?{}( StatsPrinter & this, cluster & cl ) {
-	((thread&)this){ "Stats Printer Thread", cl };
-	this.worker_cnt = 0;
-}
-
-void ^?{}( StatsPrinter & mutex this ) {}
-
-#define eng3(X) (ws(3, 3, unit(eng( X ))))
-
-void main(StatsPrinter & this) {
-	LOOP: for() {
-		waitfor( ^?{} : this) {
-			break LOOP;
-		}
-		or else {}
-
-		sleep(10`s);
-
-		print_stats_now( *active_cluster(), CFA_STATS_READY_Q | CFA_STATS_IO );
-		if(this.worker_cnt != 0) {
-			uint64_t tries = 0;
-			uint64_t calls = 0;
-			uint64_t header = 0;
-			uint64_t splcin = 0;
-			uint64_t splcot = 0;
-			struct {
-				volatile uint64_t calls;
-				volatile uint64_t bytes;
-			} avgrd[zipf_cnts];
-			memset(avgrd, 0, sizeof(avgrd));
-
-			for(i; this.worker_cnt) {
-				tries += this.workers[i].stats.sendfile.tries;
-				calls += this.workers[i].stats.sendfile.calls;
-				header += this.workers[i].stats.sendfile.header;
-				splcin += this.workers[i].stats.sendfile.splcin;
-				splcot += this.workers[i].stats.sendfile.splcot;
-				for(j; zipf_cnts) {
-					avgrd[j].calls += this.workers[i].stats.sendfile.avgrd[j].calls;
-					avgrd[j].bytes += this.workers[i].stats.sendfile.avgrd[j].bytes;
-				}
-			}
-
-			double ratio = ((double)tries) / calls;
-
-			sout | "----- Worker Stats -----";
-			sout | "sendfile  : " | calls | "calls," | tries | "tries (" | ratio | " try/call)";
-			sout | "            " | header | "header," | splcin | "splice in," | splcot | "splice out";
-			sout | " - zipf sizes:";
-			for(i; zipf_cnts) {
-				double written = avgrd[i].calls > 0 ? ((double)avgrd[i].bytes) / avgrd[i].calls : 0;
-				sout | "        " | zipf_sizes[i] | "bytes," | avgrd[i].calls | "shorts," | written | "written";
-			}
-		}
-		else {
-			sout | "No Workers!";
-		}
-	}
-}
-
-//=============================================================================================
 // Globals
 //=============================================================================================
-struct ServerCluster {
-	cluster self;
-	processor    * procs;
-	// io_context   * ctxs;
-	StatsPrinter * prnt;
-
-};
-
 void ?{}( ServerCluster & this ) {
 	(this.self){ "Server Cluster", options.clopts.params };
@@ -122,18 +50,18 @@
 		(this.procs[i]){ "Benchmark Processor", this.self };
 
-		int c = 0;
-		int n = 1 + (i % cnt);
-		for(int j = 0; j < CPU_SETSIZE; j++) {
-			if(CPU_ISSET(j, &fullset)) n--;
-			if(n == 0) {
-				c = j;
-				break;
-			}
-		}
-		cpu_set_t localset;
-		CPU_ZERO(&localset);
-		CPU_SET(c, &localset);
-		ret = pthread_setaffinity_np(this.procs[i].kernel_thread, sizeof(localset), &localset);
-		if( ret != 0 ) abort | "sched_getaffinity failed with" | ret | strerror( ret );
+		// int c = 0;
+		// int n = 1 + (i % cnt);
+		// for(int j = 0; j < CPU_SETSIZE; j++) {
+		// 	if(CPU_ISSET(j, &fullset)) n--;
+		// 	if(n == 0) {
+		// 		c = j;
+		// 		break;
+		// 	}
+		// }
+		// cpu_set_t localset;
+		// CPU_ZERO(&localset);
+		// CPU_SET(c, &localset);
+		// ret = pthread_setaffinity_np(this.procs[i].kernel_thread, sizeof(localset), &localset);
+		// if( ret != 0 ) abort | "sched_getaffinity failed with" | ret | strerror( ret );
 
 		#if !defined(__CFA_NO_STATISTICS__)
@@ -147,11 +75,4 @@
 	}
 
-	if(options.stats) {
-		this.prnt = alloc();
-		(*this.prnt){ this.self };
-	} else {
-		this.prnt = 0p;
-	}
-
 	#if !defined(__CFA_NO_STATISTICS__)
 		print_stats_at_exit( this.self, CFA_STATS_READY_Q | CFA_STATS_IO );
@@ -163,6 +84,4 @@
 
 void ^?{}( ServerCluster & this ) {
-	delete(this.prnt);
-
 	for(i; options.clopts.nprocs) {
 		^(this.procs[i]){};
@@ -175,4 +94,26 @@
 extern void init_protocol(void);
 extern void deinit_protocol(void);
+
+//=============================================================================================
+// REUSEPORT
+//=============================================================================================
+
+size_t sockarr_size;
+struct __attribute__((aligned(128))) Q {
+	mpsc_queue(PendingRead) q;
+};
+
+//=============================================================================================
+// Termination
+//=============================================================================================
+
+int closefd;
+void cleanstop(int) {
+	eventfd_t buffer = 1;
+	char * buffer_s = (char*)&buffer;
+	int ret = write(closefd, buffer_s, sizeof(buffer));
+	if(ret < 0) abort( "eventfd write error: (%d) %s\n", (int)errno, strerror(errno) );
+	return;
+}
 
 //=============================================================================================
@@ -180,4 +121,5 @@
 //============================================================================================='
 int main( int argc, char * argv[] ) {
+	int ret;
 	__sighandler_t s = 1p;
 	signal(SIGPIPE, s);
@@ -186,4 +128,18 @@
 	// Parse args
 	parse_options(argc, argv);
+
+	//===================
+	// Setup non-interactive termination
+	if(!options.interactive) {
+		closefd = eventfd(0, 0);
+		if(closefd < 0) abort( "eventfd error: (%d) %s\n", (int)errno, strerror(errno) );
+
+		sighandler_t prev = signal(SIGTERM, cleanstop);
+		intptr_t prev_workaround = (intptr_t) prev;
+		// can't use SIG_ERR it crashes the compiler
+		if(prev_workaround == -1) abort( "signal setup error: (%d) %s\n", (int)errno, strerror(errno) );
+
+		sout | "Signal termination ready";
+	}
 
 	//===================
@@ -197,46 +153,9 @@
 	// Open Socket
 	sout | getpid() | ": Listening on port" | options.socket.port;
-	int server_fd = socket(AF_INET, SOCK_STREAM, 0);
-	if(server_fd < 0) {
-		abort( "socket error: (%d) %s\n", (int)errno, strerror(errno) );
-	}
-
-	int ret = 0;
+
 	struct sockaddr_in address;
-	int addrlen = sizeof(address);
-	memset( (char *)&address, '\0' );
-	address.sin_family = AF_INET;
-	address.sin_addr.s_addr = htonl(INADDR_ANY);
-	address.sin_port = htons( options.socket.port );
-
-	int waited = 0;
-	for() {
-		int sockfd = server_fd;
-		__CONST_SOCKADDR_ARG addr;
-		addr.__sockaddr__ = (struct sockaddr *)&address;
-		socklen_t addrlen = sizeof(address);
-		ret = bind( sockfd, addr, addrlen );
-		if(ret < 0) {
-			if(errno == EADDRINUSE) {
-				if(waited == 0) {
-					if(!options.interactive) abort | "Port already in use in non-interactive mode. Aborting";
-					sout | "Waiting for port";
-				} else {
-					sout | "\r" | waited | nonl;
-					flush( sout );
-				}
-				waited ++;
-				sleep( 1`s );
-				continue;
-			}
-			abort( "bind error: (%d) %s\n", (int)errno, strerror(errno) );
-		}
-		break;
-	}
-
-	ret = listen( server_fd, options.socket.backlog );
-	if(ret < 0) {
-		abort( "listen error: (%d) %s\n", (int)errno, strerror(errno) );
-	}
+	int addrlen = prepaddr(address);
+
+	int server_fd;
 
 	//===================
@@ -257,26 +176,73 @@
 
 		{
+			// Stats printer makes a copy so this needs to persist longer than normal
+			connection ** conns;
+			AcceptWorker  * aworkers = 0p;
+			ChannelWorker * cworkers = 0p;
+			Acceptor * acceptors = 0p;
+			Q * queues = 0p;
 			ServerCluster cl[options.clopts.nclusters];
+
+			if(options.stats) {
+				stats_thrd = alloc();
+				(*stats_thrd){ cl };
+			} else {
+				stats_thrd = 0p;
+			}
 
 			init_protocol();
 			{
-				Worker * workers = anew(options.clopts.nworkers);
-				cl[0].prnt->workers = workers;
-				cl[0].prnt->worker_cnt = options.clopts.nworkers;
-				for(i; options.clopts.nworkers) {
-					// if( options.file_cache.fixed_fds ) {
-					// 	workers[i].pipe[0] = pipe_off + (i * 2) + 0;
-					// 	workers[i].pipe[1] = pipe_off + (i * 2) + 1;
-					// }
-					// else
-					{
-						workers[i].pipe[0] = fds[pipe_off + (i * 2) + 0];
-						workers[i].pipe[1] = fds[pipe_off + (i * 2) + 1];
-						workers[i].sockfd  = server_fd;
-						workers[i].addr    = (struct sockaddr *)&address;
-						workers[i].addrlen = (socklen_t*)&addrlen;
-						workers[i].flags   = 0;
-					}
-					unpark( workers[i] );
+				int nacceptors = options.clopts.nprocs * options.clopts.nclusters;
+				conns = alloc(options.clopts.nworkers);
+				if(options.socket.reuseport) {
+					queues = alloc(nacceptors);
+					acceptors = alloc(nacceptors);
+					sout | "Creating" | nacceptors | "Acceptors";
+					for(i; nacceptors) {
+						(acceptors[i]){ i % options.clopts.nclusters };
+					}
+					for(i; nacceptors) {
+						(queues[i]){};
+						{
+							acceptors[i].sockfd  = listener(address, addrlen);
+							acceptors[i].addr    = (struct sockaddr *)&address;
+							acceptors[i].addrlen = (socklen_t*)&addrlen;
+							acceptors[i].flags   = 0;
+							acceptors[i].queue   = &queues[i].q;
+						}
+						unpark( acceptors[i] );
+					}
+
+					cworkers = anew(options.clopts.nworkers);
+					for(i; options.clopts.nworkers) {
+						{
+							cworkers[i].conn.pipe[0] = fds[pipe_off + (i * 2) + 0];
+							cworkers[i].conn.pipe[1] = fds[pipe_off + (i * 2) + 1];
+							cworkers[i].queue = &queues[i % nacceptors].q;
+							conns[i] = &cworkers[i].conn;
+						}
+						unpark( cworkers[i] );
+					}
+				}
+				else {
+					server_fd = listener(address, addrlen);
+					aworkers = anew(options.clopts.nworkers);
+					for(i; options.clopts.nworkers) {
+						// if( options.file_cache.fixed_fds ) {
+						// 	workers[i].pipe[0] = pipe_off + (i * 2) + 0;
+						// 	workers[i].pipe[1] = pipe_off + (i * 2) + 1;
+						// }
+						// else
+						{
+							aworkers[i].conn.pipe[0] = fds[pipe_off + (i * 2) + 0];
+							aworkers[i].conn.pipe[1] = fds[pipe_off + (i * 2) + 1];
+							aworkers[i].sockfd = server_fd;
+							aworkers[i].addr    = (struct sockaddr *)&address;
+							aworkers[i].addrlen = (socklen_t*)&addrlen;
+							aworkers[i].flags   = 0;
+							conns[i] = &aworkers[i].conn;
+						}
+						unpark( aworkers[i] );
+					}
 				}
 				sout | options.clopts.nworkers | "workers started on" | options.clopts.nprocs | "processors /" | options.clopts.nclusters | "clusters";
@@ -285,14 +251,20 @@
 				}
 				sout | nl;
-				if(!options.interactive) park();
 				{
-					char buffer[128];
-					for() {
-						int ret = cfa_read(0, buffer, 128, 0);
-						if(ret == 0) break;
+					if(options.interactive) {
+						char buffer[128];
+						for() {
+							int ret = cfa_read(0, buffer, 128, 0);
+							if(ret == 0) break;
+							if(ret < 0) abort( "main read error: (%d) %s\n", (int)errno, strerror(errno) );
+							sout | "User wrote '" | "" | nonl;
+							write(sout, buffer, ret - 1);
+							sout | "'";
+						}
+					}
+					else {
+						char buffer[sizeof(eventfd_t)];
+						int ret = cfa_read(closefd, buffer, sizeof(eventfd_t), 0);
 						if(ret < 0) abort( "main read error: (%d) %s\n", (int)errno, strerror(errno) );
-						sout | "User wrote '" | "" | nonl;
-						write(sout, buffer, ret - 1);
-						sout | "'";
 					}
 
@@ -300,28 +272,88 @@
 				}
 
-				sout | "Notifying connections..." | nonl; flush( sout );
-				for(i; options.clopts.nworkers) {
-					workers[i].done = true;
-				}
-				sout | "done";
-
-				sout | "Shutting down socket..." | nonl; flush( sout );
-				int ret = shutdown( server_fd, SHUT_RD );
-				if( ret < 0 ) {
-					abort( "shutdown error: (%d) %s\n", (int)errno, strerror(errno) );
-				}
-				sout | "done";
-
 				//===================
-				// Close Socket
-				sout | "Closing Socket..." | nonl; flush( sout );
-				ret = close( server_fd );
-				if(ret < 0) {
-					abort( "close socket error: (%d) %s\n", (int)errno, strerror(errno) );
-				}
-				sout | "done";
-
-				sout | "Stopping connection threads..." | nonl; flush( sout );
-				adelete(workers);
+				// Close Socket and join
+				if(options.socket.reuseport) {
+					sout | "Notifying connections..." | nonl; flush( sout );
+					for(i; nacceptors) {
+						acceptors[i].done = true;
+					}
+					for(i; options.clopts.nworkers) {
+						cworkers[i].done = true;
+					}
+					sout | "done";
+
+					sout | "Shutting down Socket..." | nonl; flush( sout );
+					for(i; nacceptors) {
+						ret = shutdown( acceptors[i].sockfd, SHUT_RD );
+						if( ret < 0 ) {
+							abort( "shutdown1 error: (%d) %s\n", (int)errno, strerror(errno) );
+						}
+					}
+					sout | "done";
+
+					sout | "Closing Socket..." | nonl; flush( sout );
+					for(i; nacceptors) {
+						ret = close( acceptors[i].sockfd );
+						if( ret < 0) {
+							abort( "close socket error: (%d) %s\n", (int)errno, strerror(errno) );
+						}
+					}
+					sout | "done";
+
+					sout | "Stopping accept threads..." | nonl; flush( sout );
+					for(i; nacceptors) {
+						join(acceptors[i]);
+					}
+					sout | "done";
+
+					sout | "Draining worker queues..." | nonl; flush( sout );
+					for(i; nacceptors) {
+						PendingRead * p = 0p;
+						while(p = pop(queues[i].q)) {
+							fulfil(p->f, -ECONNRESET);
+						}
+					}
+					sout | "done";
+
+					sout | "Stopping worker threads..." | nonl; flush( sout );
+					for(i; options.clopts.nworkers) {
+						for(j; 2) {
+							ret = close(cworkers[i].conn.pipe[j]);
+							if(ret < 0) abort( "close pipe %d error: (%d) %s\n", j, (int)errno, strerror(errno) );
+						}
+						join(cworkers[i]);
+					}
+				}
+				else {
+					sout | "Notifying connections..." | nonl; flush( sout );
+					for(i; options.clopts.nworkers) {
+						aworkers[i].done = true;
+					}
+					sout | "done";
+
+					sout | "Shutting down Socket..." | nonl; flush( sout );
+					ret = shutdown( server_fd, SHUT_RD );
+					if( ret < 0 ) {
+						abort( "shutdown2 error: (%d) %s\n", (int)errno, strerror(errno) );
+					}
+					sout | "done";
+
+					sout | "Closing Socket..." | nonl; flush( sout );
+					ret = close( server_fd );
+					if(ret < 0) {
+						abort( "close socket error: (%d) %s\n", (int)errno, strerror(errno) );
+					}
+					sout | "done";
+
+					sout | "Stopping connection threads..." | nonl; flush( sout );
+					for(i; options.clopts.nworkers) {
+						for(j; 2) {
+							ret = close(aworkers[i].conn.pipe[j]);
+							if(ret < 0) abort( "close pipe %d error: (%d) %s\n", j, (int)errno, strerror(errno) );
+						}
+						join(aworkers[i]);
+					}
+				}
 			}
 			sout | "done";
@@ -331,17 +363,23 @@
 			sout | "done";
 
+			sout | "Stopping printer threads..." | nonl; flush( sout );
+			if(stats_thrd) {
+				notify_one(stats_thrd->var);
+			}
+			delete(stats_thrd);
+			sout | "done";
+
+			// Now that the stats printer is stopped, we can reclaim this
+			adelete(aworkers);
+			adelete(cworkers);
+			adelete(acceptors);
+			adelete(queues);
+			free(conns);
+
 			sout | "Stopping processors/clusters..." | nonl; flush( sout );
 		}
 		sout | "done";
 
-		sout | "Closing splice fds..." | nonl; flush( sout );
-		for(i; pipe_cnt) {
-			ret = close( fds[pipe_off + i] );
-			if(ret < 0) {
-				abort( "close pipe error: (%d) %s\n", (int)errno, strerror(errno) );
-			}
-		}
 		free(fds);
-		sout | "done";
 
 		sout | "Stopping processors..." | nonl; flush( sout );
Index: benchmark/io/http/options.cfa
===================================================================
--- benchmark/io/http/options.cfa	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ benchmark/io/http/options.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -35,7 +35,8 @@
 
 	{ // socket
-		8080, // port
-		10,   // backlog
-		1024  // buflen
+		8080,  // port
+		10,    // backlog
+		1024,  // buflen
+		false  // reuseport
 	},
 
@@ -52,7 +53,4 @@
 
 void parse_options( int argc, char * argv[] ) {
-	// bool fixedfd = false;
-	// bool sqkpoll = false;
-	// bool iokpoll = false;
 	unsigned nentries = 0;
 	bool isolate = false;
@@ -70,4 +68,5 @@
 		{'\0', "shell",          "Disable interactive mode", options.interactive, parse_setfalse},
 		{'\0', "accept-backlog", "Maximum number of pending accepts", options.socket.backlog},
+		{'\0', "reuseport",      "Use acceptor threads with reuse port SO_REUSEPORT", options.socket.reuseport, parse_settrue},
 		{'\0', "request_len",    "Maximum number of bytes in the http request, requests with more data will be answered with Http Code 414", options.socket.buflen},
 		{'\0', "seed",           "seed to use for hashing", options.file_cache.hash_seed },
Index: benchmark/io/http/options.hfa
===================================================================
--- benchmark/io/http/options.hfa	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ benchmark/io/http/options.hfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -27,4 +27,5 @@
 		int backlog;
 		int buflen;
+		bool reuseport;
 	} socket;
 
Index: benchmark/io/http/printer.cfa
===================================================================
--- benchmark/io/http/printer.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
+++ benchmark/io/http/printer.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -0,0 +1,107 @@
+#include "printer.hfa"
+#include "options.hfa"
+
+#include <fstream.hfa>
+#include <stats.hfa>
+
+void ?{}( sendfile_stats_t & this ) {
+	this.maxfd = 0;
+	this.close = 0;
+	this.calls = 0;
+	this.tries = 0;
+	this.header = 0;
+	this.splcin = 0;
+	this.splcot = 0;
+	for(i; zipf_cnts) {
+		this.avgrd[i].calls = 0;
+		this.avgrd[i].bytes = 0;
+	};
+}
+
+void push(sendfile_stats_t & from, sendfile_stats_t & to) {
+	to.maxfd = max( to.maxfd, from.maxfd);
+	to.close += from.close; from.close = 0;
+	to.calls += from.calls; from.calls = 0;
+	to.tries += from.tries; from.tries = 0;
+	to.header += from.header; from.header = 0;
+	to.splcin += from.splcin; from.splcin = 0;
+	to.splcot += from.splcot; from.splcot = 0;
+	for(i; zipf_cnts) {
+		to.avgrd[i].calls += from.avgrd[i].calls; from.avgrd[i].calls = 0;
+		to.avgrd[i].bytes += from.avgrd[i].bytes; from.avgrd[i].bytes = 0;
+	};
+}
+
+void ?{}( acceptor_stats_t & this ) {
+	this.creates = 0;
+	this.accepts = 0;
+	this.eagains = 0;
+}
+
+void push(acceptor_stats_t & from, acceptor_stats_t & to) {
+	to.creates += from.creates; from.creates = 0;
+	to.accepts += from.accepts; from.accepts = 0;
+	to.eagains += from.eagains; from.eagains = 0;
+}
+
+void ?{}( StatsPrinter & this, ServerCluster * cl ) {
+	((thread&)this){ "Stats Printer Thread" };
+	this.cl = cl;
+	memset(&this.stats, 0, sizeof(this.stats));;
+}
+
+void ^?{}( StatsPrinter & mutex this ) {}
+
+#define eng3(X) (ws(3, 3, unit(eng( X ))))
+
+void main(StatsPrinter & this) {
+	LOOP: for() {
+		wait(this.var, 10`s);
+
+		for(i; options.clopts.nclusters) print_stats_now( this.cl[i].self, CFA_STATS_READY_Q | CFA_STATS_IO );
+		{
+			struct {
+				volatile uint64_t calls;
+				volatile uint64_t bytes;
+			} avgrd[zipf_cnts];
+			memset(avgrd, 0, sizeof(avgrd));
+
+			uint64_t tries = this.stats.send.tries;
+			uint64_t calls  = this.stats.send.calls;
+			uint64_t header = this.stats.send.header;
+			uint64_t splcin = this.stats.send.splcin;
+			uint64_t splcot = this.stats.send.splcot;
+			for(j; zipf_cnts) {
+				avgrd[j].calls += this.stats.send.avgrd[j].calls;
+				avgrd[j].bytes += this.stats.send.avgrd[j].bytes;
+			}
+
+			double ratio = ((double)tries) / calls;
+			uint64_t accp_open = this.stats.accpt.accepts;
+			uint64_t accp_clse = this.stats.send.close;
+			uint64_t accp_live = accp_open - accp_clse;
+
+			sout | "----- Acceptor Stats -----";
+			sout | "accepts : " | eng3(accp_open) |"opened," | eng3(accp_clse) |"closed," | eng3(accp_live) |"live";
+			sout | "accept  : " | eng3(this.stats.accpt.accepts) | "calls," | eng3(this.stats.accpt.eagains) | "eagains," | eng3(this.stats.accpt.creates) | " thrds";
+			sout | nl;
+
+			sout | "----- Connection Stats -----";
+			sout | "max fd    : " | this.stats.send.maxfd;
+			sout | "sendfile  : " | eng3(calls) | "calls," | eng3(tries) | "tries (" | ratio | " try/call)";
+			sout | "            " | eng3(header) | "header," | eng3(splcin) | "splice in," | eng3(splcot) | "splice out";
+			sout | " - zipf sizes:";
+			for(i; zipf_cnts) {
+				double written = avgrd[i].calls > 0 ? ((double)avgrd[i].bytes) / avgrd[i].calls : 0;
+				sout | "        " | zipf_sizes[i] | "bytes," | avgrd[i].calls | "shorts," | written | "written";
+			}
+		}
+
+		waitfor( ^?{} : this) {
+			break LOOP;
+		}
+		or else {}
+	}
+}
+
+StatsPrinter * stats_thrd;
Index: benchmark/io/http/printer.hfa
===================================================================
--- benchmark/io/http/printer.hfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
+++ benchmark/io/http/printer.hfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -0,0 +1,57 @@
+#pragma once
+
+#include <stdint.h>
+
+#include <locks.hfa>
+#include <thread.hfa>
+
+extern const size_t zipf_sizes[];
+enum { zipf_cnts = 36, };
+
+struct sendfile_stats_t {
+	volatile uint64_t maxfd;
+	volatile uint64_t close;
+	volatile uint64_t calls;
+	volatile uint64_t tries;
+	volatile uint64_t header;
+	volatile uint64_t splcin;
+	volatile uint64_t splcot;
+	struct {
+		volatile uint64_t calls;
+		volatile uint64_t bytes;
+	} avgrd[zipf_cnts];
+};
+
+void ?{}( sendfile_stats_t & this );
+
+void push(sendfile_stats_t & from, sendfile_stats_t & to);
+
+struct acceptor_stats_t {
+	volatile uint64_t creates;
+	volatile uint64_t accepts;
+	volatile uint64_t eagains;
+};
+
+void ?{}( acceptor_stats_t & this );
+
+void push(acceptor_stats_t & from, acceptor_stats_t & to);
+
+struct ServerCluster {
+	cluster self;
+	processor    * procs;
+};
+
+thread StatsPrinter {
+	struct {
+		__spinlock_t lock;
+		sendfile_stats_t send;
+		acceptor_stats_t accpt;
+	} stats;
+	condition_variable(fast_block_lock) var;
+	ServerCluster * cl;
+};
+
+void ?{}( StatsPrinter & this, ServerCluster * cl );
+void ^?{}( StatsPrinter & mutex this );
+
+extern StatsPrinter * stats_thrd;
Index: benchmark/io/http/protocol.cfa
===================================================================
--- benchmark/io/http/protocol.cfa	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ benchmark/io/http/protocol.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -30,4 +30,13 @@
 #define PLAINTEXT_NOCOPY
 #define LINKED_IO
+
+static inline __s32 wait_res( io_future_t & this ) {
+	wait( this );
+	if( this.result < 0 ) {{
+		errno = -this.result;
+		return -1;
+	}}
+	return this.result;
+}
 
 struct https_msg_str {
@@ -470,4 +479,5 @@
 
 			if(is_error(splice_in.res)) {
+				if(splice_in.res.error == -EPIPE) return -ECONNRESET;
 				mutex(serr) serr | "SPLICE IN failed with" | splice_in.res.error;
 				close(fd);
@@ -503,5 +513,5 @@
 }
 
-[HttpCode code, bool closed, * const char file, size_t len] http_read(int fd, []char buffer, size_t len) {
+[HttpCode code, bool closed, * const char file, size_t len] http_read(volatile int & fd, []char buffer, size_t len, io_future_t * f) {
 	char * it = buffer;
 	size_t count = len - 1;
@@ -509,5 +519,12 @@
 	READ:
 	for() {
-		int ret = cfa_recv(fd, (void*)it, count, 0, CFA_IO_LAZY);
+		int ret;
+		if( f ) {
+			ret = wait_res(*f);
+			reset(*f);
+			f = 0p;
+		} else {
+			ret = cfa_recv(fd, (void*)it, count, 0, CFA_IO_LAZY);
+		}
 		// int ret = read(fd, (void*)it, count);
 		if(ret == 0 ) return [OK200, true, 0, 0];
@@ -570,5 +587,5 @@
 
 void ?{}( DateFormater & this ) {
-	((thread&)this){ "Server Date Thread", *options.clopts.instance[0] };
+	((thread&)this){ "Server Date Thread" };
 	this.idx = 0;
 	memset( &this.buffers[0], 0, sizeof(this.buffers[0]) );
Index: benchmark/io/http/protocol.hfa
===================================================================
--- benchmark/io/http/protocol.hfa	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ benchmark/io/http/protocol.hfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -1,4 +1,5 @@
 #pragma once
 
+struct io_future_t;
 struct sendfile_stats_t;
 
@@ -22,3 +23,3 @@
 int answer_sendfile( int pipe[2], int fd, int ans_fd, size_t count, struct sendfile_stats_t & );
 
-[HttpCode code, bool closed, * const char file, size_t len] http_read(int fd, []char buffer, size_t len);
+[HttpCode code, bool closed, * const char file, size_t len] http_read(volatile int & fd, []char buffer, size_t len, io_future_t * f);
Index: benchmark/io/http/socket.cfa
===================================================================
--- benchmark/io/http/socket.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
+++ benchmark/io/http/socket.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -0,0 +1,74 @@
+#define _GNU_SOURCE
+
+#include "socket.hfa"
+
+#include <errno.h>
+#include <string.h>
+extern "C" {
+	#include <sys/socket.h>
+	#include <netinet/in.h>
+}
+
+#include <fstream.hfa>
+#include <time.hfa>
+#include <thread.hfa>
+
+#include "options.hfa"
+
+int prepaddr(struct sockaddr_in & address) {
+	int addrlen = sizeof(address);
+	memset( (char *)&address, '\0', addrlen );
+	address.sin_family = AF_INET;
+	address.sin_addr.s_addr = htonl(INADDR_ANY);
+	address.sin_port = htons( options.socket.port );
+	return addrlen;
+}
+
+int listener(struct sockaddr_in & address, int addrlen) {
+	int type = SOCK_STREAM;
+	// if(options.socket.reuseport) type |= SOCK_NONBLOCK;
+	int sockfd = socket(AF_INET, type, 0);
+	if(sockfd < 0) {
+		abort( "socket error: (%d) %s\n", (int)errno, strerror(errno) );
+	}
+
+	if(options.socket.reuseport) {
+		int value = 1;
+		// if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (const void*)&on, sizeof(on)))
+		// 	abort( "setsockopt SO_REUSEADDR error: (%d) %s\n", (int)errno, strerror(errno) );
+		if (setsockopt(sockfd, SOL_SOCKET, SO_REUSEPORT, &value, sizeof(int)) < 0)
+			abort( "setsockopt SO_REUSEPORT error: (%d) %s\n", (int)errno, strerror(errno) );
+	}
+
+	int ret = 0;
+	int waited = 0;
+	for() {
+		__CONST_SOCKADDR_ARG addr;
+		addr.__sockaddr__ = (struct sockaddr *)&address;
+		socklen_t addrlen = sizeof(address);
+		ret = bind( sockfd, addr, addrlen );
+		if(ret < 0) {
+			if(errno == EADDRINUSE) {
+				if(waited == 0) {
+					if(!options.interactive) abort | "Port already in use in non-interactive mode. Aborting";
+					sout | "Waiting for port";
+				} else {
+					sout | "\r" | waited | nonl;
+					flush( sout );
+				}
+				waited ++;
+				sleep( 1`s );
+				continue;
+			}
+			abort( "bind error: (%d) %s\n", (int)errno, strerror(errno) );
+		}
+		break;
+	}
+
+	ret = listen( sockfd, options.socket.backlog );
+	if(ret < 0) {
+		abort( "listen error: (%d) %s\n", (int)errno, strerror(errno) );
+	}
+
+	return sockfd;
+}
Index: benchmark/io/http/socket.hfa
===================================================================
--- benchmark/io/http/socket.hfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
+++ benchmark/io/http/socket.hfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -0,0 +1,5 @@
+#pragma once
+
+struct sockaddr_in;
+int prepaddr(struct sockaddr_in & addr);
+int listener(struct sockaddr_in & address, int addrlen);
Index: benchmark/io/http/worker.cfa
===================================================================
--- benchmark/io/http/worker.cfa	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ benchmark/io/http/worker.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -8,4 +8,5 @@
 #include <fstream.hfa>
 #include <iofwd.hfa>
+#include <mutex_stmt.hfa>
 
 #include "options.hfa"
@@ -14,38 +15,119 @@
 
 //=============================================================================================
-// Worker Thread
-//=============================================================================================
-void ?{}( Worker & this ) {
+// Generic connection handling
+//=============================================================================================
+static void handle_connection( connection & this, volatile int & fd, char * buffer, size_t len, io_future_t * f, unsigned long long & last ) {
+	REQUEST:
+	for() {
+		bool closed;
+		HttpCode code;
+		const char * file;
+		size_t name_size;
+
+		// Read the http request
+		if( options.log ) mutex(sout) sout | "=== Reading request ===";
+		[code, closed, file, name_size] = http_read(fd, buffer, len, f);
+		f = 0p;
+
+		// if we are done, break out of the loop
+		if( closed ) break REQUEST;
+
+		// If this wasn't a request retrun 400
+		if( code != OK200 ) {
+			sout | "=== Invalid Request :" | code_val(code) | "===";
+			answer_error(fd, code);
+			continue REQUEST;
+		}
+
+		if(0 == strncmp(file, "plaintext", min(name_size, sizeof("plaintext") ))) {
+			if( options.log ) mutex(sout) sout | "=== Request for /plaintext ===";
+
+			int ret = answer_plaintext(fd);
+			if( ret == -ECONNRESET ) break REQUEST;
+
+			if( options.log ) mutex(sout) sout | "=== Answer sent ===";
+			continue REQUEST;
+		}
+
+		if(0 == strncmp(file, "ping", min(name_size, sizeof("ping") ))) {
+			if( options.log ) mutex(sout) sout | "=== Request for /ping ===";
+
+			// Send the header
+			int ret = answer_empty(fd);
+			if( ret == -ECONNRESET ) break REQUEST;
+
+			if( options.log ) mutex(sout) sout | "=== Answer sent ===";
+			continue REQUEST;
+		}
+
+		if( options.log ) {
+			sout | "=== Request for file " | nonl;
+			write(sout, file, name_size);
+			sout | " ===";
+		}
+
+		if( !options.file_cache.path ) {
+			if( options.log ) {
+				sout | "=== File Not Found (" | nonl;
+				write(sout, file, name_size);
+				sout | ") ===";
+			}
+			answer_error(fd, E405);
+			continue REQUEST;
+		}
+
+		// Get the fd from the file cache
+		int ans_fd;
+		size_t count;
+		[ans_fd, count] = get_file( file, name_size );
+
+		// If we can't find the file, return 404
+		if( ans_fd < 0 ) {
+			if( options.log ) {
+				sout | "=== File Not Found (" | nonl;
+				write(sout, file, name_size);
+				sout | ") ===";
+			}
+			answer_error(fd, E404);
+			continue REQUEST;
+		}
+
+		// Send the desired file
+		int ret = answer_sendfile( this.pipe, fd, ans_fd, count, this.stats.sendfile );
+		if( ret == -ECONNRESET ) break REQUEST;
+
+		if( options.log ) mutex(sout) sout | "=== Answer sent ===";
+	}
+
+	if (stats_thrd) {
+		unsigned long long next = rdtscl();
+		if(next > (last + 500000000)) {
+			if(try_lock(stats_thrd->stats.lock __cfaabi_dbg_ctx2)) {
+				push(this.stats.sendfile, stats_thrd->stats.send);
+				unlock(stats_thrd->stats.lock);
+				last = next;
+			}
+		}
+	}
+}
+
+//=============================================================================================
+// Self Accepting Worker Thread
+//=============================================================================================
+void ?{}( AcceptWorker & this ) {
 	size_t cli = rand() % options.clopts.cltr_cnt;
 	((thread&)this){ "Server Worker Thread", *options.clopts.instance[cli], 64000 };
 	options.clopts.thrd_cnt[cli]++;
-	this.pipe[0] = -1;
-	this.pipe[1] = -1;
 	this.done = false;
-
-	this.stats.sendfile.calls = 0;
-	this.stats.sendfile.tries = 0;
-	this.stats.sendfile.header = 0;
-	this.stats.sendfile.splcin = 0;
-	this.stats.sendfile.splcot = 0;
-	for(i; zipf_cnts) {
-		this.stats.sendfile.avgrd[i].calls = 0;
-		this.stats.sendfile.avgrd[i].bytes = 0;
-	}
-}
-
-extern "C" {
-extern int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
-}
-
-void main( Worker & this ) {
+}
+
+void main( AcceptWorker & this ) {
 	park();
-	/* paranoid */ assert( this.pipe[0] != -1 );
-	/* paranoid */ assert( this.pipe[1] != -1 );
-
-	CONNECTION:
-	for() {
-		if( options.log ) sout | "=== Accepting connection ===";
-		int fd = cfa_accept4( this.[sockfd, addr, addrlen, flags], CFA_IO_LAZY );
+	unsigned long long last = rdtscl();
+	/* paranoid */ assert( this.conn.pipe[0] != -1 );
+	/* paranoid */ assert( this.conn.pipe[1] != -1 );
+	for() {
+		if( options.log ) mutex(sout) sout | "=== Accepting connection ===";
+		int fd = cfa_accept4( this.sockfd, this.[addr, addrlen, flags], CFA_IO_LAZY );
 		if(fd < 0) {
 			if( errno == ECONNABORTED ) break;
@@ -55,90 +137,185 @@
 		if(this.done) break;
 
+		if( options.log ) mutex(sout) sout | "=== New connection" | fd | "" | ", waiting for requests ===";
+		size_t len = options.socket.buflen;
+		char buffer[len];
+		handle_connection( this.conn, fd, buffer, len, 0p, last );
+
+		if( options.log ) mutex(sout) sout | "=== Connection closed ===";
+	}
+}
+
+
+//=============================================================================================
+// Channel Worker Thread
+//=============================================================================================
+void ?{}( ChannelWorker & this ) {
+	size_t cli = rand() % options.clopts.cltr_cnt;
+	((thread&)this){ "Server Worker Thread", *options.clopts.instance[cli], 64000 };
+	options.clopts.thrd_cnt[cli]++;
+	this.done = false;
+}
+
+void main( ChannelWorker & this ) {
+	park();
+	unsigned long long last = rdtscl();
+	/* paranoid */ assert( this.conn.pipe[0] != -1 );
+	/* paranoid */ assert( this.conn.pipe[1] != -1 );
+	for() {
+		size_t len = options.socket.buflen;
+		char buffer[len];
+		PendingRead p;
+		p.next = 0p;
+		p.in.buf = (void*)buffer;
+		p.in.len = len;
+		push(*this.queue, &p);
+
+		if( options.log ) mutex(sout) sout | "=== Waiting new connection ===";
+		handle_connection( this.conn, p.out.fd, buffer, len, &p.f, last );
+
+		if( options.log ) mutex(sout) sout | "=== Connection closed ===";
+		if(this.done) break;
+	}
+}
+
+extern "C" {
+extern int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
+}
+
+void ?{}( Acceptor & this, int cli ) {
+	((thread&)this){ "Server Acceptor Thread", *options.clopts.instance[cli], 64000 };
+	options.clopts.thrd_cnt[cli]++;
+	this.done = false;
+}
+
+static inline __s32 get_res( io_future_t & this ) {
+	if( this.result < 0 ) {{
+		errno = -this.result;
+		return -1;
+	}}
+	return this.result;
+}
+
+static inline void push_connection( Acceptor & this, int fd ) {
+	PendingRead * p = 0p;
+	for() {
+		if(this.done) return;
+		p = pop(*this.queue);
+		if(p) break;
+		yield();
+		this.stats.creates++;
+	};
+
+	p->out.fd = fd;
+	async_recv(p->f, p->out.fd, p->in.buf, p->in.len, 0, CFA_IO_LAZY);
+}
+
+// #define ACCEPT_SPIN
+#define ACCEPT_MANY
+
+void main( Acceptor & this ) {
+	park();
+	unsigned long long last = rdtscl();
+
+#if defined(ACCEPT_SPIN)
+	if( options.log ) sout | "=== Accepting connection ===";
+	for() {
+		int fd = accept4(this.sockfd, this.[addr, addrlen, flags]);
+		if(fd < 0) {
+			if( errno == EWOULDBLOCK) {
+				this.stats.eagains++;
+				yield();
+				continue;
+			}
+			if( errno == ECONNABORTED ) break;
+			if( this.done && (errno == EINVAL || errno == EBADF) ) break;
+			abort( "accept error: (%d) %s\n", (int)errno, strerror(errno) );
+		}
+		this.stats.accepts++;
+
+		if(this.done) return;
+
 		if( options.log ) sout | "=== New connection" | fd | "" | ", waiting for requests ===";
-		REQUEST:
-		for() {
-			bool closed;
-			HttpCode code;
-			const char * file;
-			size_t name_size;
-
-			// Read the http request
-			size_t len = options.socket.buflen;
-			char buffer[len];
-			if( options.log ) sout | "=== Reading request ===";
-			[code, closed, file, name_size] = http_read(fd, buffer, len);
-
-			// if we are done, break out of the loop
-			if( closed ) break REQUEST;
-
-			// If this wasn't a request retrun 400
-			if( code != OK200 ) {
-				sout | "=== Invalid Request :" | code_val(code) | "===";
-				answer_error(fd, code);
-				continue REQUEST;
-			}
-
-			if(0 == strncmp(file, "plaintext", min(name_size, sizeof("plaintext") ))) {
-				if( options.log ) sout | "=== Request for /plaintext ===";
-
-				int ret = answer_plaintext(fd);
-				if( ret == -ECONNRESET ) break REQUEST;
-
-				if( options.log ) sout | "=== Answer sent ===";
-				continue REQUEST;
-			}
-
-			if(0 == strncmp(file, "ping", min(name_size, sizeof("ping") ))) {
-				if( options.log ) sout | "=== Request for /ping ===";
-
-				// Send the header
-				int ret = answer_empty(fd);
-				if( ret == -ECONNRESET ) break REQUEST;
-
-				if( options.log ) sout | "=== Answer sent ===";
-				continue REQUEST;
-			}
-
-			if( options.log ) {
-				sout | "=== Request for file " | nonl;
-				write(sout, file, name_size);
-				sout | " ===";
-			}
-
-			if( !options.file_cache.path ) {
-				if( options.log ) {
-					sout | "=== File Not Found (" | nonl;
-					write(sout, file, name_size);
-					sout | ") ===";
+
+		if(fd) push_connection(this, fd);
+
+		if (stats_thrd) {
+			unsigned long long next = rdtscl();
+			if(next > (last + 500000000)) {
+				if(try_lock(stats_thrd->stats.lock)) {
+					push(this.stats, stats_thrd->stats.accpt);
+					unlock(stats_thrd->stats.lock);
+					last = next;
 				}
-				answer_error(fd, E405);
-				continue REQUEST;
-			}
-
-			// Get the fd from the file cache
-			int ans_fd;
-			size_t count;
-			[ans_fd, count] = get_file( file, name_size );
-
-			// If we can't find the file, return 404
-			if( ans_fd < 0 ) {
-				if( options.log ) {
-					sout | "=== File Not Found (" | nonl;
-					write(sout, file, name_size);
-					sout | ") ===";
+			}
+		}
+
+		if( options.log ) sout | "=== Accepting connection ===";
+	}
+
+#elif defined(ACCEPT_MANY)
+	const int nacc = 10;
+	io_future_t results[nacc];
+
+	for(i; nacc) {
+		io_future_t & res = results[i];
+		reset(res);
+		/* paranoid */ assert(!available(res));
+		if( options.log ) mutex(sout) sout | "=== Re-arming accept no" | i | " ===";
+		async_accept4(res, this.sockfd, this.[addr, addrlen, flags], CFA_IO_LAZY);
+	}
+
+	for() {
+		if (stats_thrd) {
+			unsigned long long next = rdtscl();
+			if(next > (last + 500000000)) {
+				if(try_lock(stats_thrd->stats.lock __cfaabi_dbg_ctx2)) {
+					push(this.stats, stats_thrd->stats.accpt);
+					unlock(stats_thrd->stats.lock);
+					last = next;
 				}
-				answer_error(fd, E404);
-				continue REQUEST;
-			}
-
-			// Send the desired file
-			int ret = answer_sendfile( this.pipe, fd, ans_fd, count, this.stats.sendfile );
-			if( ret == -ECONNRESET ) break REQUEST;
-
-			if( options.log ) sout | "=== Answer sent ===";
-		}
-
-		if( options.log ) sout | "=== Connection closed ===";
-		continue CONNECTION;
-	}
-}
+			}
+		}
+
+		for(i; nacc) {
+			io_future_t & res = results[i];
+			if(available(res)) {
+				if( options.log ) mutex(sout) sout | "=== Accept no " | i | "completed with result" | res.result | "===";
+				int fd = get_res(res);
+				reset(res);
+				this.stats.accepts++;
+				if(fd < 0) {
+					if( errno == ECONNABORTED ) continue;
+					if( this.done && (errno == EINVAL || errno == EBADF) ) continue;
+					abort( "accept error: (%d) %s\n", (int)errno, strerror(errno) );
+				}
+				push_connection( this, fd );
+
+				/* paranoid */ assert(!available(res));
+				if( options.log ) mutex(sout) sout | "=== Re-arming accept no" | i | " ===";
+				async_accept4(res, this.sockfd, this.[addr, addrlen, flags], CFA_IO_LAZY);
+			}
+		}
+		if(this.done) return;
+
+		if( options.log ) mutex(sout) sout | "=== Waiting for any accept ===";
+		this.stats.eagains++;
+		wait_any(results, nacc);
+
+		if( options.log ) mutex(sout) {
+			sout | "=== Acceptor wake-up ===";
+			for(i; nacc) {
+				io_future_t & res = results[i];
+				sout | i | "available:" | available(res);
+			}
+		}
+
+	}
+
+	for(i; nacc) {
+		wait(results[i]);
+	}
+#else
+#error no accept algorithm specified
+#endif
+}
Index: benchmark/io/http/worker.hfa
===================================================================
--- benchmark/io/http/worker.hfa	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ benchmark/io/http/worker.hfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -1,4 +1,6 @@
 #pragma once
 
+#include <iofwd.hfa>
+#include <queueLockFree.hfa>
 #include <thread.hfa>
 
@@ -7,25 +9,24 @@
 }
 
+#include "printer.hfa"
+
 //=============================================================================================
 // Worker Thread
 //=============================================================================================
 
-extern const size_t zipf_sizes[];
-enum { zipf_cnts = 36, };
-
-struct sendfile_stats_t {
-	volatile uint64_t calls;
-	volatile uint64_t tries;
-	volatile uint64_t header;
-	volatile uint64_t splcin;
-	volatile uint64_t splcot;
+struct connection {
+	int pipe[2];
 	struct {
-		volatile uint64_t calls;
-		volatile uint64_t bytes;
-	} avgrd[zipf_cnts];
+		sendfile_stats_t sendfile;
+	} stats;
 };
 
-thread Worker {
-	int pipe[2];
+static inline void ?{}( connection & this ) {
+	this.pipe[0] = -1;
+	this.pipe[1] = -1;
+}
+
+thread AcceptWorker {
+	connection conn;
 	int sockfd;
 	struct sockaddr * addr;
@@ -33,8 +34,42 @@
 	int flags;
 	volatile bool done;
+};
+void ?{}( AcceptWorker & this);
+void main( AcceptWorker & );
+
+
+struct PendingRead {
+	PendingRead * volatile next;
+	io_future_t f;
 	struct {
-		sendfile_stats_t sendfile;
-	} stats;
+		void * buf;
+		size_t len;
+	} in;
+	struct {
+		volatile int fd;
+	} out;
 };
-void ?{}( Worker & this);
-void main( Worker & );
+
+static inline PendingRead * volatile & ?`next ( PendingRead * node ) {
+	return node->next;
+}
+
+thread ChannelWorker {
+	connection conn;
+	volatile bool done;
+	mpsc_queue(PendingRead) * queue;
+};
+void ?{}( ChannelWorker & );
+void main( ChannelWorker & );
+
+thread Acceptor {
+	mpsc_queue(PendingRead) * queue;
+	int sockfd;
+	struct sockaddr * addr;
+	socklen_t * addrlen;
+	int flags;
+	volatile bool done;
+	acceptor_stats_t stats;
+};
+void ?{}( Acceptor &, int cli );
+void main( Acceptor & );
Index: benchmark/io/sendfile/producer.cfa
===================================================================
--- benchmark/io/sendfile/producer.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
+++ benchmark/io/sendfile/producer.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -0,0 +1,509 @@
+// programs that sends a file many times as fast as it can
+// compares sendfile to splice
+
+#define _GNU_SOURCE
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <errno.h>
+#include <locale.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <sys/ioctl.h>
+#include <sys/sendfile.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <fcntl.h>
+
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <netdb.h>
+
+#include <liburing.h>
+
+enum {
+	USAGE_ERROR = 1,
+	HOST_ERROR,
+	PIPE_ERROR,
+	FSTAT_ERROR,
+	SOCKET_ERROR,
+	CONNECT_ERROR,
+	SENDFILE_ERROR,
+	SPLICEIN_ERROR,
+	SPLICEOUT_ERROR,
+	URINGWAIT_ERROR
+};
+
+enum { buffer_len = 10240 };
+char buffer[buffer_len];
+
+enum { TIMEGRAN = 1000000000LL, TIMES = 100000 };
+
+int pipefd[2];
+struct io_uring ring;
+
+char * buf;
+
+struct stats {
+	size_t calls;
+	size_t bytes;
+	struct {
+		struct {
+			size_t cnt;
+			size_t bytes;
+		} r, w;
+	} shorts;
+};
+static void my_sendfile(int out, int in, size_t size, struct stats *);
+static void my_splice  (int out, int in, size_t size, struct stats *);
+static void my_iouring (int out, int in, size_t size, struct stats *);
+static void my_ringlink(int out, int in, size_t size, struct stats *);
+static void my_readwrit(int out, int in, size_t size, struct stats *);
+typedef void (*sender_t)(int out, int in, size_t size, struct stats *);
+
+static void run(sender_t sender, struct addrinfo * addr, int infd, size_t size);
+
+int main(int argc, char * argv[]) {
+	setlocale(LC_ALL, "");
+	const char * file_path;
+	struct addrinfo * addr;
+	int file_fd;
+	int ret;
+	switch(argc) {
+	case 3:
+		{
+			// Open the file
+			const char * const path = argv[2];
+			ret = open(path, 0, O_RDONLY);
+			if(ret < 0) {
+				fprintf( stderr, "cannot open file '%s': %s\n\n", path, strerror(errno) );
+				goto USAGE;
+			}
+
+			file_path = path;
+			file_fd = ret;
+
+
+			// connect to the address
+			char * state = 0;
+			char * str = argv[1];
+			const char * const host = strtok_r(str, ":", &state);
+			if(NULL == host) {
+				fprintf( stderr, "Invalid host:port specification, no host.\n\n" );
+				goto USAGE;
+			}
+
+			const char * const port = strtok_r(NULL, ":", &state);
+			if(NULL == port) {
+				fprintf( stderr, "Invalid host:port specification, no port.\n\n" );
+				goto USAGE;
+			}
+
+			printf("looking up '%s:%s'\n", host, port);
+
+			struct addrinfo hints = {};
+			struct addrinfo * pResultList = NULL;
+
+			hints.ai_family = AF_INET;
+			hints.ai_socktype = SOCK_STREAM;
+			hints.ai_flags = AI_NUMERICSERV;
+
+			ret = getaddrinfo(host, port, &hints, &pResultList);
+
+			switch(ret) {
+			case 0:
+				addr = pResultList;
+				goto DONE;
+
+			case EAI_ADDRFAMILY:
+				fprintf( stderr, "The specified network host does not have any network addresses in the requested address family.\n\n" );
+				break;
+
+			case EAI_AGAIN:
+				fprintf( stderr, "The name server returned a temporary failure indication. Try again later.\n\n" );
+				exit( HOST_ERROR );
+
+			case EAI_BADFLAGS:
+				fprintf( stderr, "hints.ai_flags  contains invalid flags; or, hints.ai_flags included AI_CANONNAME and name was NULL.\n\n" );
+				exit( HOST_ERROR );
+
+			case EAI_FAIL:
+				fprintf( stderr, "The name server returned a permanent failure indication.\n\n" );
+				break;
+
+			case EAI_FAMILY:
+				fprintf( stderr, "The requested address family is not supported.\n\n" );
+				exit( HOST_ERROR );
+
+			case EAI_MEMORY:
+				fprintf( stderr, "Out of memory.\n\n" );
+				exit( HOST_ERROR );
+
+			case EAI_NODATA:
+				fprintf( stderr, "The specified network host exists, but does not have any network addresses defined.\n\n" );
+				break;
+
+			case EAI_NONAME:
+				fprintf( stderr, "The unkonwn host or invalid port.\n\n" );
+				break;
+
+			case EAI_SERVICE:
+				fprintf( stderr, "The requested service is not available for the requested socket type.\n\n" );
+				break;
+
+			case EAI_SOCKTYPE:
+				fprintf( stderr, "The requested  socket  type  is  not  supported.\n\n" );
+				exit( HOST_ERROR );
+
+			case EAI_SYSTEM:
+				// Other system error, check errno for details.
+			default:
+				fprintf( stderr, "Unnown hostname error: (%d) %s\n\n", (int)errno, strerror(errno) );
+				exit( HOST_ERROR );
+			}
+			if(pResultList) freeaddrinfo(pResultList);
+			goto USAGE;
+		}
+	USAGE:
+	default:
+		fprintf( stderr, "USAGE: %s host:port file\n", argv[0] );
+		exit( USAGE_ERROR );
+	}
+
+	DONE:
+
+	io_uring_queue_init(16, &ring, 0);
+
+	size_t file_size = 0;
+	{
+		struct stat buf;
+   		ret = fstat(file_fd, &buf);
+		if(0 != ret) {
+			fprintf( stderr, "fstat error: (%d) %s\n\n", (int)errno, strerror(errno) );
+			exit( FSTAT_ERROR );
+		}
+		file_size = buf.st_size;
+	}
+
+	{
+		char addr_str[INET_ADDRSTRLEN];
+		struct sockaddr_in * address = (struct sockaddr_in *) addr->ai_addr;
+		inet_ntop( AF_INET, &address->sin_addr, addr_str, INET_ADDRSTRLEN );
+		printf("sending '%s' (%zu bytes) to '%s:%i'\n", file_path, file_size, addr_str, ntohs(address->sin_port));
+	}
+
+	ret = pipe(pipefd);
+	if( ret < 0 ) {
+		fprintf( stderr, "pipe error: (%d) %s\n\n", (int)errno, strerror(errno) );
+		exit( PIPE_ERROR );
+	}
+
+	buf = malloc(file_size);
+
+	printf("--- read + write ---\n");
+	run(my_readwrit, addr, file_fd, file_size);
+	printf("--- splice ---\n");
+	run(my_splice  , addr, file_fd, file_size);
+	printf("--- sendfile ---\n");
+	run(my_sendfile, addr, file_fd, file_size);
+	printf("--- io_uring ---\n");
+	run(my_iouring, addr, file_fd, file_size);
+	printf("--- io_uring + link ---\n");
+	run(my_ringlink, addr, file_fd, file_size);
+
+	close(pipefd[0]);
+	close(pipefd[1]);
+	close(file_fd);
+	return 0;
+}
+
+static void run(sender_t sender, struct addrinfo * addr, int infd, size_t size) {
+
+	int sock = socket(addr->ai_family, addr->ai_socktype, addr->ai_protocol);
+      if(sock < 0) {
+		fprintf( stderr, "socket error: (%d) %s\n\n", (int)errno, strerror(errno) );
+		exit( SOCKET_ERROR );
+      }
+
+      int ret = connect(sock, addr->ai_addr, addr->ai_addrlen);
+      if(ret < 0) {
+            fprintf( stderr, "connect error: (%d) %s\n\n", (int)errno, strerror(errno) );
+		exit( CONNECT_ERROR );
+      }
+
+	struct stats st;
+	st.calls = 0;
+	st.bytes = 0;
+	st.shorts.r.cnt = 0;
+	st.shorts.r.bytes = 0;
+	st.shorts.w.cnt = 0;
+	st.shorts.w.bytes = 0;
+
+	struct timespec after, before;
+
+	clock_gettime(CLOCK_MONOTONIC, &before);
+
+	for(long long int i = 0; i < TIMES; i++) {
+		sender( sock, infd, size, &st );
+	}
+
+	clock_gettime(CLOCK_MONOTONIC, &after);
+
+	close(sock);
+
+	uint64_t tb = ((int64_t)before.tv_sec * TIMEGRAN) + before.tv_nsec;
+	uint64_t ta = ((int64_t)after.tv_sec * TIMEGRAN) + after.tv_nsec;
+	double secs = ((double)ta - tb) / TIMEGRAN;
+
+	printf("Sent %'zu bytes in %'zu files, %f seconds\n", st.bytes, st.calls, secs);
+	printf(" - %'3.3f bytes per second\n", (((double)st.bytes) / secs));
+	printf(" - %'f seconds per file\n", secs / st.calls);
+	printf(" - %'3.3f bytes per calls\n", (((double)st.bytes) / st.calls));
+	if(st.shorts.r.cnt ){
+		printf(" - %'zu short reads\n", st.shorts.r.cnt);
+		printf(" - %'3.3f bytes per short read\n", (((double)st.shorts.r.bytes) / st.shorts.r.cnt));
+	} else printf("No short reads\n");
+	if(st.shorts.w.cnt ){
+		printf(" - %'zu short reads\n", st.shorts.w.cnt);
+		printf(" - %'3.3f bytes per short read\n", (((double)st.shorts.w.bytes) / st.shorts.w.cnt));
+	} else printf("No short writes\n");
+}
+
+static void my_sendfile(int out, int in, size_t size, struct stats * st) {
+	off_t off = 0;
+	for(;;) {
+
+		ssize_t ret = sendfile(out, in, &off, size);
+		if(ret < 0) {
+			fprintf( stderr, "connect error: (%d) %s\n\n", (int)errno, strerror(errno) );
+			exit( SENDFILE_ERROR );
+		}
+
+		st->calls++;
+		st->bytes += ret;
+		off += ret;
+		size -= ret;
+		if( size == 0 ) return;
+		st->shorts.r.cnt++;
+		st->shorts.r.bytes += ret;
+	}
+}
+
+static void my_splice  (int out, int in, size_t size, struct stats * st) {
+	unsigned flags = 0; //SPLICE_F_MOVE; // | SPLICE_F_MORE;
+	off_t offset = 0;
+	size_t writes = 0;
+	for(;;) {
+		ssize_t reti = 0;
+		reti = splice(in, &offset, pipefd[1], NULL, size, flags);
+		if( reti < 0 ) {
+			fprintf( stderr, "splice in error: (%d) %s\n\n", (int)errno, strerror(errno) );
+			exit( SPLICEIN_ERROR );
+		}
+
+		size -= reti;
+		size_t in_pipe = reti;
+		for(;;) {
+			ssize_t reto = 0;
+			reto = splice(pipefd[0], NULL, out, NULL, in_pipe, flags);
+			if( reto < 0 ) {
+				fprintf( stderr, "splice out error: (%d) %s\n\n", (int)errno, strerror(errno) );
+				exit( SPLICEOUT_ERROR );
+			}
+			in_pipe -= reto;
+			writes += reto;
+			if(0 == in_pipe) break;
+			st->shorts.w.cnt++;
+			st->shorts.w.bytes += reto;
+		}
+		if(0 == size) break;
+		st->shorts.r.cnt++;
+		st->shorts.r.bytes += reti;
+	}
+	st->calls++;
+	st->bytes += writes;
+}
+
+static ssize_t naive_splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags) {
+	struct io_uring_sqe * sqe = io_uring_get_sqe(&ring);
+
+	io_uring_prep_splice(sqe, fd_in, NULL != off_in ? *off_in: -1, fd_out, NULL != off_out ? *off_out: -1, len, flags);
+
+	io_uring_submit(&ring);
+
+	struct io_uring_cqe * cqe = NULL;
+	/* wait for the sqe to complete */
+	int ret = io_uring_wait_cqe_nr(&ring, &cqe, 1);
+
+	/* read and process cqe event */
+	switch(ret) {
+	case 0:
+		{
+			ssize_t val = cqe->res;
+			if( cqe->res < 0 ) {
+				printf("Completion Error : %s\n", strerror( -cqe->res ));
+				return EXIT_FAILURE;
+			}
+			io_uring_cqe_seen(&ring, cqe);
+			return val;
+		}
+	default:
+		fprintf( stderr, "io_uring_wait error: (%d) %s\n\n", (int)-ret, strerror(-ret) );
+		exit( URINGWAIT_ERROR );
+	}
+}
+
+static void my_iouring (int out, int in, size_t size, struct stats * st) {
+	unsigned flags = 0; //SPLICE_F_MOVE; // | SPLICE_F_MORE;
+	off_t offset = 0;
+	size_t writes = 0;
+	for(;;) {
+		ssize_t reti = 0;
+		reti = naive_splice(in, &offset, pipefd[1], NULL, size, flags);
+		if( reti < 0 ) {
+			fprintf( stderr, "splice in error: (%d) %s\n\n", (int)errno, strerror(errno) );
+			exit( SPLICEIN_ERROR );
+		}
+
+		size -= reti;
+		size_t in_pipe = reti;
+		for(;;) {
+			ssize_t reto = 0;
+			reto = naive_splice(pipefd[0], NULL, out, NULL, in_pipe, flags);
+			if( reto < 0 ) {
+				fprintf( stderr, "splice out error: (%d) %s\n\n", (int)errno, strerror(errno) );
+				exit( SPLICEOUT_ERROR );
+			}
+			in_pipe -= reto;
+			writes += reto;
+			if(0 == in_pipe) break;
+			st->shorts.w.cnt++;
+			st->shorts.w.bytes += reto;
+		}
+		if(0 == size) break;
+		st->shorts.r.cnt++;
+		st->shorts.r.bytes += reti;
+	}
+	st->calls++;
+	st->bytes += writes;
+}
+
+static void my_ringlink(int out, int in, size_t size, struct stats * st) {
+	enum { SPLICE_IN, SPLICE_OUT };
+
+	size_t in_pipe = size;
+	off_t offset = 0;
+	bool has_in = false;
+	bool has_out = false;
+	while(true) {
+		if(!has_in && size > 0) {
+			struct io_uring_sqe * sqe = io_uring_get_sqe(&ring);
+			io_uring_prep_splice(sqe, in, offset, pipefd[1], -1, size, 0);
+			sqe->user_data = SPLICE_IN;
+			sqe->flags = IOSQE_IO_LINK;
+			has_in = true;
+		}
+		if(!has_out) {
+			struct io_uring_sqe * sqe = io_uring_get_sqe(&ring);
+			io_uring_prep_splice(sqe, pipefd[0], -1, out, -1, in_pipe, 0);
+			sqe->user_data = SPLICE_OUT;
+			has_out = true;
+		}
+
+		int ret = io_uring_submit_and_wait(&ring, 1);
+		if(ret < 0) {
+			fprintf( stderr, "io_uring_submit error: (%d) %s\n\n", (int)-ret, strerror(-ret) );
+			exit( URINGWAIT_ERROR );
+		}
+
+		/* poll the cq and count how much polling we did */
+		while(true) {
+			struct io_uring_cqe * cqe = NULL;
+			/* wait for the sqe to complete */
+			int ret = io_uring_wait_cqe_nr(&ring, &cqe, 0);
+
+			/* read and process cqe event */
+			switch(ret) {
+			case 0:
+				if( cqe->res < 0 ) {
+					printf("Completion Error : %s\n", strerror( -cqe->res ));
+					exit( URINGWAIT_ERROR );
+				}
+
+				ssize_t write = cqe->res;
+				int which = cqe->user_data;
+				io_uring_cqe_seen(&ring, cqe);
+				switch( which ) {
+				case SPLICE_IN:
+					has_in = false;
+					size -= write;
+					offset += write;
+					if(0 == size) break;
+					st->shorts.r.cnt++;
+					st->shorts.r.bytes += write;
+					break;
+				case SPLICE_OUT:
+					has_out = false;
+					in_pipe -= write;
+					st->bytes += write;
+					if(0 == in_pipe) break;
+					st->shorts.w.cnt++;
+					st->shorts.w.bytes += write;
+					break;
+				default:
+					printf("Completion Error : unknown user data\n");
+					exit( URINGWAIT_ERROR );
+				}
+				continue;
+			case -EAGAIN:
+				goto OUTER;
+			default:
+				fprintf( stderr, "io_uring_get_cqe error: (%d) %s\n\n", (int)-ret, strerror(-ret) );
+				exit( URINGWAIT_ERROR );
+			}
+		}
+		OUTER:
+		if(0 == in_pipe) break;
+	}
+	st->calls++;
+}
+
+static void my_readwrit(int out, int in, size_t size, struct stats * st) {
+	off_t offset = 0;
+	size_t writes = 0;
+	for(;;) {
+		ssize_t reti = pread(in, buf, size, offset);
+		if( reti < 0 ) {
+			printf("Read in Error : (%d) %s\n\n", (int)errno, strerror(errno) );
+			exit( 1 );
+		}
+
+		offset += reti;
+		size -= reti;
+
+		size_t in_buf = reti;
+		for(;;) {
+			ssize_t reto = write(out, buf, in_buf);
+			if( reto < 0 ) {
+					printf("Write out Error : (%d) %s\n\n", (int)errno, strerror(errno) );
+					exit( 1 );
+				}
+
+			in_buf -= reto;
+			writes += reto;
+			if(0 == in_buf) break;
+			st->shorts.w.cnt++;
+			st->shorts.w.bytes += reto;
+		}
+		if(0 == size) break;
+		st->shorts.r.cnt++;
+		st->shorts.r.bytes += reti;
+	}
+	st->calls++;
+	st->bytes += writes;
+}
Index: libcfa/src/bits/locks.hfa
===================================================================
--- libcfa/src/bits/locks.hfa	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ libcfa/src/bits/locks.hfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -26,12 +26,4 @@
 	// Wrap in struct to prevent false sharing with debug info
 	volatile bool lock;
-	#ifdef __CFA_DEBUG__
-		// previous function to acquire the lock
-		const char * prev_name;
-		// previous thread to acquire the lock
-		void* prev_thrd;
-		// keep track of number of times we had to spin, just in case the number is unexpectedly huge
-		size_t spin_count;
-	#endif
 };
 
@@ -40,17 +32,9 @@
 		extern void disable_interrupts() OPTIONAL_THREAD;
 		extern void enable_interrupts( bool poll = true ) OPTIONAL_THREAD;
-
-		#ifdef __CFA_DEBUG__
-			void __cfaabi_dbg_record_lock(__spinlock_t & this, const char prev_name[]);
-		#else
-			#define __cfaabi_dbg_record_lock(x, y)
-		#endif
+		#define __cfaabi_dbg_record_lock(x, y)
 	}
 
 	static inline void ?{}( __spinlock_t & this ) {
 		this.lock = 0;
-		#ifdef __CFA_DEBUG__
-			this.spin_count = 0;
-		#endif
 	}
 
@@ -77,7 +61,4 @@
 		for ( unsigned int i = 1;; i += 1 ) {
 			if ( (this.lock == 0) && (__atomic_test_and_set( &this.lock, __ATOMIC_ACQUIRE ) == 0) ) break;
-			#ifdef __CFA_DEBUG__
-				this.spin_count++;
-			#endif
 			#ifndef NOEXPBACK
 				// exponential spin
Index: libcfa/src/concurrency/invoke.h
===================================================================
--- libcfa/src/concurrency/invoke.h	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ libcfa/src/concurrency/invoke.h	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -195,10 +195,4 @@
 		struct __monitor_group_t monitors;
 
-		// used to put threads on user data structures
-		struct {
-			struct thread$ * next;
-			struct thread$ * back;
-		} seqable;
-
 		// used to put threads on dlist data structure
 		__cfa_dlink(thread$);
@@ -208,4 +202,10 @@
 			struct thread$ * prev;
 		} node;
+
+		// used to store state between clh lock/unlock
+		volatile bool * clh_prev;
+
+		// used to point to this thd's current clh node
+		volatile bool * clh_node;
 
 		struct processor * last_proc;
@@ -240,20 +240,4 @@
 		}
 
-		static inline thread$ * volatile & ?`next ( thread$ * this )  __attribute__((const)) {
-			return this->seqable.next;
-		}
-
-		static inline thread$ *& Back( thread$ * this ) __attribute__((const)) {
-			return this->seqable.back;
-		}
-
-		static inline thread$ *& Next( thread$ * this ) __attribute__((const)) {
-				return this->seqable.next;
-		}
-
-		static inline bool listed( thread$ * this ) {
-			return this->seqable.next != 0p;
-		}
-
 		static inline void ?{}(__monitor_group_t & this) {
 			(this.data){0p};
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ libcfa/src/concurrency/io.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -159,30 +159,39 @@
 
 		const __u32 mask = *ctx->cq.mask;
+		const __u32 num  = *ctx->cq.num;
 		unsigned long long ts_prev = ctx->cq.ts;
-
-		// re-read the head and tail in case it already changed.
-		const __u32 head = *ctx->cq.head;
-		const __u32 tail = *ctx->cq.tail;
-		const __u32 count = tail - head;
-		__STATS__( false, io.calls.drain++; io.calls.completed += count; )
-
-		for(i; count) {
-			unsigned idx = (head + i) & mask;
-			volatile struct io_uring_cqe & cqe = ctx->cq.cqes[idx];
-
-			/* paranoid */ verify(&cqe);
-
-			struct io_future_t * future = (struct io_future_t *)(uintptr_t)cqe.user_data;
-			// __cfadbg_print_safe( io, "Kernel I/O : Syscall completed : cqe %p, result %d for %p\n", &cqe, cqe.res, future );
-
-			__kernel_unpark( fulfil( *future, cqe.res, false ), UNPARK_LOCAL );
-		}
-
-		unsigned long long ts_next = ctx->cq.ts = rdtscl();
-
-		// Mark to the kernel that the cqe has been seen
-		// Ensure that the kernel only sees the new value of the head index after the CQEs have been read.
-		__atomic_store_n( ctx->cq.head, head + count, __ATOMIC_SEQ_CST );
-		ctx->proc->idle_wctx.drain_time = ts_next;
+		unsigned long long ts_next;
+
+		// We might need to do this multiple times if more events completed than can fit in the queue.
+		for() {
+			// re-read the head and tail in case it already changed.
+			const __u32 head = *ctx->cq.head;
+			const __u32 tail = *ctx->cq.tail;
+			const __u32 count = tail - head;
+			__STATS__( false, io.calls.drain++; io.calls.completed += count; )
+
+			for(i; count) {
+				unsigned idx = (head + i) & mask;
+				volatile struct io_uring_cqe & cqe = ctx->cq.cqes[idx];
+
+				/* paranoid */ verify(&cqe);
+
+				struct io_future_t * future = (struct io_future_t *)(uintptr_t)cqe.user_data;
+				// __cfadbg_print_safe( io, "Kernel I/O : Syscall completed : cqe %p, result %d for %p\n", &cqe, cqe.res, future );
+
+				__kernel_unpark( fulfil( *future, cqe.res, false ), UNPARK_LOCAL );
+			}
+
+			ts_next = ctx->cq.ts = rdtscl();
+
+			// Mark to the kernel that the cqe has been seen
+			// Ensure that the kernel only sees the new value of the head index after the CQEs have been read.
+			__atomic_store_n( ctx->cq.head, head + count, __ATOMIC_SEQ_CST );
+			ctx->proc->idle_wctx.drain_time = ts_next;
+
+			if(likely(count < num)) break;
+
+			ioring_syscsll( *ctx, 0, IORING_ENTER_GETEVENTS);
+		}
 
 		__cfadbg_print_safe(io, "Kernel I/O : %u completed age %llu\n", count, ts_next);
Index: libcfa/src/concurrency/io/setup.cfa
===================================================================
--- libcfa/src/concurrency/io/setup.cfa	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ libcfa/src/concurrency/io/setup.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -138,5 +138,5 @@
 		__u32 nentries = params_in.num_entries != 0 ? params_in.num_entries : 256;
 		if( !is_pow2(nentries) ) {
-			abort("ERROR: I/O setup 'num_entries' must be a power of 2\n");
+			abort("ERROR: I/O setup 'num_entries' must be a power of 2, was %u\n", nentries);
 		}
 
Index: libcfa/src/concurrency/iofwd.hfa
===================================================================
--- libcfa/src/concurrency/iofwd.hfa	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ libcfa/src/concurrency/iofwd.hfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -76,4 +76,6 @@
 	void reset    ( io_future_t & this ) { return reset    (this.self); }
 	bool available( io_future_t & this ) { return available(this.self); }
+	bool setup    ( io_future_t & this, oneshot & ctx ) { return setup  (this.self, ctx); }
+	bool retract  ( io_future_t & this, oneshot & ctx ) { return retract(this.self, ctx); }
 }
 
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ libcfa/src/concurrency/kernel.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -834,17 +834,4 @@
 #endif
 
-
-
-//-----------------------------------------------------------------------------
-// Debug
-__cfaabi_dbg_debug_do(
-	extern "C" {
-		void __cfaabi_dbg_record_lock(__spinlock_t & this, const char prev_name[]) {
-			this.prev_name = prev_name;
-			this.prev_thrd = kernelTLS().this_thread;
-		}
-	}
-)
-
 //-----------------------------------------------------------------------------
 // Debug
Index: libcfa/src/concurrency/kernel/fwd.hfa
===================================================================
--- libcfa/src/concurrency/kernel/fwd.hfa	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ libcfa/src/concurrency/kernel/fwd.hfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -200,5 +200,4 @@
 					struct thread$ * expected = this.ptr;
 					if(expected == 1p) return false;
-					/* paranoid */ verify( expected == 0p );
 					if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
 						park();
@@ -213,5 +212,5 @@
 			thread$ * post(oneshot & this, bool do_unpark = true) {
 				struct thread$ * got = __atomic_exchange_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
-				if( got == 0p ) return 0p;
+				if( got == 0p || got == 1p ) return 0p;
 				if(do_unpark) unpark( got );
 				return got;
@@ -263,5 +262,4 @@
 
 					// The future is not fulfilled, try to setup the wait context
-					/* paranoid */ verify( expected == 0p );
 					if(__atomic_compare_exchange_n(&this.ptr, &expected, &wait_ctx, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
 						return true;
@@ -275,30 +273,34 @@
 			// should retract the wait ctx
 			// intented to be use by wait, wait_any, waitfor, etc. rather than used directly
-			void retract( future_t & this, oneshot & wait_ctx ) {
-				// Remove the wait context
-				struct oneshot * got = __atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);
-
-				// got == 0p: future was never actually setup, just return
-				if( got == 0p ) return;
-
-				// got == wait_ctx: since fulfil does an atomic_swap,
-				// if we got back the original then no one else saw context
-				// It is safe to delete (which could happen after the return)
-				if( got == &wait_ctx ) return;
-
-				// got == 1p: the future is ready and the context was fully consumed
-				// the server won't use the pointer again
-				// It is safe to delete (which could happen after the return)
-				if( got == 1p ) return;
-
-				// got == 2p: the future is ready but the context hasn't fully been consumed
-				// spin until it is safe to move on
-				if( got == 2p ) {
-					while( this.ptr != 1p ) Pause();
-					return;
-				}
-
-				// got == any thing else, something wen't wrong here, abort
-				abort("Future in unexpected state");
+			bool retract( future_t & this, oneshot & wait_ctx ) {
+				for() {
+					struct oneshot * expected = this.ptr;
+
+					// expected == 0p: future was never actually setup, just return
+					if( expected == 0p ) return false;
+
+					// expected == 1p: the future is ready and the context was fully consumed
+					// the server won't use the pointer again
+					// It is safe to delete (which could happen after the return)
+					if( expected == 1p ) return true;
+
+					// expected == 2p: the future is ready but the context hasn't fully been consumed
+					// spin until it is safe to move on
+					if( expected == 2p ) {
+						while( this.ptr != 1p ) Pause();
+						/* paranoid */ verify( this.ptr == 1p );
+						return true;
+					}
+
+					// expected != wait_ctx: the future was setup with a different context ?!?!
+					// something went wrong here, abort
+					if( expected != &wait_ctx ) abort("Future in unexpected state");
+
+					// we still have the original context, then no one else saw it
+					// attempt to remove the context so it doesn't get consumed.
+					if(__atomic_compare_exchange_n( &this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+						return false;
+					}
+				}
 			}
 
@@ -379,4 +381,27 @@
 				return ret;
 			}
+
+			// Wait for any future to be fulfilled
+			forall(T& | sized(T) | { bool setup( T&, oneshot & ); bool retract( T&, oneshot & ); })
+			T & wait_any( T * futures, size_t num_futures ) {
+				oneshot temp;
+
+				// setup all futures
+				// if any are already satisfied return
+				for ( i; num_futures ) {
+					if( !setup(futures[i], temp) ) return futures[i];
+				}
+
+				// Wait context is setup, just wait on it
+				wait( temp );
+
+				size_t ret;
+				// attempt to retract all futures
+				for ( i; num_futures ) {
+					if ( retract( futures[i], temp ) ) ret = i;
+				}
+
+				return futures[ret];
+			}
 		}
 
Index: libcfa/src/concurrency/locks.cfa
===================================================================
--- libcfa/src/concurrency/locks.cfa	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ libcfa/src/concurrency/locks.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -219,4 +219,37 @@
 	// this casts the alarm node to our wrapped type since we used type erasure
 	static void alarm_node_wrap_cast( alarm_node_t & a ) { timeout_handler( (alarm_node_wrap(L) &)a ); }
+
+	struct pthread_alarm_node_wrap {
+		alarm_node_t alarm_node;
+		pthread_cond_var(L) * cond;
+		info_thread(L) * info_thd;
+	};
+
+	void ?{}( pthread_alarm_node_wrap(L) & this, Duration alarm, Duration period, Alarm_Callback callback, pthread_cond_var(L) * c, info_thread(L) * i ) {
+		this.alarm_node{ callback, alarm, period };
+		this.cond = c;
+		this.info_thd = i;
+	}
+
+	void ^?{}( pthread_alarm_node_wrap(L) & this ) { }
+
+	static void timeout_handler ( pthread_alarm_node_wrap(L) & this ) with( this ) {
+		// This pthread_cond_var member is called from the kernel, and therefore, cannot block, but it can spin.
+		lock( cond->lock __cfaabi_dbg_ctx2 );
+
+		// this check is necessary to avoid a race condition since this timeout handler
+		// 	may still be called after a thread has been removed from the queue but
+		// 	before the alarm is unregistered
+		if ( (*info_thd)`isListed ) {	// is thread on queue
+			info_thd->signalled = false;
+			// remove this thread O(1)
+			remove( *info_thd );
+			on_notify(*info_thd->lock, info_thd->t);
+		}
+		unlock( cond->lock );
+	}
+
+	// this casts the alarm node to our wrapped type since we used type erasure
+	static void pthread_alarm_node_wrap_cast( alarm_node_t & a ) { timeout_handler( (pthread_alarm_node_wrap(L) &)a ); }
 }
 
@@ -388,6 +421,93 @@
 		on_wakeup(*i.lock, recursion_count);
 	}
-}
-
+
+	//-----------------------------------------------------------------------------
+	// pthread_cond_var
+
+	void  ?{}( pthread_cond_var(L) & this ) with(this) {
+		blocked_threads{};
+		lock{};
+	}
+
+	void ^?{}( pthread_cond_var(L) & this ) { }
+
+	bool notify_one( pthread_cond_var(L) & this ) with(this) { 
+		lock( lock __cfaabi_dbg_ctx2 );
+		bool ret = ! blocked_threads`isEmpty;
+		if ( ret ) {
+			info_thread(L) & popped = try_pop_front( blocked_threads );
+			on_notify(*popped.lock, popped.t);
+		}
+		unlock( lock );
+		return ret;
+	}
+
+	bool notify_all( pthread_cond_var(L) & this ) with(this) { 
+		lock( lock __cfaabi_dbg_ctx2 );
+		bool ret = ! blocked_threads`isEmpty;
+		while( ! blocked_threads`isEmpty ) {
+			info_thread(L) & popped = try_pop_front( blocked_threads );
+			on_notify(*popped.lock, popped.t);
+		}
+		unlock( lock );
+		return ret;
+	}
+
+	uintptr_t front( pthread_cond_var(L) & this ) with(this) { return blocked_threads`isEmpty ? NULL : blocked_threads`first.info; }
+	bool empty ( pthread_cond_var(L) & this ) with(this) { return blocked_threads`isEmpty; }
+
+	static size_t queue_and_get_recursion( pthread_cond_var(L) & this, info_thread(L) * i ) with(this) {
+		// add info_thread to waiting queue
+		insert_last( blocked_threads, *i );
+		size_t recursion_count = 0;
+		recursion_count = on_wait( *i->lock );
+		return recursion_count;
+	}
+	
+	static void queue_info_thread_timeout( pthread_cond_var(L) & this, info_thread(L) & info, Duration t, Alarm_Callback callback ) with(this) {
+		lock( lock __cfaabi_dbg_ctx2 );
+		size_t recursion_count = queue_and_get_recursion(this, &info);
+		pthread_alarm_node_wrap(L) node_wrap = { t, 0`s, callback, &this, &info };
+		register_self( &node_wrap.alarm_node );
+		unlock( lock );
+
+		// blocks here
+		park();
+
+		// unregisters alarm so it doesn't go off if this happens first
+		unregister_self( &node_wrap.alarm_node );
+
+		// resets recursion count here after waking
+		if (info.lock) on_wakeup(*info.lock, recursion_count);
+	}
+
+	void wait( pthread_cond_var(L) & this, L & l ) with(this) {
+		wait( this, l, 0 );
+	}
+
+	void wait( pthread_cond_var(L) & this, L & l, uintptr_t info ) with(this) {
+		lock( lock __cfaabi_dbg_ctx2 );
+		info_thread( L ) i = { active_thread(), info, &l };
+		size_t recursion_count = queue_and_get_recursion(this, &i);
+		unlock( lock );
+		park( );
+		on_wakeup(*i.lock, recursion_count);
+	}
+
+	#define PTHREAD_WAIT_TIME( u, l, t ) \
+		info_thread( L ) i = { active_thread(), u, l }; \
+		queue_info_thread_timeout(this, i, t, pthread_alarm_node_wrap_cast ); \
+		return i.signalled;
+
+	bool wait( pthread_cond_var(L) & this, L & l, timespec t ) {
+		Duration d = { t };
+		WAIT_TIME( 0, &l , d )
+	}
+	
+	bool wait( pthread_cond_var(L) & this, L & l, uintptr_t info, timespec t  ) {
+		Duration d = { t };
+		WAIT_TIME( info, &l , d )
+	}
+}
 //-----------------------------------------------------------------------------
 // Semaphore
Index: libcfa/src/concurrency/locks.hfa
===================================================================
--- libcfa/src/concurrency/locks.hfa	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ libcfa/src/concurrency/locks.hfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -101,4 +101,69 @@
 
 //-----------------------------------------------------------------------------
+// MCS Spin Lock
+// - No recursive acquisition
+// - Needs to be released by owner
+
+struct mcs_spin_node {
+	mcs_spin_node * volatile next;
+	volatile bool locked;
+};
+
+struct mcs_spin_queue {
+	mcs_spin_node * volatile tail;
+};
+
+static inline void ?{}(mcs_spin_node & this) { this.next = 0p; this.locked = true; }
+
+static inline mcs_spin_node * volatile & ?`next ( mcs_spin_node * node ) {
+	return node->next;
+}
+
+struct mcs_spin_lock {
+	mcs_spin_queue queue;
+};
+
+static inline void lock(mcs_spin_lock & l, mcs_spin_node & n) {
+	mcs_spin_node * prev = __atomic_exchange_n(&l.queue.tail, &n, __ATOMIC_SEQ_CST);
+	n.locked = true;
+	if(prev == 0p) return;
+	prev->next = &n;
+	while(__atomic_load_n(&n.locked, __ATOMIC_RELAXED)) Pause();
+}
+
+static inline void unlock(mcs_spin_lock & l, mcs_spin_node & n) {
+	mcs_spin_node * n_ptr = &n;
+	if (__atomic_compare_exchange_n(&l.queue.tail, &n_ptr, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) return;
+	while (__atomic_load_n(&n.next, __ATOMIC_RELAXED) == 0p) {}
+	n.next->locked = false;
+}
+
+//-----------------------------------------------------------------------------
+// CLH Spinlock
+// - No recursive acquisition
+// - Needs to be released by owner
+
+struct clh_lock {
+	volatile bool * volatile tail;
+};
+
+static inline void  ?{}( clh_lock & this ) { this.tail = malloc(); *this.tail = true; }
+static inline void ^?{}( clh_lock & this ) { free(this.tail); }
+
+static inline void lock(clh_lock & l) {
+	thread$ * curr_thd = active_thread();
+	*(curr_thd->clh_node) = false;
+	volatile bool * prev = __atomic_exchange_n((bool **)(&l.tail), (bool *)(curr_thd->clh_node), __ATOMIC_SEQ_CST);
+	while(!__atomic_load_n(prev, __ATOMIC_ACQUIRE)) Pause();
+	curr_thd->clh_prev = prev;
+}
+
+static inline void unlock(clh_lock & l) {
+	thread$ * curr_thd = active_thread();
+	__atomic_store_n(curr_thd->clh_node, true, __ATOMIC_RELEASE);
+	curr_thd->clh_node = curr_thd->clh_prev;
+}
+
+//-----------------------------------------------------------------------------
 // Linear backoff Spinlock
 struct linear_backoff_then_block_lock {
@@ -205,16 +270,22 @@
 // Fast Block Lock
 
-// High efficiency minimal blocking lock
+// minimal blocking lock
 // - No reacquire for cond var
 // - No recursive acquisition
 // - No ownership
 struct fast_block_lock {
+	// List of blocked threads
+	dlist( thread$ ) blocked_threads;
+
 	// Spin lock used for mutual exclusion
 	__spinlock_t lock;
 
-	// List of blocked threads
-	dlist( thread$ ) blocked_threads;
-
+	// flag showing if lock is held
 	bool held:1;
+
+	#ifdef __CFA_DEBUG__
+	// for deadlock detection
+	struct thread$ * owner;
+	#endif
 };
 
@@ -231,4 +302,8 @@
 static inline void lock(fast_block_lock & this) with(this) {
 	lock( lock __cfaabi_dbg_ctx2 );
+
+	#ifdef __CFA_DEBUG__
+	assert(!(held && owner == active_thread()));
+	#endif
 	if (held) {
 		insert_last( blocked_threads, *active_thread() );
@@ -238,4 +313,7 @@
 	}
 	held = true;
+	#ifdef __CFA_DEBUG__
+	owner = active_thread();
+	#endif
 	unlock( lock );
 }
@@ -246,4 +324,7 @@
 	thread$ * t = &try_pop_front( blocked_threads );
 	held = ( t ? true : false );
+	#ifdef __CFA_DEBUG__
+	owner = ( t ? t : 0p );
+	#endif
 	unpark( t );
 	unlock( lock );
@@ -253,4 +334,274 @@
 static inline size_t on_wait(fast_block_lock & this) { unlock(this); return 0; }
 static inline void on_wakeup(fast_block_lock & this, size_t recursion ) { }
+
+//-----------------------------------------------------------------------------
+// simple_owner_lock
+
+// pthread owner lock
+// - reacquire for cond var
+// - recursive acquisition
+// - ownership
+struct simple_owner_lock {
+	// List of blocked threads
+	dlist( thread$ ) blocked_threads;
+
+	// Spin lock used for mutual exclusion
+	__spinlock_t lock;
+
+	// owner showing if lock is held
+	struct thread$ * owner;
+
+	size_t recursion_count;
+};
+
+static inline void  ?{}( simple_owner_lock & this ) with(this) {
+	lock{};
+	blocked_threads{};
+	owner = 0p;
+	recursion_count = 0;
+}
+static inline void ^?{}( simple_owner_lock & this ) {}
+static inline void ?{}( simple_owner_lock & this, simple_owner_lock this2 ) = void;
+static inline void ?=?( simple_owner_lock & this, simple_owner_lock this2 ) = void;
+
+static inline void lock(simple_owner_lock & this) with(this) {
+	if (owner == active_thread()) {
+		recursion_count++;
+		return;
+	}
+	lock( lock __cfaabi_dbg_ctx2 );
+
+	if (owner != 0p) {
+		insert_last( blocked_threads, *active_thread() );
+		unlock( lock );
+		park( );
+		return;
+	}
+	owner = active_thread();
+	recursion_count = 1;
+	unlock( lock );
+}
+
+// TODO: fix duplicate def issue and bring this back
+// void pop_and_set_new_owner( simple_owner_lock & this ) with( this ) {
+	// thread$ * t = &try_pop_front( blocked_threads );
+	// owner = t;
+	// recursion_count = ( t ? 1 : 0 );
+	// unpark( t );
+// }
+
+static inline void unlock(simple_owner_lock & this) with(this) {
+	lock( lock __cfaabi_dbg_ctx2 );
+	/* paranoid */ verifyf( owner != 0p, "Attempt to release lock %p that isn't held", &this );
+	/* paranoid */ verifyf( owner == active_thread(), "Thread %p other than the owner %p attempted to release owner lock %p", owner, active_thread(), &this );
+	// if recursion count is zero release lock and set new owner if one is waiting
+	recursion_count--;
+	if ( recursion_count == 0 ) {
+		// pop_and_set_new_owner( this );
+		thread$ * t = &try_pop_front( blocked_threads );
+		owner = t;
+		recursion_count = ( t ? 1 : 0 );
+		unpark( t );
+	}
+	unlock( lock );
+}
+
+static inline void on_notify(simple_owner_lock & this, struct thread$ * t ) with(this) {
+	lock( lock __cfaabi_dbg_ctx2 );
+	// lock held
+	if ( owner != 0p ) {
+		insert_last( blocked_threads, *t );
+		unlock( lock );
+	}
+	// lock not held
+	else {
+		owner = t;
+		recursion_count = 1;
+		unpark( t );
+		unlock( lock );
+	}
+}
+
+static inline size_t on_wait(simple_owner_lock & this) with(this) { 
+	lock( lock __cfaabi_dbg_ctx2 );
+	/* paranoid */ verifyf( owner != 0p, "Attempt to release lock %p that isn't held", &this );
+	/* paranoid */ verifyf( owner == active_thread(), "Thread %p other than the owner %p attempted to release owner lock %p", owner, active_thread(), &this );
+
+	size_t ret = recursion_count;
+
+	// pop_and_set_new_owner( this );
+
+	thread$ * t = &try_pop_front( blocked_threads );
+	owner = t;
+	recursion_count = ( t ? 1 : 0 );
+	unpark( t );
+
+	unlock( lock );
+	return ret;
+}
+
+static inline void on_wakeup(simple_owner_lock & this, size_t recursion ) with(this) { recursion_count = recursion; }
+
+//-----------------------------------------------------------------------------
+// Spin Queue Lock
+
+// - No reacquire for cond var
+// - No recursive acquisition
+// - No ownership
+// - spin lock with no locking/atomics in unlock
+struct spin_queue_lock {
+	// Spin lock used for mutual exclusion
+	mcs_spin_lock lock;
+
+	// flag showing if lock is held
+	volatile bool held;
+
+	#ifdef __CFA_DEBUG__
+	// for deadlock detection
+	struct thread$ * owner;
+	#endif
+};
+
+static inline void  ?{}( spin_queue_lock & this ) with(this) {
+	lock{};
+	held = false;
+}
+static inline void ^?{}( spin_queue_lock & this ) {}
+static inline void ?{}( spin_queue_lock & this, spin_queue_lock this2 ) = void;
+static inline void ?=?( spin_queue_lock & this, spin_queue_lock this2 ) = void;
+
+// if this is called recursively IT WILL DEADLOCK!!!!!
+static inline void lock(spin_queue_lock & this) with(this) {
+	mcs_spin_node node;
+	#ifdef __CFA_DEBUG__
+	assert(!(held && owner == active_thread()));
+	#endif
+	lock( lock, node );
+	while(__atomic_load_n(&held, __ATOMIC_SEQ_CST)) Pause();
+	__atomic_store_n(&held, true, __ATOMIC_SEQ_CST);
+	unlock( lock, node );
+	#ifdef __CFA_DEBUG__
+	owner = active_thread();
+	#endif
+}
+
+static inline void unlock(spin_queue_lock & this) with(this) {
+	#ifdef __CFA_DEBUG__
+	owner = 0p;
+	#endif
+	__atomic_store_n(&held, false, __ATOMIC_RELEASE);
+}
+
+static inline void on_notify(spin_queue_lock & this, struct thread$ * t ) { unpark(t); }
+static inline size_t on_wait(spin_queue_lock & this) { unlock(this); return 0; }
+static inline void on_wakeup(spin_queue_lock & this, size_t recursion ) { }
+
+
+//-----------------------------------------------------------------------------
+// MCS Block Spin Lock
+
+// - No reacquire for cond var
+// - No recursive acquisition
+// - No ownership
+// - Blocks but first node spins (like spin queue but blocking for not first thd)
+struct mcs_block_spin_lock {
+	// Spin lock used for mutual exclusion
+	mcs_lock lock;
+
+	// flag showing if lock is held
+	volatile bool held;
+
+	#ifdef __CFA_DEBUG__
+	// for deadlock detection
+	struct thread$ * owner;
+	#endif
+};
+
+static inline void  ?{}( mcs_block_spin_lock & this ) with(this) {
+	lock{};
+	held = false;
+}
+static inline void ^?{}( mcs_block_spin_lock & this ) {}
+static inline void ?{}( mcs_block_spin_lock & this, mcs_block_spin_lock this2 ) = void;
+static inline void ?=?( mcs_block_spin_lock & this, mcs_block_spin_lock this2 ) = void;
+
+// if this is called recursively IT WILL DEADLOCK!!!!!
+static inline void lock(mcs_block_spin_lock & this) with(this) {
+	mcs_node node;
+	#ifdef __CFA_DEBUG__
+	assert(!(held && owner == active_thread()));
+	#endif
+	lock( lock, node );
+	while(held) Pause();
+	held = true;
+	unlock( lock, node );
+	#ifdef __CFA_DEBUG__
+	owner = active_thread();
+	#endif
+}
+
+static inline void unlock(mcs_block_spin_lock & this) with(this) {
+	#ifdef __CFA_DEBUG__
+	owner = 0p;
+	#endif
+	held = false;
+}
+
+static inline void on_notify(mcs_block_spin_lock & this, struct thread$ * t ) { unpark(t); }
+static inline size_t on_wait(mcs_block_spin_lock & this) { unlock(this); return 0; }
+static inline void on_wakeup(mcs_block_spin_lock & this, size_t recursion ) { }
+
+//-----------------------------------------------------------------------------
+// Block Spin Lock
+
+// - No reacquire for cond var
+// - No recursive acquisition
+// - No ownership
+// - Blocks but first node spins (like spin queue but blocking for not first thd)
+struct block_spin_lock {
+	// Spin lock used for mutual exclusion
+	fast_block_lock lock;
+
+	// flag showing if lock is held
+	volatile bool held;
+
+	#ifdef __CFA_DEBUG__
+	// for deadlock detection
+	struct thread$ * owner;
+	#endif
+};
+
+static inline void  ?{}( block_spin_lock & this ) with(this) {
+	lock{};
+	held = false;
+}
+static inline void ^?{}( block_spin_lock & this ) {}
+static inline void ?{}( block_spin_lock & this, block_spin_lock this2 ) = void;
+static inline void ?=?( block_spin_lock & this, block_spin_lock this2 ) = void;
+
+// if this is called recursively IT WILL DEADLOCK!!!!!
+static inline void lock(block_spin_lock & this) with(this) {
+	#ifdef __CFA_DEBUG__
+	assert(!(held && owner == active_thread()));
+	#endif
+	lock( lock );
+	while(held) Pause();
+	held = true;
+	unlock( lock );
+	#ifdef __CFA_DEBUG__
+	owner = active_thread();
+	#endif
+}
+
+static inline void unlock(block_spin_lock & this) with(this) {
+	#ifdef __CFA_DEBUG__
+	owner = 0p;
+	#endif
+	held = false;
+}
+
+static inline void on_notify(block_spin_lock & this, struct thread$ * t ) { unpark(t); }
+static inline size_t on_wait(block_spin_lock & this) { unlock(this); return 0; }
+static inline void on_wakeup(block_spin_lock & this, size_t recursion ) { }
 
 //-----------------------------------------------------------------------------
@@ -332,8 +683,8 @@
 	// - signalling without holding branded lock is UNSAFE!
 	// - only allows usage of one lock, cond var is branded after usage
+
 	struct fast_cond_var {
 		// List of blocked threads
 		dlist( info_thread(L) ) blocked_threads;
-
 		#ifdef __CFA_DEBUG__
 		L * lock_used;
@@ -341,5 +692,4 @@
 	};
 
-
 	void  ?{}( fast_cond_var(L) & this );
 	void ^?{}( fast_cond_var(L) & this );
@@ -349,8 +699,33 @@
 
 	uintptr_t front( fast_cond_var(L) & this );
-
 	bool empty  ( fast_cond_var(L) & this );
 
 	void wait( fast_cond_var(L) & this, L & l );
 	void wait( fast_cond_var(L) & this, L & l, uintptr_t info );
-}
+
+
+	//-----------------------------------------------------------------------------
+	// pthread_cond_var
+	//
+	// - cond var with minimal footprint
+	// - supports operations needed for phthread cond
+
+	struct pthread_cond_var {
+		dlist( info_thread(L) ) blocked_threads;
+		__spinlock_t lock;
+	};
+
+	void  ?{}( pthread_cond_var(L) & this );
+	void ^?{}( pthread_cond_var(L) & this );
+
+	bool notify_one( pthread_cond_var(L) & this );
+	bool notify_all( pthread_cond_var(L) & this );
+
+	uintptr_t front( pthread_cond_var(L) & this );
+	bool empty ( pthread_cond_var(L) & this );
+
+	void wait( pthread_cond_var(L) & this, L & l );
+	void wait( pthread_cond_var(L) & this, L & l, uintptr_t info );
+	bool wait( pthread_cond_var(L) & this, L & l, timespec t );
+	bool wait( pthread_cond_var(L) & this, L & l, uintptr_t info, timespec t );
+}
Index: libcfa/src/concurrency/thread.cfa
===================================================================
--- libcfa/src/concurrency/thread.cfa	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ libcfa/src/concurrency/thread.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -53,11 +53,11 @@
 	#endif
 
-	seqable.next = 0p;
-	seqable.back = 0p;
-
 	node.next = 0p;
 	node.prev = 0p;
+
+	clh_node = malloc( );
+	*clh_node = false;
+
 	doregister(curr_cluster, this);
-
 	monitors{ &self_mon_p, 1, (fptr_t)0 };
 }
@@ -67,4 +67,5 @@
 		canary = 0xDEADDEADDEADDEADp;
 	#endif
+	free(clh_node);
 	unregister(curr_cluster, this);
 	^self_cor{};
Index: libcfa/src/containers/queueLockFree.hfa
===================================================================
--- libcfa/src/containers/queueLockFree.hfa	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ libcfa/src/containers/queueLockFree.hfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -2,4 +2,6 @@
 
 #include <assert.h>
+
+#include <bits/defs.hfa>
 
 forall( T &) {
Index: libcfa/src/startup.cfa
===================================================================
--- libcfa/src/startup.cfa	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ libcfa/src/startup.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -63,7 +63,4 @@
 
 struct __spinlock_t;
-extern "C" {
-	void __cfaabi_dbg_record_lock(struct __spinlock_t & this, const char prev_name[]) __attribute__(( weak )) libcfa_public {}
-}
 
 // Local Variables: //
Index: src/AST/Pass.impl.hpp
===================================================================
--- src/AST/Pass.impl.hpp	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ src/AST/Pass.impl.hpp	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -182,8 +182,8 @@
 
 		// get the stmts/decls that will need to be spliced in
-		auto stmts_before = __pass::stmtsToAddBefore( core, 0);
-		auto stmts_after  = __pass::stmtsToAddAfter ( core, 0);
-		auto decls_before = __pass::declsToAddBefore( core, 0);
-		auto decls_after  = __pass::declsToAddAfter ( core, 0);
+		auto stmts_before = __pass::stmtsToAddBefore( core, 0 );
+		auto stmts_after  = __pass::stmtsToAddAfter ( core, 0 );
+		auto decls_before = __pass::declsToAddBefore( core, 0 );
+		auto decls_after  = __pass::declsToAddAfter ( core, 0 );
 
 		// These may be modified by subnode but most be restored once we exit this statemnet.
@@ -287,8 +287,8 @@
 
 		// get the stmts/decls that will need to be spliced in
-		auto stmts_before = __pass::stmtsToAddBefore( core, 0);
-		auto stmts_after  = __pass::stmtsToAddAfter ( core, 0);
-		auto decls_before = __pass::declsToAddBefore( core, 0);
-		auto decls_after  = __pass::declsToAddAfter ( core, 0);
+		auto stmts_before = __pass::stmtsToAddBefore( core, 0 );
+		auto stmts_after  = __pass::stmtsToAddAfter ( core, 0 );
+		auto decls_before = __pass::declsToAddBefore( core, 0 );
+		auto decls_after  = __pass::declsToAddAfter ( core, 0 );
 
 		// These may be modified by subnode but most be restored once we exit this statemnet.
@@ -317,6 +317,4 @@
 				assert(( empty( stmts_before ) && empty( stmts_after ))
 				    || ( empty( decls_before ) && empty( decls_after )) );
-
-
 
 				// Take all the statements which should have gone after, N/A for first iteration
@@ -2116,6 +2114,6 @@
 	if ( __visit_children() ) {
 		bool mutated = false;
-		std::unordered_map< ast::TypeInstType::TypeEnvKey, ast::ptr< ast::Type > > new_map;
-		for ( const auto & p : node->typeEnv ) {
+		ast::TypeSubstitution::TypeMap new_map;
+		for ( const auto & p : node->typeMap ) {
 			guard_symtab guard { *this };
 			auto new_node = p.second->accept( *this );
@@ -2125,5 +2123,5 @@
 		if (mutated) {
 			auto new_node = __pass::mutate<core_t>( node );
-			new_node->typeEnv.swap( new_map );
+			new_node->typeMap.swap( new_map );
 			node = new_node;
 		}
Index: src/AST/TypeSubstitution.cpp
===================================================================
--- src/AST/TypeSubstitution.cpp	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ src/AST/TypeSubstitution.cpp	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -38,43 +38,44 @@
 
 void TypeSubstitution::initialize( const TypeSubstitution &src, TypeSubstitution &dest ) {
-	dest.typeEnv.clear();
+	dest.typeMap.clear();
 	dest.add( src );
 }
 
 void TypeSubstitution::add( const TypeSubstitution &other ) {
-	for ( TypeEnvType::const_iterator i = other.typeEnv.begin(); i != other.typeEnv.end(); ++i ) {
-		typeEnv[ i->first ] = i->second;
+	for ( TypeMap::const_iterator i = other.typeMap.begin(); i != other.typeMap.end(); ++i ) {
+		typeMap[ i->first ] = i->second;
 	} // for
 }
 
 void TypeSubstitution::add( const TypeInstType * formalType, const Type *actualType ) {
-	typeEnv[ *formalType ] = actualType;
+	typeMap[ *formalType ] = actualType;
 }
 
 void TypeSubstitution::add( const TypeInstType::TypeEnvKey & key, const Type * actualType) {
-	typeEnv[ key ] = actualType;
+	typeMap[ key ] = actualType;
 }
 
 void TypeSubstitution::remove( const TypeInstType * formalType ) {
-	TypeEnvType::iterator i = typeEnv.find( *formalType );
-	if ( i != typeEnv.end() ) {
-		typeEnv.erase( *formalType );
-	} // if
-}
-
-const Type *TypeSubstitution::lookup( const TypeInstType * formalType ) const {
-	TypeEnvType::const_iterator i = typeEnv.find( *formalType );
+	TypeMap::iterator i = typeMap.find( *formalType );
+	if ( i != typeMap.end() ) {
+		typeMap.erase( *formalType );
+	} // if
+}
+
+const Type *TypeSubstitution::lookup(
+		const TypeInstType::TypeEnvKey & formalType ) const {
+	TypeMap::const_iterator i = typeMap.find( formalType );
 
 	// break on not in substitution set
-	if ( i == typeEnv.end() ) return 0;
+	if ( i == typeMap.end() ) return 0;
 
 	// attempt to transitively follow TypeInstType links.
 	while ( const TypeInstType *actualType = i->second.as<TypeInstType>()) {
 		// break cycles in the transitive follow
-		if ( *formalType == *actualType ) break;
+		if ( formalType == *actualType ) break;
 
 		// Look for the type this maps to, returning previous mapping if none-such
-		i = typeEnv.find( *actualType );
-		if ( i == typeEnv.end() ) return actualType;
+		i = typeMap.find( *actualType );
+		if ( i == typeMap.end() ) return actualType;
 	}
 
@@ -83,6 +84,10 @@
 }
 
+const Type *TypeSubstitution::lookup( const TypeInstType * formalType ) const {
+	return lookup( ast::TypeInstType::TypeEnvKey( *formalType ) );
+}
+
 bool TypeSubstitution::empty() const {
-	return typeEnv.empty();
+	return typeMap.empty();
 }
 
@@ -119,5 +124,5 @@
 		sub.core.subCount = 0;
 		sub.core.freeOnly = true;
-		for ( TypeEnvType::iterator i = typeEnv.begin(); i != typeEnv.end(); ++i ) {
+		for ( TypeMap::iterator i = typeMap.begin(); i != typeMap.end(); ++i ) {
 			i->second = i->second->accept( sub );
 		}
@@ -129,6 +134,6 @@
 	if ( bound != boundVars.end() ) return inst;
 
-	TypeEnvType::const_iterator i = sub.typeEnv.find( *inst );
-	if ( i == sub.typeEnv.end() ) {
+	TypeMap::const_iterator i = sub.typeMap.find( *inst );
+	if ( i == sub.typeMap.end() ) {
 		return inst;
 	} else {
Index: src/AST/TypeSubstitution.hpp
===================================================================
--- src/AST/TypeSubstitution.hpp	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ src/AST/TypeSubstitution.hpp	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -75,4 +75,5 @@
 	void add( const TypeSubstitution &other );
 	void remove( const TypeInstType * formalType );
+	const Type *lookup( const TypeInstType::TypeEnvKey & formalType ) const;
 	const Type *lookup( const TypeInstType * formalType ) const;
 	bool empty() const;
@@ -104,13 +105,13 @@
 	friend class Pass;
 
-	typedef std::unordered_map< TypeInstType::TypeEnvKey, ptr<Type> > TypeEnvType;
-	TypeEnvType typeEnv;
+	typedef std::unordered_map< TypeInstType::TypeEnvKey, ptr<Type> > TypeMap;
+	TypeMap typeMap;
 
   public:
-	// has to come after declaration of typeEnv
-	auto begin()       -> decltype( typeEnv.begin() ) { return typeEnv.begin(); }
-	auto   end()       -> decltype( typeEnv.  end() ) { return typeEnv.  end(); }
-	auto begin() const -> decltype( typeEnv.begin() ) { return typeEnv.begin(); }
-	auto   end() const -> decltype( typeEnv.  end() ) { return typeEnv.  end(); }
+	// has to come after declaration of typeMap
+	auto begin()       -> decltype( typeMap.begin() ) { return typeMap.begin(); }
+	auto   end()       -> decltype( typeMap.  end() ) { return typeMap.  end(); }
+	auto begin() const -> decltype( typeMap.begin() ) { return typeMap.begin(); }
+	auto   end() const -> decltype( typeMap.  end() ) { return typeMap.  end(); }
 
 };
@@ -144,5 +145,5 @@
 			if ( const TypeExpr *actual = actualIt->template as<TypeExpr>() ) {
 				if ( formal->name != "" ) {
-					typeEnv[ formal ] = actual->type;
+					typeMap[ formal ] = actual->type;
 				} // if
 			} else {
Index: src/Concurrency/Waitfor.cc
===================================================================
--- src/Concurrency/Waitfor.cc	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ src/Concurrency/Waitfor.cc	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -56,5 +56,5 @@
                       |  |
                       |  |
-			    |  |
+                      |  |
                       |  |
                       |  |
Index: src/Concurrency/Waitfor.h
===================================================================
--- src/Concurrency/Waitfor.h	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ src/Concurrency/Waitfor.h	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -19,7 +19,12 @@
 
 class Declaration;
+namespace ast {
+	class TranslationUnit;
+}
 
 namespace Concurrency {
 	void generateWaitFor( std::list< Declaration * > & translationUnit );
+
+void generateWaitFor( ast::TranslationUnit & translationUnit );
 };
 
Index: src/Concurrency/WaitforNew.cpp
===================================================================
--- src/Concurrency/WaitforNew.cpp	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
+++ src/Concurrency/WaitforNew.cpp	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -0,0 +1,580 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// WaitforNew.cpp -- Expand waitfor clauses into code.
+//
+// Author           : Andrew Beach
+// Created On       : Fri May 27 10:31:00 2022
+// Last Modified By : Andrew Beach
+// Last Modified On : Tue Jun 13 13:30:00 2022
+// Update Count     : 0
+//
+
+#include "Waitfor.h"
+
+#include <string>
+
+#include "AST/Pass.hpp"
+#include "Common/UniqueName.h"
+#include "InitTweak/InitTweak.h"
+#include "ResolvExpr/Resolver.h"
+
+#include "AST/Print.hpp"
+
+using namespace std::string_literals;
+using ResolvExpr::ResolveContext;
+
+/* So this is what this file dones:
+
+void f(int i, float f, A & mutex b, struct foo *  );
+void f(int );
+
+...{
+	when ( a < 1 ) waitfor( f : a ) { fee(); }
+	or timeout( getWaitTime() ) { fy(); }
+	or waitfor( g : a ) { foe(); }
+	or waitfor( ^?{} : a ) { break; }
+	or waitfor( ^?{} ) { break; }
+	or when ( a < 1 ) else { fum(); }
+}...
+
+		 ||
+		 ||
+		\||/
+		 \/
+
+...{
+	{
+		__acceptable_t __acceptables_#[4 <num-clauses>];
+		bool __do_run_# = false;
+
+		monitor$ * __monitors_#[1 <num-monitors>] = { a };
+		if ( a < 1) {
+			void (*__function_#)() = <casts> f;
+			__acceptables_#[0].is_dtor = false;
+			__acceptables_#[0].func = __function_#;
+			__acceptables_#[0].data = __monitors_#;
+			__acceptables_#[0].size = 1;
+			__do_run_# = true;
+		}
+
+		// Remaining waitfor clauses go here.
+
+		long long unsigned int __timeout_# = -1;
+		if ( true ) {
+			__timeout_# = getWaitTime();
+			__do_run_# = true;
+		}
+
+		if ( a < 1 ) {
+			__timeout_# = 0
+			__do_run_# = true;
+		}
+
+		short int __index_# = -1;
+		__waitfor_mask_t __mask_# = {&__index_#, {__acceptables_#, ?}};
+		__waitfor_internal((__waitfor_mask_t&)__mask_#, __timeout_#);
+
+		switch (__index_#) {
+		case 0:
+			{ { fee(); } break; }
+		case 1:
+			{ { foe(); } break; }
+		case 2:
+			{ <modified-break> break; }
+		case 3:
+			{ <modified-break> break; }
+		case -2:
+			{ { fy(); } break; }
+		case -1:
+			{ { foe(); } break; }
+		}
+	}
+}...
+*/
+
+namespace Concurrency {
+
+namespace {
+
+class GenerateWaitForCore :
+		public ast::WithSymbolTable, public ast::WithConstTranslationUnit {
+	const ast::FunctionDecl * decl_waitfor    = nullptr;
+	const ast::StructDecl   * decl_mask       = nullptr;
+	const ast::StructDecl   * decl_acceptable = nullptr;
+	const ast::StructDecl   * decl_monitor    = nullptr;
+
+	UniqueName namer_acc = "__acceptables_"s;
+	UniqueName namer_idx = "__index_"s;
+	UniqueName namer_flg = "__do_run_"s;
+	UniqueName namer_msk = "__mask_"s;
+	UniqueName namer_mon = "__monitors_"s;
+	UniqueName namer_tim = "__timeout_"s;
+	UniqueName namer_fun = "__function_"s;
+
+	ast::ObjectDecl * declareAcceptables( ast::CompoundStmt * out,
+		const CodeLocation & location, unsigned long numClauses );
+	ast::ObjectDecl * declareFlag(
+		ast::CompoundStmt * out, const CodeLocation & location );
+	ast::ExprStmt * makeSetter(
+		const CodeLocation & location, ast::ObjectDecl * flag );
+	ast::ObjectDecl * declMonitors(
+		ast::CompoundStmt * out, const ast::WaitForClause * clause );
+	void init_clause( ast::CompoundStmt * out, ast::ObjectDecl * acceptables,
+		int index, const ast::WaitForClause * clause, ast::Stmt * setter );
+	ast::Expr * init_timeout(
+		ast::CompoundStmt * out, const CodeLocation & topLocation,
+		const ast::Expr * timeout_time, const ast::Expr * timeout_cond,
+		const ast::Stmt * else_stmt, const ast::Expr * else_cond,
+		const ast::Stmt * setter );
+	ast::Expr * call(
+		ast::CompoundStmt * out, const CodeLocation & location,
+		size_t numClauses, ast::ObjectDecl * acceptables,
+		ast::Expr * timeout );
+public:
+	void previsit( const ast::FunctionDecl * decl );
+	void previsit( const ast::StructDecl * decl );
+	ast::Stmt * postvisit( const ast::WaitForStmt * stmt );
+};
+
+ast::Expr * makeOpIndex( const CodeLocation & location,
+		const ast::DeclWithType * array, unsigned long index ) {
+	return new ast::UntypedExpr( location,
+		new ast::NameExpr( location, "?[?]" ),
+		{
+			new ast::VariableExpr( location, array ),
+			ast::ConstantExpr::from_ulong( location, index ),
+		}
+	);
+}
+
+ast::Expr * makeOpAssign( const CodeLocation & location,
+		const ast::Expr * lhs, const ast::Expr * rhs ) {
+	return new ast::UntypedExpr( location,
+		new ast::NameExpr( location, "?=?" ),
+		{ lhs, rhs }
+	);
+}
+
+ast::Expr * makeOpMember( const CodeLocation & location,
+		const std::string & mem, const ast::Expr * sue ) {
+	return new ast::UntypedMemberExpr( location,
+		new ast::NameExpr( location, mem ),
+		sue
+	);
+}
+
+ast::Stmt * makeAccStmt(
+		const CodeLocation & location, ast::DeclWithType * object,
+		unsigned long index, const std::string & member,
+		const ast::Expr * value, const ResolveContext & context
+) {
+	ast::Expr * expr = makeOpAssign( location,
+		makeOpMember( location,
+			member,
+			makeOpIndex( location,
+				object,
+				index
+			)
+		),
+		value
+	);
+
+	auto result = ResolvExpr::findVoidExpression( expr, context );
+	return new ast::ExprStmt( location, result.get() );
+}
+
+const ast::Stmt * maybeCond( const CodeLocation & location,
+		const ast::Expr * cond, std::list<ast::ptr<ast::Stmt>> && stmts ) {
+	ast::Stmt * block = new ast::CompoundStmt( location, std::move( stmts ) );
+	return (cond) ? new ast::IfStmt( location, cond, block ) : block;
+}
+
+const ast::VariableExpr * extractVariable( const ast::Expr * func ) {
+	if ( auto var = dynamic_cast<const ast::VariableExpr *>( func ) ) {
+		return var;
+	}
+	auto cast = strict_dynamic_cast<const ast::CastExpr *>( func );
+	return cast->arg.strict_as<ast::VariableExpr>();
+}
+
+const ast::Expr * detectIsDtor(
+		const CodeLocation & location, const ast::Expr * func ) {
+	const ast::VariableExpr * typed_func = extractVariable( func );
+	bool is_dtor = InitTweak::isDestructor(
+		typed_func->var.strict_as<ast::FunctionDecl>() );
+	return ast::ConstantExpr::from_bool( location, is_dtor );
+}
+
+ast::ObjectDecl * GenerateWaitForCore::declareAcceptables(
+		ast::CompoundStmt * out,
+		const CodeLocation & location, unsigned long numClauses ) {
+	ast::ObjectDecl * acceptables = new ast::ObjectDecl( location,
+		namer_acc.newName(),
+		new ast::ArrayType(
+			new ast::StructInstType( decl_acceptable ),
+			ast::ConstantExpr::from_ulong( location, numClauses ),
+			ast::FixedLen,
+			ast::DynamicDim
+		)
+	);
+	out->push_back( new ast::DeclStmt( location, acceptables ) );
+
+	ast::Expr * set = new ast::UntypedExpr( location,
+		new ast::NameExpr( location, "__builtin_memset" ),
+		{
+			new ast::VariableExpr( location, acceptables ),
+			ast::ConstantExpr::from_int( location, 0 ),
+			new ast::SizeofExpr( location,
+				new ast::VariableExpr( location, acceptables ) ),
+		}
+	);
+	ResolveContext context{ symtab, transUnit().global };
+	auto result = ResolvExpr::findVoidExpression( set, context );
+	out->push_back( new ast::ExprStmt( location, result.get() ) );
+
+	return acceptables;
+}
+
+ast::ObjectDecl * GenerateWaitForCore::declareFlag(
+		ast::CompoundStmt * out, const CodeLocation & location ) {
+	ast::ObjectDecl * flag = new ast::ObjectDecl( location,
+		namer_flg.newName(),
+		new ast::BasicType( ast::BasicType::Bool ),
+		new ast::SingleInit( location,
+			ast::ConstantExpr::from_ulong( location, 0 )
+		)
+	);
+	out->push_back( new ast::DeclStmt( location, flag ) );
+	return flag;
+}
+
+ast::ExprStmt * GenerateWaitForCore::makeSetter(
+		const CodeLocation & location, ast::ObjectDecl * flag ) {
+	ast::Expr * expr = new ast::UntypedExpr( location,
+		new ast::NameExpr( location, "?=?" ),
+		{
+			new ast::VariableExpr( location, flag ),
+			ast::ConstantExpr::from_ulong( location, 1 ),
+		}
+	);
+	ResolveContext context{ symtab, transUnit().global };
+	auto result = ResolvExpr::findVoidExpression( expr, context );
+	return new ast::ExprStmt( location, result.get() );
+}
+
+ast::ObjectDecl * GenerateWaitForCore::declMonitors(
+		ast::CompoundStmt * out,
+		const ast::WaitForClause * clause ) {
+	const CodeLocation & location = clause->location;
+	ast::ObjectDecl * monitor = new ast::ObjectDecl( location,
+		namer_mon.newName(),
+		new ast::ArrayType(
+			new ast::PointerType(
+				new ast::StructInstType( decl_monitor )
+			),
+			ast::ConstantExpr::from_ulong( location, clause->target_args.size() ),
+			ast::FixedLen,
+			ast::DynamicDim
+		),
+		new ast::ListInit( location,
+			map_range<std::vector<ast::ptr<ast::Init>>>(
+				clause->target_args,
+				[]( const ast::Expr * expr ){
+					return new ast::SingleInit( expr->location, expr ); }
+			)
+		)
+	);
+	out->push_back( new ast::DeclStmt( location, monitor ) );
+	return monitor;
+}
+
+void GenerateWaitForCore::init_clause(
+		ast::CompoundStmt * out,
+		ast::ObjectDecl * acceptables,
+		int index,
+		const ast::WaitForClause * clause,
+		ast::Stmt * setter ) {
+	const CodeLocation & location = clause->location;
+	const ast::ObjectDecl * monitors = declMonitors( out, clause );
+	ast::Type * fptr_t = new ast::PointerType(
+			new ast::FunctionType( ast::VariableArgs ) );
+
+	const ast::VariableExpr * variableExpr =
+		clause->target_func.as<ast::VariableExpr>();
+	ast::Expr * castExpr = new ast::CastExpr(
+		location,
+		new ast::CastExpr(
+			location,
+			clause->target_func,
+			ast::deepCopy( variableExpr->result.get() ),
+			ast::GeneratedCast ),
+		fptr_t,
+		ast::GeneratedCast );
+
+	ast::ObjectDecl * funcDecl = new ast::ObjectDecl( location,
+		namer_fun.newName(),
+		ast::deepCopy( fptr_t ),
+		new ast::SingleInit( location, castExpr )
+		);
+	ast::Expr * funcExpr = new ast::VariableExpr( location, funcDecl );
+	out->push_back( new ast::DeclStmt( location, funcDecl ) );
+
+	ResolveContext context{ symtab, transUnit().global };
+	out->push_back( maybeCond( location, clause->cond.get(), {
+		makeAccStmt( location, acceptables, index, "is_dtor",
+			detectIsDtor( location, clause->target_func ), context ),
+		makeAccStmt( location, acceptables, index, "func",
+			funcExpr, context ),
+		makeAccStmt( location, acceptables, index, "data",
+			new ast::VariableExpr( location, monitors ), context ),
+		makeAccStmt( location, acceptables, index, "size",
+			ast::ConstantExpr::from_ulong( location,
+				clause->target_args.size() ), context ),
+		ast::deepCopy( setter ),
+	} ) );
+}
+
+ast::Expr * GenerateWaitForCore::init_timeout(
+		ast::CompoundStmt * out,
+		const CodeLocation & topLocation,
+		const ast::Expr * timeout_time,
+		const ast::Expr * timeout_cond,
+		const ast::Stmt * else_stmt,
+		const ast::Expr * else_cond,
+		const ast::Stmt * setter ) {
+	ast::ObjectDecl * timeout = new ast::ObjectDecl( topLocation,
+		namer_tim.newName(),
+		new ast::BasicType( ast::BasicType::LongLongUnsignedInt ),
+		new ast::SingleInit( topLocation,
+			ast::ConstantExpr::from_int( topLocation, -1 )
+		)
+	);
+	out->push_back( new ast::DeclStmt( topLocation, timeout ) );
+
+	if ( timeout_time ) {
+		const CodeLocation & location = timeout_time->location;
+		out->push_back( maybeCond( location, timeout_cond, {
+			new ast::ExprStmt( location,
+				makeOpAssign(
+					location,
+					new ast::VariableExpr( location, timeout ),
+					timeout_time
+				)
+			),
+			ast::deepCopy( setter ),
+		} ) );
+	}
+
+	// We only care about the else_stmt's presence and location.
+	if ( else_stmt ) {
+		const CodeLocation & location = else_stmt->location;
+		out->push_back( maybeCond( location, else_cond, {
+			new ast::ExprStmt( location,
+				makeOpAssign(
+					location,
+					new ast::VariableExpr( location, timeout ),
+					ast::ConstantExpr::from_ulong( location, 0 )
+				)
+			),
+			ast::deepCopy( setter ),
+		} ) );
+	}
+
+	return new ast::VariableExpr( topLocation, timeout );
+}
+
+ast::Expr * GenerateWaitForCore::call(
+	ast::CompoundStmt * out,
+	const CodeLocation & location,
+	size_t numClauses,
+	ast::ObjectDecl * acceptables,
+	ast::Expr * timeout
+) {
+	ast::ObjectDecl * index = new ast::ObjectDecl( location,
+		namer_idx.newName(),
+		new ast::BasicType( ast::BasicType::ShortSignedInt ),
+		new ast::SingleInit( location,
+			ast::ConstantExpr::from_int( location, -1 )
+		)
+	);
+	out->push_back( new ast::DeclStmt( location, index ) );
+
+	ast::ObjectDecl * mask = new ast::ObjectDecl( location,
+		namer_msk.newName(),
+		new ast::StructInstType( decl_mask ),
+		new ast::ListInit( location, {
+			new ast::SingleInit( location,
+				new ast::AddressExpr( location,
+					new ast::VariableExpr( location, index )
+				)
+			),
+			new ast::ListInit( location, {
+				new ast::SingleInit( location,
+					new ast::VariableExpr( location, acceptables )
+				),
+				new ast::SingleInit( location,
+					ast::ConstantExpr::from_ulong( location, numClauses )
+				),
+			}),
+		})
+	);
+	out->push_back( new ast::DeclStmt( location, mask ) );
+
+	ast::ApplicationExpr * waitforMask = new ast::ApplicationExpr( location,
+		ast::VariableExpr::functionPointer( location, decl_waitfor ),
+		{
+			new ast::CastExpr(
+				new ast::VariableExpr( location, mask ),
+				new ast::ReferenceType(
+					new ast::StructInstType( decl_mask )
+				)
+			),
+			timeout
+		}
+	);
+	out->push_back( new ast::ExprStmt( location, waitforMask ) );
+
+	return new ast::VariableExpr( location, index );
+}
+
+ast::Stmt * choose( const ast::WaitForStmt * waitfor, ast::Expr * result ) {
+	const CodeLocation & location = waitfor->location;
+
+	ast::SwitchStmt * theSwitch = new ast::SwitchStmt( location,
+		result,
+		std::vector<ast::ptr<ast::CaseClause>>()
+	);
+
+	// For some reason, enumerate doesn't work here because of references.
+	for ( size_t i = 0 ; i < waitfor->clauses.size() ; ++i ) {
+		theSwitch->cases.push_back(
+			new ast::CaseClause( location,
+				ast::ConstantExpr::from_ulong( location, i ),
+				{
+					new ast::CompoundStmt( location, {
+						waitfor->clauses[i]->stmt,
+						new ast::BranchStmt( location,
+							ast::BranchStmt::Break,
+							ast::Label( location )
+						)
+					})
+				}
+			)
+		);
+	}
+
+	if ( waitfor->timeout_stmt ) {
+		theSwitch->cases.push_back(
+			new ast::CaseClause( location,
+				ast::ConstantExpr::from_int( location, -2 ),
+				{
+					new ast::CompoundStmt( location, {
+						waitfor->timeout_stmt,
+						new ast::BranchStmt( location,
+							ast::BranchStmt::Break,
+							ast::Label( location )
+						)
+					})
+				}
+			)
+		);
+	}
+
+	if ( waitfor->else_stmt ) {
+		theSwitch->cases.push_back(
+			new ast::CaseClause( location,
+				ast::ConstantExpr::from_int( location, -1 ),
+				{
+					new ast::CompoundStmt( location, {
+						waitfor->else_stmt,
+						new ast::BranchStmt( location,
+							ast::BranchStmt::Break,
+							ast::Label( location )
+						)
+					})
+				}
+			)
+		);
+	}
+
+	return theSwitch;
+}
+
+void GenerateWaitForCore::previsit( const ast::FunctionDecl * decl ) {
+	if ( "__waitfor_internal" == decl->name ) {
+		decl_waitfor = decl;
+	}
+}
+
+void GenerateWaitForCore::previsit( const ast::StructDecl * decl ) {
+	if ( !decl->body ) {
+		return;
+	} else if ( "__acceptable_t" == decl->name ) {
+		assert( !decl_acceptable );
+		decl_acceptable = decl;
+	} else if ( "__waitfor_mask_t" == decl->name ) {
+		assert( !decl_mask );
+		decl_mask = decl;
+	} else if ( "monitor$" == decl->name ) {
+		assert( !decl_monitor );
+		decl_monitor = decl;
+	}
+}
+
+ast::Stmt * GenerateWaitForCore::postvisit( const ast::WaitForStmt * stmt ) {
+	if ( !decl_monitor || !decl_acceptable || !decl_mask ) {
+		SemanticError( stmt, "waitfor keyword requires monitors to be in scope, add #include <monitor.hfa>" );
+	}
+
+	const CodeLocation & location = stmt->location;
+	ast::CompoundStmt * comp = new ast::CompoundStmt( location );
+
+	ast::ObjectDecl * acceptables = declareAcceptables( comp, location, stmt->clauses.size() );
+	ast::ObjectDecl * flag        = declareFlag( comp, location );
+	ast::Stmt       * setter      = makeSetter( location, flag );
+
+	// For some reason, enumerate doesn't work here because of references.
+	for ( size_t i = 0 ; i < stmt->clauses.size() ; ++i ) {
+		init_clause( comp, acceptables, i, stmt->clauses[i], setter );
+	}
+
+	ast::Expr * timeout = init_timeout(
+		comp,
+		location,
+		stmt->timeout_time,
+		stmt->timeout_cond,
+		stmt->else_stmt,
+		stmt->else_cond,
+		setter
+	);
+
+	ast::CompoundStmt * compound = new ast::CompoundStmt( location );
+	comp->push_back( new ast::IfStmt( location,
+		new ast::VariableExpr( location, flag ),
+		compound,
+		nullptr
+	));
+
+	ast::Expr * result = call(
+		compound, location, stmt->clauses.size(), acceptables, timeout );
+	compound->push_back( choose( stmt, result ) );
+	return comp;
+}
+
+} // namespace
+
+void generateWaitFor( ast::TranslationUnit & translationUnit ) {
+	ast::Pass<GenerateWaitForCore>::run( translationUnit );
+}
+
+} // namespace Concurrency
+
+// Local Variables: //
+// tab-width: 4 //
+// mode: c++ //
+// compile-command: "make install" //
+// End: //
Index: src/Concurrency/module.mk
===================================================================
--- src/Concurrency/module.mk	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ src/Concurrency/module.mk	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -19,4 +19,5 @@
 	Concurrency/Keywords.cc \
 	Concurrency/Keywords.h \
+	Concurrency/WaitforNew.cpp \
 	Concurrency/Waitfor.cc \
 	Concurrency/Waitfor.h
Index: src/GenPoly/Specialize.cc
===================================================================
--- src/GenPoly/Specialize.cc	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ src/GenPoly/Specialize.cc	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -247,4 +247,5 @@
 			structureArg( (*actualBegin)->get_type(), argBegin, argEnd, back_inserter( appExpr->get_args() ) );
 		}
+		assertf( argBegin == argEnd, "Did not structure all arguments." );
 
 		appExpr->env = TypeSubstitution::newFromExpr( appExpr, env );
Index: src/InitTweak/FixGlobalInit.cc
===================================================================
--- src/InitTweak/FixGlobalInit.cc	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ src/InitTweak/FixGlobalInit.cc	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -162,5 +162,5 @@
 			} // if
 			if ( Statement * ctor = ctorInit->ctor ) {
-				addDataSectonAttribute( objDecl );
+				addDataSectionAttribute( objDecl );
 				initStatements.push_back( ctor );
 				objDecl->init = nullptr;
Index: src/InitTweak/FixInit.cc
===================================================================
--- src/InitTweak/FixInit.cc	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ src/InitTweak/FixInit.cc	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -806,5 +806,5 @@
 						// The attribute works, and is meant to apply, both for leaving the static local alone,
 						// and for hoisting it out as a static global.
-						addDataSectonAttribute( objDecl );
+						addDataSectionAttribute( objDecl );
 
 						// originally wanted to take advantage of gcc nested functions, but
Index: src/InitTweak/InitTweak.cc
===================================================================
--- src/InitTweak/InitTweak.cc	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ src/InitTweak/InitTweak.cc	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -587,5 +587,5 @@
 
 	bool isConstructable( const ast::Type * type ) {
-		return ! dynamic_cast< const ast::VarArgsType * >( type ) && ! dynamic_cast< const ast::ReferenceType * >( type ) 
+		return ! dynamic_cast< const ast::VarArgsType * >( type ) && ! dynamic_cast< const ast::ReferenceType * >( type )
 		&& ! dynamic_cast< const ast::FunctionType * >( type ) && ! Tuples::isTtype( type );
 	}
@@ -1025,5 +1025,5 @@
 		if (!assign) {
 			auto td = new ast::TypeDecl({}, "T", {}, nullptr, ast::TypeDecl::Dtype, true);
-			assign = new ast::FunctionDecl({}, "?=?", {}, 
+			assign = new ast::FunctionDecl({}, "?=?", {},
 			{ new ast::ObjectDecl({}, "_dst", new ast::ReferenceType(new ast::TypeInstType("T", td))),
 			  new ast::ObjectDecl({}, "_src", new ast::TypeInstType("T", td))},
@@ -1095,7 +1095,7 @@
 
 			// address of a variable or member expression is constexpr
-			if ( ! dynamic_cast< const ast::NameExpr * >( arg ) 
-			&& ! dynamic_cast< const ast::VariableExpr * >( arg ) 
-			&& ! dynamic_cast< const ast::MemberExpr * >( arg ) 
+			if ( ! dynamic_cast< const ast::NameExpr * >( arg )
+			&& ! dynamic_cast< const ast::VariableExpr * >( arg )
+			&& ! dynamic_cast< const ast::MemberExpr * >( arg )
 			&& ! dynamic_cast< const ast::UntypedMemberExpr * >( arg ) ) result = false;
 		}
@@ -1241,24 +1241,25 @@
 	}
 
-	void addDataSectonAttribute( ObjectDecl * objDecl ) {
+	#if defined( __x86_64 ) || defined( __i386 ) // assembler comment to prevent assembler warning message
+		#define ASM_COMMENT "#"
+	#else // defined( __ARM_ARCH )
+		#define ASM_COMMENT "//"
+	#endif
+	static const char * const data_section =  ".data" ASM_COMMENT;
+	static const char * const tlsd_section = ".tdata" ASM_COMMENT;
+	void addDataSectionAttribute( ObjectDecl * objDecl ) {
+		const bool is_tls = objDecl->get_storageClasses().is_threadlocal;
+		const char * section = is_tls ? tlsd_section : data_section;
 		objDecl->attributes.push_back(new Attribute("section", {
-			new ConstantExpr( Constant::from_string(".data"
-#if defined( __x86_64 ) || defined( __i386 ) // assembler comment to prevent assembler warning message
-					"#"
-#else // defined( __ARM_ARCH )
-					"//"
-#endif
-				))}));
+			new ConstantExpr( Constant::from_string( section ) )
+		}));
 	}
 
 	void addDataSectionAttribute( ast::ObjectDecl * objDecl ) {
+		const bool is_tls = objDecl->storage.is_threadlocal;
+		const char * section = is_tls ? tlsd_section : data_section;
 		objDecl->attributes.push_back(new ast::Attribute("section", {
-			ast::ConstantExpr::from_string(objDecl->location, ".data"
-#if defined( __x86_64 ) || defined( __i386 ) // assembler comment to prevent assembler warning message
-					"#"
-#else // defined( __ARM_ARCH )
-					"//"
-#endif
-				)}));
+			ast::ConstantExpr::from_string(objDecl->location, section)
+		}));
 	}
 
Index: src/InitTweak/InitTweak.h
===================================================================
--- src/InitTweak/InitTweak.h	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ src/InitTweak/InitTweak.h	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -127,5 +127,5 @@
 	///    .section .data#,"a"
 	/// to avoid assembler warning "ignoring changed section attributes for .data"
-	void addDataSectonAttribute( ObjectDecl * objDecl );
+	void addDataSectionAttribute( ObjectDecl * objDecl );
 
 	void addDataSectionAttribute( ast::ObjectDecl * objDecl );
Index: src/main.cc
===================================================================
--- src/main.cc	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ src/main.cc	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -10,6 +10,6 @@
 // Created On       : Fri May 15 23:12:02 2015
 // Last Modified By : Andrew Beach
-// Last Modified On : Fri Apr 29  9:52:00 2022
-// Update Count     : 673
+// Last Modified On : Tue Jun  7 13:29:00 2022
+// Update Count     : 674
 //
 
@@ -447,5 +447,6 @@
 			PASS( "Expand Unique Expr", Tuples::expandUniqueExpr( transUnit ) ); // xxx - is this the right place for this? want to expand ASAP so tha, sequent passes don't need to worry about double-visiting a unique expr - needs to go after InitTweak::fix so that copy constructed return declarations are reused
 
-			PASS( "Translate Tries" , ControlStruct::translateTries( transUnit ) );
+			PASS( "Translate Tries", ControlStruct::translateTries( transUnit ) );
+			PASS( "Gen Waitfor", Concurrency::generateWaitFor( transUnit ) );
 
 			translationUnit = convert( move( transUnit ) );
@@ -517,9 +518,7 @@
 
 			PASS( "Expand Unique Expr", Tuples::expandUniqueExpr( translationUnit ) ); // xxx - is this the right place for this? want to expand ASAP so tha, sequent passes don't need to worry about double-visiting a unique expr - needs to go after InitTweak::fix so that copy constructed return declarations are reused
-
-			PASS( "Translate Tries" , ControlStruct::translateTries( translationUnit ) );
+			PASS( "Translate Tries", ControlStruct::translateTries( translationUnit ) );
+			PASS( "Gen Waitfor", Concurrency::generateWaitFor( translationUnit ) );
 		}
-
-		PASS( "Gen Waitfor" , Concurrency::generateWaitFor( translationUnit ) );
 
 		PASS( "Convert Specializations",  GenPoly::convertSpecializations( translationUnit ) ); // needs to happen before tuple types are expanded
Index: tests/concurrent/futures/.expect/wait_any.txt
===================================================================
--- tests/concurrent/futures/.expect/wait_any.txt	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
+++ tests/concurrent/futures/.expect/wait_any.txt	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -0,0 +1,2 @@
+start
+done
Index: tests/concurrent/futures/wait_any.cfa
===================================================================
--- tests/concurrent/futures/wait_any.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
+++ tests/concurrent/futures/wait_any.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -0,0 +1,77 @@
+#include <thread.hfa>
+#include <fstream.hfa>
+#include <stdlib.hfa>
+#include <mutex_stmt.hfa>
+#include <locks.hfa>
+
+simple_owner_lock l;
+pthread_cond_var( simple_owner_lock ) c;
+
+size_t num_futures = 10;
+
+future_t * futures;
+
+size_t * shuffle_arr;
+
+size_t numtimes = 10;
+
+volatile bool done = false;
+
+void synchronize() {
+    lock(l);
+    if (empty(c)) {
+        wait(c,l);
+    } else {
+        for ( i; num_futures ) {
+            reset(futures[i]);
+        }
+        notify_one(c);
+    }
+    unlock(l);
+}
+
+thread Waiter {};
+void main( Waiter & this ) {
+    for (numtimes) {
+        wait_any(futures, num_futures);
+        synchronize();
+    }
+    done = true;
+    while (done) notify_one(c);
+}
+
+thread Deliverer {};
+void main( Deliverer & this ) {
+    while (!done) {
+        size_t num_satisfy = random(1,num_futures);
+        for ( i; num_satisfy ) {								// random shuffle a few values
+			swap( shuffle_arr[random(num_futures)], shuffle_arr[random(num_futures)] );
+		}
+        for ( i; num_satisfy ) {
+            fulfil(futures[shuffle_arr[i]]);
+        }
+        synchronize();
+    }
+    done = false;
+}
+
+int main() {
+	sout | "start";
+    futures = alloc( num_futures );
+    shuffle_arr = alloc( num_futures );
+    for ( i; num_futures ) {								// random shuffle a few values
+        futures[i]{};
+        swap( shuffle_arr[random(num_futures)], shuffle_arr[random(num_futures)] );
+    }
+    for ( i; num_futures ) {
+        shuffle_arr[i] = i;
+    }
+	processor procs[1];
+	{
+		Waiter w;
+		Deliverer d;
+	}
+    free( futures );
+    free( shuffle_arr );
+	sout | "done";
+}
Index: tests/unified_locking/.expect/block_spin_lock.txt
===================================================================
--- tests/unified_locking/.expect/block_spin_lock.txt	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
+++ tests/unified_locking/.expect/block_spin_lock.txt	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -0,0 +1,3 @@
+Starting
+Done!
+Match!
Index: tests/unified_locking/.expect/clh.txt
===================================================================
--- tests/unified_locking/.expect/clh.txt	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
+++ tests/unified_locking/.expect/clh.txt	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -0,0 +1,3 @@
+Starting
+Done!
+Match!
Index: tests/unified_locking/.expect/mcs_block_spin_lock.txt
===================================================================
--- tests/unified_locking/.expect/mcs_block_spin_lock.txt	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
+++ tests/unified_locking/.expect/mcs_block_spin_lock.txt	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -0,0 +1,3 @@
+Starting
+Done!
+Match!
Index: tests/unified_locking/.expect/mcs_spin.txt
===================================================================
--- tests/unified_locking/.expect/mcs_spin.txt	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
+++ tests/unified_locking/.expect/mcs_spin.txt	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -0,0 +1,3 @@
+Starting
+Done!
+Match!
Index: tests/unified_locking/.expect/pthread_locks.txt
===================================================================
--- tests/unified_locking/.expect/pthread_locks.txt	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
+++ tests/unified_locking/.expect/pthread_locks.txt	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -0,0 +1,6 @@
+Start Test 1: lock and condition variable single wait/notify
+Done Test 1
+Start Test 2: lock and condition variable 3 wait/notify all
+Done Test 2
+Start Test 3: lock and condition variable multiple acquire and wait/notify
+Done Test 3
Index: tests/unified_locking/.expect/simple_owner_lock.txt
===================================================================
--- tests/unified_locking/.expect/simple_owner_lock.txt	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
+++ tests/unified_locking/.expect/simple_owner_lock.txt	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -0,0 +1,3 @@
+Starting
+Done!
+Match!
Index: tests/unified_locking/.expect/spin_queue_lock.txt
===================================================================
--- tests/unified_locking/.expect/spin_queue_lock.txt	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
+++ tests/unified_locking/.expect/spin_queue_lock.txt	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -0,0 +1,3 @@
+Starting
+Done!
+Match!
Index: tests/unified_locking/block_spin_lock.cfa
===================================================================
--- tests/unified_locking/block_spin_lock.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
+++ tests/unified_locking/block_spin_lock.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -0,0 +1,8 @@
+#include <locks.hfa>
+
+#define LOCK block_spin_lock
+#include "mutex_test.hfa"
+
+int main() {
+    test();
+}
Index: tests/unified_locking/clh.cfa
===================================================================
--- tests/unified_locking/clh.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
+++ tests/unified_locking/clh.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -0,0 +1,8 @@
+#include <locks.hfa>
+
+#define LOCK clh_lock
+#include "mutex_test.hfa"
+
+int main() {
+    test();
+}
Index: tests/unified_locking/mcs_block_spin_lock.cfa
===================================================================
--- tests/unified_locking/mcs_block_spin_lock.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
+++ tests/unified_locking/mcs_block_spin_lock.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -0,0 +1,8 @@
+#include <locks.hfa>
+
+#define LOCK mcs_block_spin_lock
+#include "mutex_test.hfa"
+
+int main() {
+    test();
+}
Index: tests/unified_locking/mcs_spin.cfa
===================================================================
--- tests/unified_locking/mcs_spin.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
+++ tests/unified_locking/mcs_spin.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -0,0 +1,78 @@
+
+#include <fstream.hfa>
+#include <locks.hfa>
+#include <thread.hfa>
+
+const unsigned int num_times = 50;
+
+struct MutexObj {
+	mcs_spin_lock l;
+	thread$ * id;
+	uint32_t sum;
+	uint32_t cnt;
+};
+
+MutexObj mo;
+
+void trash() {
+	unsigned t[100];
+	for(i; 100) {
+		t[i] = 0xDEADBEEF;
+	}
+}
+
+uint32_t cs() {
+	thread$ * me = active_thread();
+	uint32_t value;
+    mcs_spin_node node;
+	lock(mo.l, node);
+	{
+		uint32_t tsum = mo.sum;
+		uint32_t cnt = mo.cnt;
+		mo.id = me;
+		yield(random(5));
+		value = ((uint32_t)random()) ^ ((uint32_t)me);
+		if(mo.id != me) sout | "Intruder!";
+		mo.cnt = cnt + 1;
+		mo.sum = tsum + value;
+	}
+	unlock(mo.l, node);
+	return value;
+}
+
+thread LockCheck {
+	uint32_t sum;
+};
+
+void main(LockCheck & this) {
+	this.sum = 0;
+	for(num_times) {
+		trash();
+		this.sum += cs();
+		trash();
+		yield(random(10));
+	}
+}
+
+void test() {
+	uint32_t sum = -32;
+	mo.sum = -32;
+	mo.cnt = 0;
+	processor p[2];
+	sout | "Starting";
+	{
+		LockCheck checkers[13];
+		for(i;13) {
+			sum += join(checkers[i]).sum;
+		}
+	}
+	sout | "Done!";
+	if(mo.cnt != (13 * num_times)) sout | "Invalid cs count!" | mo.cnt | "vs "| (13 * num_times) | "(13 *" | num_times | ')';
+	if(sum == mo.sum) sout | "Match!";
+	else sout | "No Match!" | sum | "vs" | mo.sum;
+}
+
+int main() {
+    test();
+	return 0;
+}
Index: tests/unified_locking/mutex_test.hfa
===================================================================
--- tests/unified_locking/mutex_test.hfa	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ tests/unified_locking/mutex_test.hfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -22,9 +22,10 @@
 }
 
-uint32_t cs() {
+uint32_t cs(uint32_t & entries) {
 	thread$ * me = active_thread();
 	uint32_t value;
 	lock(mo.l);
 	{
+		entries++;
 		uint32_t tsum = mo.sum;
 		uint32_t cnt = mo.cnt;
@@ -42,11 +43,13 @@
 thread LockCheck {
 	uint32_t sum;
+	uint32_t entries;
 };
 
 void main(LockCheck & this) {
 	this.sum = 0;
+	this.entries = 0;
 	for(num_times) {
 		trash();
-		this.sum += cs();
+		this.sum += cs( this.entries );
 		trash();
 		yield(random(10));
@@ -58,4 +61,5 @@
 	mo.sum = -32;
 	mo.cnt = 0;
+	uint32_t real_entries = 0;
 	processor p[2];
 	sout | "Starting";
@@ -63,9 +67,12 @@
 		LockCheck checkers[13];
 		for(i;13) {
-			sum += join(checkers[i]).sum;
+			LockCheck & curr = join(checkers[i]);
+			sum += curr.sum;
+			real_entries += curr.entries;
 		}
 	}
 	sout | "Done!";
-	if(mo.cnt != (13 * num_times)) sout | "Invalid cs count!" | mo.cnt | "vs "| (13 * num_times) | "(13 *" | num_times | ')';
+	if(real_entries != (13 * num_times)) sout | "Invalid real cs count!" | mo.cnt | "vs "| (13 * num_times) | "(13 *" | num_times | ')';
+	if(mo.cnt != (13 * num_times)) sout | "Invalid concurrent cs count!" | mo.cnt | "vs "| (13 * num_times) | "(13 *" | num_times | ')';
 	if(sum == mo.sum) sout | "Match!";
 	else sout | "No Match!" | sum | "vs" | mo.sum;
Index: tests/unified_locking/pthread_locks.cfa
===================================================================
--- tests/unified_locking/pthread_locks.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
+++ tests/unified_locking/pthread_locks.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -0,0 +1,80 @@
+#include <stdio.h>
+#include "locks.hfa"
+#include <stdlib.hfa>
+#include <thread.hfa>
+
+const unsigned int num_times = 50000;
+
+simple_owner_lock l;
+pthread_cond_var( simple_owner_lock ) c;
+
+volatile int counter = 0;
+
+thread Wait_Signal_1 {};
+
+void main( Wait_Signal_1 & this ) {
+	for (unsigned int i = 0; i < num_times; i++) {
+		lock(l);
+		if(empty(c) && i != num_times - 1) {
+			wait(c,l);
+		}else{
+			notify_one(c);
+		}
+		unlock(l);
+	}
+}
+
+thread Wait_3_Signal_3 {};
+
+void main( Wait_3_Signal_3 & this ) {
+	for (unsigned int i = 0; i < num_times; i++) {
+		lock(l);
+        counter++;
+		if(counter == 4 || i == num_times - 1) {
+            counter = 0;
+			notify_all(c);
+		}else{
+			wait(c,l);
+		}
+		unlock(l);
+	}
+}
+
+thread Rec_Lock_Wait_Signal_1 {};
+
+void main( Rec_Lock_Wait_Signal_1 & this ) {
+	for (unsigned int i = 0; i < num_times; i++) {
+		lock(l);
+		lock(l);
+		lock(l);
+		if(empty(c) && i != num_times - 1) {
+			wait(c,l);
+		}else{
+			notify_one(c);
+		}
+		unlock(l);
+		unlock(l);
+		unlock(l);
+	}
+}
+
+int main() {
+	processor p[3];
+	printf("Start Test 1: lock and condition variable single wait/notify\n");
+	{
+		Wait_Signal_1 t1[2];
+	}
+	printf("Done Test 1\n");
+
+	printf("Start Test 2: lock and condition variable 3 wait/notify all\n");
+	{
+		Wait_3_Signal_3 t1[4];
+	}
+	printf("Done Test 2\n");
+
+	printf("Start Test 3: lock and condition variable multiple acquire and wait/notify\n");
+	{
+		Rec_Lock_Wait_Signal_1 t1[2];
+	}
+	printf("Done Test 3\n");
+}
Index: tests/unified_locking/simple_owner_lock.cfa
===================================================================
--- tests/unified_locking/simple_owner_lock.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
+++ tests/unified_locking/simple_owner_lock.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -0,0 +1,8 @@
+#include <locks.hfa>
+
+#define LOCK simple_owner_lock
+#include "mutex_test.hfa"
+
+int main() {
+    test();
+}
Index: tests/unified_locking/spin_queue_lock.cfa
===================================================================
--- tests/unified_locking/spin_queue_lock.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
+++ tests/unified_locking/spin_queue_lock.cfa	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -0,0 +1,8 @@
+#include <locks.hfa>
+
+#define LOCK spin_queue_lock
+#include "mutex_test.hfa"
+
+int main() {
+    test();
+}
Index: tools/cfa.nanorc
===================================================================
--- tools/cfa.nanorc	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ tools/cfa.nanorc	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -10,10 +10,10 @@
 color green "\<(forall|trait|(o|d|f|t)type|mutex|_Bool|volatile|virtual)\>"
 color green "\<(float|double|bool|char|int|short|long|enum|void|auto)\>"
-color green "\<(static|const|extern|(un)?signed|inline)\>" "\<(sizeof)\>"
+color green "\<(static|const|extern|(un)?signed|inline|sizeof|vtable)\>"
 color green "\<((s?size)|one|zero|((u_?)?int(8|16|32|64|ptr)))_t\>"
 
 # Declarations
 color brightgreen "\<(struct|union|typedef|trait|coroutine|generator)\>"
-color brightgreen "\<(monitor|thread|with)\>"
+color brightgreen "\<(monitor|thread|with|exception)\>"
 
 # Control Flow Structures
Index: tools/jenkins/setup.sh.in
===================================================================
--- tools/jenkins/setup.sh.in	(revision 1df492a157311639c92151e6df741fe232bef7d6)
+++ tools/jenkins/setup.sh.in	(revision eb5962a78fb1dd3369887e6341b879ed4926e450)
@@ -29,5 +29,5 @@
 function getrunpath()
 {
-	local elfout=$(readelf -d $1 | grep "RUNPATH")
+	local elfout=$(readelf -d $1 | grep -E "RPATH|RUNPATH")
 	regex='\[/([[:alpha:][:digit:]@/_.-]+)\]'
 	if [[ $elfout =~ $regex ]]; then
@@ -43,14 +43,14 @@
 {
 	local deps=$(ldd $1)
+	retlcldeps=()
 	retsysdeps=()
-	retlcldeps=()
 	while IFS= read -r line; do
-		regex1='/([[:alpha:][:digit:]@/_.-]+)'
-		regex2='(libcfa[[:alpha:][:digit:].]+) => not found'
+		regex1='(libcfa[[:alpha:][:digit:].]+)'
+		regex2='/([[:alpha:][:digit:]@/_.-]+)'
 		regex3='linux-vdso.so.1|linux-gate.so.1'
 		if [[ $line =~ $regex1 ]]; then
+			retlcldeps+=(${BASH_REMATCH[1]})
+		elif [[ $line =~ $regex2 ]]; then
 			retsysdeps+=(${BASH_REMATCH[1]})
-		elif [[ $line =~ $regex2 ]]; then
-			retlcldeps+=(${BASH_REMATCH[1]})
 		elif [[ $line =~ $regex3 ]]; then
 			# echo "ignoring '$line}': intrinsic"
