Index: benchmark/io/http/protocol.cfa
===================================================================
--- benchmark/io/http/protocol.cfa	(revision 9c5aef93ec8be1d4946206206a356a8054f43fc1)
+++ benchmark/io/http/protocol.cfa	(revision bf8b77ebfa0b27a20558bbb27b2a3114c568a995)
@@ -173,21 +173,4 @@
 }
 
-static void zero_sqe(struct io_uring_sqe * sqe) {
-	sqe->flags = 0;
-	sqe->ioprio = 0;
-	sqe->fd = 0;
-	sqe->off = 0;
-	sqe->addr = 0;
-	sqe->len = 0;
-	sqe->fsync_flags = 0;
-	sqe->__pad2[0] = 0;
-	sqe->__pad2[1] = 0;
-	sqe->__pad2[2] = 0;
-	sqe->fd = 0;
-	sqe->off = 0;
-	sqe->addr = 0;
-	sqe->len = 0;
-}
-
 enum FSM_STATE {
 	Initial,
Index: doc/theses/mubeen_zulfiqar_MMath/benchmarks.tex
===================================================================
--- doc/theses/mubeen_zulfiqar_MMath/benchmarks.tex	(revision 9c5aef93ec8be1d4946206206a356a8054f43fc1)
+++ doc/theses/mubeen_zulfiqar_MMath/benchmarks.tex	(revision bf8b77ebfa0b27a20558bbb27b2a3114c568a995)
@@ -216,28 +216,2 @@
 \paragraph{Relevant Knobs}
 *** FIX ME: Insert Relevant Knobs
-
-
-
-\section{Existing Memory Allocators}
-With dynamic allocation being an important feature of C, there are many stand-alone memory allocators that have been designed for different purposes. For this thesis, we chose 7 of the most popular and widely used memory allocators.
-
-\paragraph{dlmalloc}
-dlmalloc (FIX ME: cite allocator) is a thread-safe allocator that is single threaded and single heap. dlmalloc maintains free-lists of different sizes to store freed dynamic memory. (FIX ME: cite wasik)
-
-\paragraph{hoard}
-Hoard (FIX ME: cite allocator) is a thread-safe allocator that is multi-threaded and using a heap layer framework. It has per-thread heaps that have thread-local free-lists, and a global shared heap. (FIX ME: cite wasik)
-
-\paragraph{jemalloc}
-jemalloc (FIX ME: cite allocator) is a thread-safe allocator that uses multiple arenas. Each thread is assigned an arena. Each arena has chunks that contain contagious memory regions of same size. An arena has multiple chunks that contain regions of multiple sizes.
-
-\paragraph{ptmalloc}
-ptmalloc (FIX ME: cite allocator) is a modification of dlmalloc. It is a thread-safe multi-threaded memory allocator that uses multiple heaps. ptmalloc heap has similar design to dlmalloc's heap.
-
-\paragraph{rpmalloc}
-rpmalloc (FIX ME: cite allocator) is a thread-safe allocator that is multi-threaded and uses per-thread heap. Each heap has multiple size-classes and each size-class contains memory regions of the relevant size.
-
-\paragraph{tbb malloc}
-tbb malloc (FIX ME: cite allocator) is a thread-safe allocator that is multi-threaded and uses private heap for each thread. Each private-heap has multiple bins of different sizes. Each bin contains free regions of the same size.
-
-\paragraph{tc malloc}
-tcmalloc (FIX ME: cite allocator) is a thread-safe allocator. It uses per-thread cache to store free objects that prevents contention on shared resources in multi-threaded application. A central free-list is used to refill per-thread cache when it gets empty.
Index: doc/theses/mubeen_zulfiqar_MMath/performance.tex
===================================================================
--- doc/theses/mubeen_zulfiqar_MMath/performance.tex	(revision 9c5aef93ec8be1d4946206206a356a8054f43fc1)
+++ doc/theses/mubeen_zulfiqar_MMath/performance.tex	(revision bf8b77ebfa0b27a20558bbb27b2a3114c568a995)
@@ -18,4 +18,42 @@
 \noindent
 ====================
+
+\section{Machine Specification}
+
+The performance experiments were run on three different multicore systems to determine if there is consistency across platforms:
+\begin{itemize}
+\item
+AMD EPYC 7662, 64-core socket $\times$ 2, 2.0 GHz
+\item
+Huawei ARM TaiShan 2280 V2 Kunpeng 920, 24-core socket $\times$ 4, 2.6 GHz
+\item
+Intel Xeon Gold 5220R, 48-core socket $\times$ 2, 2.2 GHz
+\end{itemize}
+
+
+\section{Existing Memory Allocators}
+With dynamic allocation being an important feature of C, there are many stand-alone memory allocators that have been designed for different purposes. For this thesis, we chose 7 of the most popular and widely used memory allocators.
+
+\paragraph{dlmalloc}
+dlmalloc (FIX ME: cite allocator) is a thread-safe allocator that is single-threaded and uses a single heap. dlmalloc maintains free-lists of different sizes to store freed dynamic memory. (FIX ME: cite wasik)
+
+\paragraph{hoard}
+Hoard (FIX ME: cite allocator) is a thread-safe allocator that is multi-threaded and uses a heap-layer framework. It has per-thread heaps that have thread-local free-lists, and a global shared heap. (FIX ME: cite wasik)
+
+\paragraph{jemalloc}
+jemalloc (FIX ME: cite allocator) is a thread-safe allocator that uses multiple arenas. Each thread is assigned an arena. Each arena has chunks that contain contiguous memory regions of the same size. An arena has multiple chunks that contain regions of multiple sizes.
+
+\paragraph{ptmalloc}
+ptmalloc (FIX ME: cite allocator) is a modification of dlmalloc. It is a thread-safe multi-threaded memory allocator that uses multiple heaps. A ptmalloc heap has a similar design to dlmalloc's heap.
+
+\paragraph{rpmalloc}
+rpmalloc (FIX ME: cite allocator) is a thread-safe allocator that is multi-threaded and uses a per-thread heap. Each heap has multiple size-classes and each size-class contains memory regions of the relevant size.
+
+\paragraph{tbb malloc}
+tbb malloc (FIX ME: cite allocator) is a thread-safe allocator that is multi-threaded and uses a private heap for each thread. Each private heap has multiple bins of different sizes. Each bin contains free regions of the same size.
+
+\paragraph{tc malloc}
+tcmalloc (FIX ME: cite allocator) is a thread-safe allocator. It uses a per-thread cache to store free objects, which prevents contention on shared resources in multi-threaded applications. A central free-list is used to refill a per-thread cache when it becomes empty.
+
 
 \section{Memory Allocators}
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision 9c5aef93ec8be1d4946206206a356a8054f43fc1)
+++ libcfa/src/concurrency/io.cfa	(revision bf8b77ebfa0b27a20558bbb27b2a3114c568a995)
@@ -287,5 +287,5 @@
 	//=============================================================================================
 	// submission
-	static inline void __submit( struct $io_context * ctx, __u32 idxs[], __u32 have, bool lazy) {
+	static inline void __submit_only( struct $io_context * ctx, __u32 idxs[], __u32 have) {
 		// We can proceed to the fast path
 		// Get the right objects
@@ -306,4 +306,10 @@
 		ctx->proc->io.pending = true;
 		ctx->proc->io.dirty   = true;
+	}
+
+	static inline void __submit( struct $io_context * ctx, __u32 idxs[], __u32 have, bool lazy) {
+		__sub_ring_t & sq = ctx->sq;
+		__submit_only(ctx, idxs, have);
+
 		if(sq.to_submit > 30) {
 			__tls_stats()->io.flush.full++;
@@ -402,8 +408,12 @@
 // I/O Arbiter
 //=============================================================================================
-	static inline void block(__outstanding_io_queue & queue, __outstanding_io & item) {
+	static inline bool enqueue(__outstanding_io_queue & queue, __outstanding_io & item) {
+		bool was_empty;
+
 		// Lock the list, it's not thread safe
 		lock( queue.lock __cfaabi_dbg_ctx2 );
 		{
+			was_empty = empty(queue.queue);
+
 			// Add our request to the list
 			add( queue.queue, item );
@@ -414,5 +424,5 @@
 		unlock( queue.lock );
 
-		wait( item.sem );
+		return was_empty;
 	}
 
@@ -432,5 +442,7 @@
 		pa.want = want;
 
-		block(this.pending, (__outstanding_io&)pa);
+		enqueue(this.pending, (__outstanding_io&)pa);
+
+		wait( pa.sem );
 
 		return pa.ctx;
@@ -485,5 +497,14 @@
 		ei.lazy = lazy;
 
-		block(ctx->ext_sq, (__outstanding_io&)ei);
+		bool we = enqueue(ctx->ext_sq, (__outstanding_io&)ei);
+
+		ctx->proc->io.pending = true;
+
+		if( we ) {
+			sigval_t value = { PREEMPT_IO };
+			pthread_sigqueue(ctx->proc->kernel_thread, SIGUSR1, value);
+		}
+
+		wait( ei.sem );
 
 		__cfadbg_print_safe(io, "Kernel I/O : %u submitted from arbiter\n", have);
@@ -501,5 +522,5 @@
 					__external_io & ei = (__external_io&)drop( ctx.ext_sq.queue );
 
-					__submit(&ctx, ei.idxs, ei.have, ei.lazy);
+					__submit_only(&ctx, ei.idxs, ei.have);
 
 					post( ei.sem );
Index: libcfa/src/concurrency/io/setup.cfa
===================================================================
--- libcfa/src/concurrency/io/setup.cfa	(revision 9c5aef93ec8be1d4946206206a356a8054f43fc1)
+++ libcfa/src/concurrency/io/setup.cfa	(revision bf8b77ebfa0b27a20558bbb27b2a3114c568a995)
@@ -56,4 +56,5 @@
 
 	#include "bitmanip.hfa"
+	#include "fstream.hfa"
 	#include "kernel_private.hfa"
 	#include "thread.hfa"
@@ -258,4 +259,13 @@
 		struct __sub_ring_t & sq = this.sq;
 		struct __cmp_ring_t & cq = this.cq;
+		{
+			__u32 fhead = sq.free_ring.head;
+			__u32 ftail = sq.free_ring.tail;
+
+			__u32 total = *sq.num;
+			__u32 avail = ftail - fhead;
+
+			if(avail != total) abort | "Processor (" | (void*)this.proc | ") tearing down ring with" | (total - avail) | "entries allocated but not submitted, out of" | total;
+		}
 
 		// unmap the submit queue entries
Index: libcfa/src/concurrency/iofwd.hfa
===================================================================
--- libcfa/src/concurrency/iofwd.hfa	(revision 9c5aef93ec8be1d4946206206a356a8054f43fc1)
+++ libcfa/src/concurrency/iofwd.hfa	(revision bf8b77ebfa0b27a20558bbb27b2a3114c568a995)
@@ -19,4 +19,5 @@
 extern "C" {
 	#include <asm/types.h>
+	#include <sys/stat.h> // needed for mode_t
 	#if CFA_HAVE_LINUX_IO_URING_H
 		#include <linux/io_uring.h>
@@ -133,2 +134,21 @@
 // Check if a function is blocks a only the user thread
 bool has_user_level_blocking( fptr_t func );
+
+#if CFA_HAVE_LINUX_IO_URING_H
+	// Zero the fields of a submission queue entry before it is filled in.
+	// Does not touch opcode or user_data; callers assign those afterwards.
+	static inline void zero_sqe(struct io_uring_sqe * sqe) {
+		sqe->flags = 0;
+		sqe->ioprio = 0;
+		sqe->fd = 0;
+		sqe->off = 0;
+		sqe->addr = 0;
+		sqe->len = 0;
+		// NOTE(review): fsync_flags presumably aliases the per-opcode flags union — confirm
+		sqe->fsync_flags = 0;
+		// __pad2: padding words (going by the name), zeroed as well
+		sqe->__pad2[0] = 0;
+		sqe->__pad2[1] = 0;
+		sqe->__pad2[2] = 0;
+	}
+#endif
Index: libcfa/src/concurrency/kernel/fwd.hfa
===================================================================
--- libcfa/src/concurrency/kernel/fwd.hfa	(revision 9c5aef93ec8be1d4946206206a356a8054f43fc1)
+++ libcfa/src/concurrency/kernel/fwd.hfa	(revision bf8b77ebfa0b27a20558bbb27b2a3114c568a995)
@@ -347,5 +347,5 @@
 					struct oneshot * want = expected == 0p ? 1p : 2p;
 					if(__atomic_compare_exchange_n(&this.ptr, &expected, want, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-						if( expected == 0p ) { /* paranoid */ verify( this.ptr == 1p); return 0p; }
+						if( expected == 0p ) { return 0p; }
 						thread$ * ret = post( *expected, do_unpark );
 						__atomic_store_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 9c5aef93ec8be1d4946206206a356a8054f43fc1)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision bf8b77ebfa0b27a20558bbb27b2a3114c568a995)
@@ -60,4 +60,10 @@
 extern bool __preemption_enabled();
 
+enum { // reason codes carried in the signal's si_value; the preemption signal handler switches on them
+	PREEMPT_NORMAL    = 0, // ordinary preemption: the handler has nothing extra to do
+	PREEMPT_TERMINATE = 1, // the handler verifies that the processor's do_terminate flag is set
+	PREEMPT_IO = 2, // queued with SIGUSR1 by the I/O code after an external submission; nothing extra to do in the handler
+};
+
 static inline void __disable_interrupts_checked() {
 	/* paranoid */ verify( __preemption_enabled() );
Index: libcfa/src/concurrency/preemption.cfa
===================================================================
--- libcfa/src/concurrency/preemption.cfa	(revision 9c5aef93ec8be1d4946206206a356a8054f43fc1)
+++ libcfa/src/concurrency/preemption.cfa	(revision bf8b77ebfa0b27a20558bbb27b2a3114c568a995)
@@ -96,9 +96,4 @@
 	lock{};
 }
-
-enum {
-	PREEMPT_NORMAL    = 0,
-	PREEMPT_TERMINATE = 1,
-};
 
 //=============================================================================================
@@ -664,4 +659,5 @@
 	choose(sfp->si_value.sival_int) {
 		case PREEMPT_NORMAL   : ;// Normal case, nothing to do here
+		case PREEMPT_IO       : ;// I/O asked to stop spinning, nothing to do here
 		case PREEMPT_TERMINATE: verify( __atomic_load_n( &__cfaabi_tls.this_processor->do_terminate, __ATOMIC_SEQ_CST ) );
 		default:
Index: src/Concurrency/Keywords.cc
===================================================================
--- src/Concurrency/Keywords.cc	(revision 9c5aef93ec8be1d4946206206a356a8054f43fc1)
+++ src/Concurrency/Keywords.cc	(revision bf8b77ebfa0b27a20558bbb27b2a3114c568a995)
@@ -422,6 +422,7 @@
 			;
 		else if ( auto param = isMainFor( decl, cast_target ) ) {
-			// This should never trigger.
-			assert( vtable_decl );
+			if ( !vtable_decl ) {
+				SemanticError( decl, context_error );
+			}
 			// Should be safe because of isMainFor.
 			StructInstType * struct_type = static_cast<StructInstType *>(
Index: tests/concurrent/.expect/mainError.txt
===================================================================
--- tests/concurrent/.expect/mainError.txt	(revision bf8b77ebfa0b27a20558bbb27b2a3114c568a995)
+++ tests/concurrent/.expect/mainError.txt	(revision bf8b77ebfa0b27a20558bbb27b2a3114c568a995)
@@ -0,0 +1,11 @@
+concurrent/mainError.cfa:1:1 error: thread keyword requires threads to be in scope, add #include <thread.hfa>
+thread Test: with body 1
+
+concurrent/mainError.cfa:2:1 error: thread keyword requires threads to be in scope, add #include <thread.hfa>
+main: function
+... with parameters
+  reference to instance of struct Test with body 1
+... returning nothing
+... with body
+  CompoundStmt
+
Index: tests/concurrent/mainError.cfa
===================================================================
--- tests/concurrent/mainError.cfa	(revision bf8b77ebfa0b27a20558bbb27b2a3114c568a995)
+++ tests/concurrent/mainError.cfa	(revision bf8b77ebfa0b27a20558bbb27b2a3114c568a995)
@@ -0,0 +1,2 @@
+thread Test {}; // deliberately no #include <thread.hfa>: the expected output is the "thread keyword requires threads to be in scope" error
+void main(Test&) {} // thread main for Test; the expected output reports the same error for this declaration
Index: tests/io/.expect/away_fair.txt
===================================================================
--- tests/io/.expect/away_fair.txt	(revision bf8b77ebfa0b27a20558bbb27b2a3114c568a995)
+++ tests/io/.expect/away_fair.txt	(revision bf8b77ebfa0b27a20558bbb27b2a3114c568a995)
@@ -0,0 +1,12 @@
+starting
+100
+200
+300
+400
+500
+600
+700
+800
+900
+1000
+done
Index: tests/io/away_fair.cfa
===================================================================
--- tests/io/away_fair.cfa	(revision bf8b77ebfa0b27a20558bbb27b2a3114c568a995)
+++ tests/io/away_fair.cfa	(revision bf8b77ebfa0b27a20558bbb27b2a3114c568a995)
@@ -0,0 +1,107 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2022 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// away_fair.cfa -- Test that spinning doesn't cause submissions to get stuck.
+//                  This test should work without io_uring but isn't very useful without it.
+//
+// Author           : Thierry Delisle
+// Created On       : Wed Mar 2 12:56:51 2022
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#include <bits/defs.hfa>
+#include <fstream.hfa>
+#include <kernel.hfa>
+#include <thread.hfa>
+#include <iofwd.hfa>
+#include <io/types.hfa>
+
+Duration default_preemption() { // NOTE(review): presumably overrides the runtime's default preemption interval — confirm
+	return 0; // NOTE(review): 0 presumably disables time-slice preemption, so threads only switch when they yield or block — confirm
+}
+
+enum { TIMES = 1000 };
+
+volatile unsigned counter = 0;
+
+// ----- Spinner -----
+// spins trying to prevent other threads from getting to this processor
+thread Spinner {};
+void ^?{}(Spinner &mutex ) {}
+void main(Spinner &) { // spins on the shared counter, yielding only once per observed change, until it reaches TIMES
+	unsigned last = 0; // counter value seen the last time this thread yielded
+	for() {
+		unsigned curr = __atomic_load_n(&counter, __ATOMIC_SEQ_CST);
+
+		if(curr >= TIMES) return; // every iteration has been counted: stop spinning
+
+		if(last == curr) { // counter unchanged since the last yield: keep the processor and poll again
+			Pause();
+			continue;
+		}
+
+		last = curr; // counter advanced: remember the new value ...
+		yield(); // ... and give up the processor once
+	}
+}
+
+// ----- Submitter -----
+// try to submit io but yield so that it's likely we are moved to the slow path
+thread Submitter {};
+void ^?{}(Submitter &mutex ) {}
+void main(Submitter & this) { // TIMES iterations of: allocate one SQE, yield, submit it, bump counter, wait for completion
+	for(TIMES) {
+		#if CFA_HAVE_LINUX_IO_URING_H
+			io_future_t f; // future for this request; its address is stored in the SQE's user_data below
+			struct io_uring_sqe * sqe;
+			__u32 idx;
+			struct $io_context * ctx = cfa_io_allocate(&sqe, &idx, 1); // ask for a single submission entry; ctx is passed back to cfa_io_submit
+
+			zero_sqe(sqe);
+			sqe->opcode = IORING_OP_NOP; // no-op request
+			sqe->user_data = (uintptr_t)&f;
+		#endif
+
+		yield( prng( this, 15 ) ); // NOTE(review): presumably yields a pseudo-random number of times (< 15) between allocation and submission — confirm
+
+		#if CFA_HAVE_LINUX_IO_URING_H
+			// Submit everything
+			asm volatile("": : :"memory"); // compiler barrier placed between the SQE stores above and the submit call
+			cfa_io_submit( ctx, &idx, 1, false ); // NOTE(review): the final 'false' is presumably the 'lazy' flag — confirm
+		#endif
+
+		unsigned i = __atomic_add_fetch( &counter, 1, __ATOMIC_SEQ_CST ); // publish progress to Spinner and Yielder
+		if(0 == (i % 100)) sout | i; // print every 100th count: 100, 200, ..., 1000 as in the .expect file
+
+		#if CFA_HAVE_LINUX_IO_URING_H
+			wait( f ); // wait on the request's future before starting the next iteration
+		#endif
+	}
+}
+
+// ----- Yielder -----
+// Add some chaos into the mix
+thread Yielder {};
+void ^?{}(Yielder &mutex ) {}
+void main(Yielder&) { // yields in a loop for as long as the shared counter is below TIMES
+	while(TIMES > __atomic_load_n(&counter, __ATOMIC_SEQ_CST)) { // counter is incremented by the Submitter
+		yield(); // give up the processor on every pass
+	}
+}
+
+
+int main() { // prints "starting", runs one Yielder, one Spinner and one Submitter, then prints "done"
+	processor p; // NOTE(review): presumably an additional processor next to the one running main — confirm
+	sout | "starting";
+	{
+		Yielder y;
+		Spinner s;
+		Submitter io;
+	} // NOTE(review): presumably the three threads are joined when this scope ends (CFA thread semantics) — confirm
+	sout | "done";
+}
Index: tests/io/many_read.cfa
===================================================================
--- tests/io/many_read.cfa	(revision 9c5aef93ec8be1d4946206206a356a8054f43fc1)
+++ tests/io/many_read.cfa	(revision bf8b77ebfa0b27a20558bbb27b2a3114c568a995)
@@ -5,5 +5,5 @@
 // file "LICENCE" distributed with Cforall.
 //
-// many_read.cfa -- Make sure that multiple concurrent reads to mess up.
+// many_read.cfa -- Make sure that multiple concurrent reads don't mess up.
 //
 // Author           : Thierry Delisle