Index: Jenkins/FullBuild
===================================================================
--- Jenkins/FullBuild	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ Jenkins/FullBuild	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -17,6 +17,6 @@
 
 				parallel (
-					clang_x86: { trigger_build( 'gcc-8',   'x86' ) },
-					gcc_5_x86: { trigger_build( 'gcc-7',   'x86' ) },
+					gcc_8_x86: { trigger_build( 'gcc-8',   'x86' ) },
+					gcc_7_x86: { trigger_build( 'gcc-7',   'x86' ) },
 					gcc_6_x86: { trigger_build( 'gcc-6',   'x86' ) },
 					gcc_9_x64: { trigger_build( 'gcc-9',   'x64' ) },
Index: Jenkinsfile
===================================================================
--- Jenkinsfile	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ Jenkinsfile	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -127,5 +127,7 @@
 			}
 
-			sh "${SrcDir}/configure CXX=${Settings.Compiler.CXX} CC=${Settings.Compiler.CC} ${Settings.Architecture.flags} AR=gcc-ar RANLIB=gcc-ranlib ${targets} --quiet --prefix=${BuildDir}"
+			ast = Settings.NewAST ? "--enable-new-ast" : "--disable-new-ast"
+
+			sh "${SrcDir}/configure CXX=${Settings.Compiler.CXX} CC=${Settings.Compiler.CC} ${Settings.Architecture.flags} AR=gcc-ar RANLIB=gcc-ranlib ${targets} ${ast} --quiet --prefix=${BuildDir}"
 
 			// Configure libcfa
@@ -359,4 +361,5 @@
 	public final CC_Desc Compiler
 	public final Arch_Desc Architecture
+	public final Boolean NewAST
 	public final Boolean RunAllTests
 	public final Boolean RunBenchmark
@@ -410,4 +413,5 @@
 
 		this.IsSandbox          = (branch == "jenkins-sandbox")
+		this.NewAST             = param.NewAST
 		this.RunAllTests        = param.RunAllTests
 		this.RunBenchmark       = param.RunBenchmark
@@ -470,8 +474,13 @@
 				],												\
 				[$class: 'BooleanParameterDefinition',  						\
+					description: 'If true, build compiler using new AST', 		\
+					name: 'NewAST', 									\
+					defaultValue: false,  								\
+				], 												\
+				[$class: 'BooleanParameterDefinition',  						\
 					description: 'If false, only the quick test suite is ran', 		\
 					name: 'RunAllTests', 								\
 					defaultValue: false,  								\
-				], 												\
+				],
 				[$class: 'BooleanParameterDefinition',  						\
 					description: 'If true, jenkins also runs benchmarks', 		\
Index: benchmark/readyQ/bench.go
===================================================================
--- benchmark/readyQ/bench.go	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
+++ benchmark/readyQ/bench.go	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -0,0 +1,74 @@
+package main
+
+import (
+	"bufio"
+	"flag"
+	"fmt"
+	"os"
+	"runtime"
+	"sync/atomic"
+	"time"
+)
+
+var clock_mode bool // true: stop after a fixed duration; false: stop after a fixed iteration count (chosen in bench_init)
+var threads_left int64 // read with sync/atomic; in iteration mode wait() returns once this is 0
+var stop int32 // stop flag for the benchmark goroutines; not touched by the helpers in this file
+var duration float64 // run length in seconds, from -d (bench_init defaults it to 5)
+var stop_count uint64 // iteration limit, from -i; bench_init sets it to the uint64 maximum in clock mode
+var nprocs int // from -p; passed to runtime.GOMAXPROCS
+var nthreads int // from -t
+
+func fflush(f *bufio.Writer) { // fflush queues a carriage return on f and then flushes f.
+	defer f.Flush() // deferred, so it runs after the Write below
+	f.Write([]byte("\r"))
+}
+
+func wait(start time.Time, is_tty bool) { // wait polls every 100ms until the duration elapses (clock mode) or threads_left reaches 0.
+	f := bufio.NewWriter(os.Stdout)
+	tdur := time.Duration(duration * float64(time.Second)) // scale before converting so fractional seconds are not truncated away
+	for {
+		time.Sleep(100 * time.Millisecond)
+		end := time.Now()
+		delta := end.Sub(start)
+		if is_tty {
+			fmt.Printf(" %.1f", delta.Seconds()) // progress display; fflush adds the "\r" that rewinds the line
+			fflush(f)
+		}
+		if clock_mode && delta >= tdur {
+			break
+		} else if !clock_mode && atomic.LoadInt64(&threads_left) == 0 {
+			break
+		}
+	}
+}
+
+func bench_init() { // bench_init parses the shared flags, picks the stop mode and applies GOMAXPROCS.
+	nprocsOpt := flag.Int("p", 1, "The number of processors")
+	nthreadsOpt := flag.Int("t", 1, "The number of threads")
+	durationOpt := flag.Float64("d", 0, "Duration of the experiment in seconds")
+	stopOpt := flag.Uint64("i", 0, "Duration of the experiment in iterations")
+
+	flag.Parse() // also parses flags the caller registered before calling bench_init
+
+	nprocs = *nprocsOpt
+	nthreads = *nthreadsOpt
+	duration = *durationOpt
+	stop_count = *stopOpt
+
+	if duration > 0 && stop_count > 0 {
+		panic("-d and -i cannot be used together") // name the flags this program actually registers
+	} else if duration > 0 {
+		clock_mode = true
+		stop_count = 0xFFFFFFFFFFFFFFFF // uint64 maximum: the iteration limit never triggers in clock mode
+		fmt.Printf("Running for %f seconds\n", duration)
+	} else if stop_count > 0 {
+		clock_mode = false
+		fmt.Printf("Running for %d iterations\n", stop_count)
+	} else {
+		duration = 5 // neither -d nor -i given: default to a 5 second timed run
+		clock_mode = true
+		fmt.Printf("Running for %f seconds\n", duration)
+	}
+
+	runtime.GOMAXPROCS(nprocs)
+}
Index: benchmark/readyQ/cycle.cfa
===================================================================
--- benchmark/readyQ/cycle.cfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
+++ benchmark/readyQ/cycle.cfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -0,0 +1,102 @@
+#include "rq_bench.hfa"
+
+struct Partner { // per-thread record for one member of a ring
+	unsigned long long count; // loop iterations completed by the owning thread
+	unsigned long long blocks; // running sum of the values returned by wait( self )
+	bench_sem self; // semaphore the owning thread waits on
+	bench_sem * next; // semaphore the owning thread posts after each wake-up
+};
+
+void ?{}( Partner & this ) { // constructor: zero both counters (next is assigned later by main)
+	this.count = this.blocks = 0;
+}
+
+thread BThrd { // benchmark thread; works on exactly one Partner record
+	Partner & partner;
+};
+
+void ?{}( BThrd & this, Partner * partner ) { // constructor
+	((thread&)this){ bench_cluster }; // construct the thread part with bench_cluster as argument
+	&this.partner = partner; // bind the reference member to the caller's Partner
+}
+
+void ^?{}( BThrd & mutex this ) {} // destructor with an empty body; takes the thread as a mutex parameter
+
+void main( BThrd & thrd ) with(thrd.partner) { // thread body: wait on own semaphore, post the next one, repeat
+	count = 0;
+	for() {
+		blocks += wait( self );
+		post( *next );
+		count ++;
+		if( clock_mode && stop) break; // timed run: leave once stop has been set
+		if(!clock_mode && count >= stop_count) break; // iteration run: leave at the limit
+	}
+
+	__atomic_fetch_add(&threads_left, -1, __ATOMIC_SEQ_CST); // one fewer running thread
+}
+
+int main(int argc, char * argv[]) { // driver: parse options, build the rings, run them, print statistics
+	unsigned ring_size = 2; // default ring length; overridable with -r / --ringsize
+	cfa_option opt[] = {
+		BENCH_OPT,
+		{ 'r', "ringsize", "Number of threads in a cycle", ring_size }
+	};
+	BENCH_OPT_PARSE("cforall cycle benchmark");
+
+	{
+		unsigned long long global_counter = 0;
+		unsigned long long global_blocks  = 0;
+		unsigned tthreads = nthreads * ring_size; // total number of benchmark threads
+		Time start, end;
+		BenchCluster bc = { nprocs };
+		{
+			threads_left = tthreads;
+			BThrd * threads[tthreads];
+			Partner thddata[tthreads];
+			for(i; tthreads) {
+				unsigned pi = (i + nthreads) % tthreads; // successor is nthreads slots ahead, so each ring has ring_size members
+				thddata[i].next = &thddata[pi].self;
+			}
+			for(int i = 0; i < tthreads; i++) {
+				threads[i] = malloc();
+				(*threads[i]){ &thddata[i] };
+			}
+			printf("Starting\n");
+
+			bool is_tty = isatty(STDOUT_FILENO);
+			start = getTimeNsec();
+
+			for(i; nthreads) { // the first nthreads slots each belong to a different ring: one initial post per ring
+				post( thddata[i].self );
+			}
+			wait(start, is_tty);
+
+			stop = true; // ends the thread loops in clock mode
+			end = getTimeNsec();
+			printf("\nDone\n");
+
+			for(i; tthreads) {
+				Partner & partner = join( *threads[i] ).partner;
+				global_counter += partner.count;
+				global_blocks  += partner.blocks;
+				delete(threads[i]);
+			}
+		}
+
+		printf("Duration (ms)        : %'ld\n", (end - start)`ms);
+		printf("Number of processors : %'d\n", nprocs);
+		printf("Number of threads    : %'d\n", tthreads);
+		printf("Cycle size (# thrds) : %'d\n", ring_size);
+		printf("Total Operations(ops): %'15llu\n", global_counter);
+		printf("Total blocks         : %'15llu\n", global_blocks);
+		printf("Ops per second       : %'18.2lf\n", ((double)global_counter) / (end - start)`s);
+		printf("ns per ops           : %'18.2lf\n", ((double)(end - start)`ns) / global_counter);
+		printf("Ops per threads      : %'15llu\n", global_counter / tthreads);
+		printf("Ops per procs        : %'15llu\n", global_counter / nprocs);
+		printf("Ops/sec/procs        : %'18.2lf\n", (((double)global_counter) / nprocs) / (end - start)`s);
+		printf("ns per ops/procs     : %'18.2lf\n", ((double)(end - start)`ns) / (global_counter / nprocs)); // inner division is integer division
+		fflush(stdout);
+	}
+
+	return 0;
+}
Index: benchmark/readyQ/cycle.cpp
===================================================================
--- benchmark/readyQ/cycle.cpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
+++ benchmark/readyQ/cycle.cpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -0,0 +1,86 @@
+
+#include "rq_bench.hpp"
+
+struct Partner { // per-fibre record for one member of a ring
+	unsigned long long count  = 0; // loop iterations completed by partner_main
+	unsigned long long blocks = 0; // running sum of the values returned by self.wait()
+	bench_sem self; // semaphore this fibre waits on
+	bench_sem * next; // semaphore this fibre posts after each wake-up
+};
+
+void partner_main( Partner * self ) { // fibre body: wait on own semaphore, post the next one, repeat
+	self->count = 0;
+	for(;;) {
+		self->blocks += self->self.wait();
+		self->next->post();
+		self->count ++;
+		if( clock_mode && stop) break; // timed run: leave once stop has been set
+		if(!clock_mode && self->count >= stop_count) break; // iteration run: leave at the limit
+	}
+
+	__atomic_fetch_add(&threads_left, -1, __ATOMIC_SEQ_CST); // one fewer running fibre
+}
+
+int main(int argc, char * argv[]) { // driver: parse options, build the rings, run them, print statistics
+	unsigned ring_size = 2; // default ring length; overridable with -r / --ringsize
+	option_t opt[] = {
+		BENCH_OPT,
+		{ 'r', "ringsize", "Number of threads in a cycle", ring_size }
+	};
+	BENCH_OPT_PARSE("cforall cycle benchmark");
+
+	{
+		unsigned long long global_counter = 0;
+		unsigned long long global_blocks  = 0;
+		unsigned tthreads = nthreads * ring_size; // total number of benchmark fibres
+		uint64_t start, end;
+		FibreInit(1, nprocs);
+		{
+			threads_left = tthreads;
+			Fibre * threads[tthreads];
+			Partner thddata[tthreads];
+			for(int i = 0; i < tthreads; i++) {
+				unsigned pi = (i + nthreads) % tthreads; // successor is nthreads slots ahead, so each ring has ring_size members
+				thddata[i].next = &thddata[pi].self;
+			}
+			for(int i = 0; i < tthreads; i++) {
+				threads[i] = new Fibre( reinterpret_cast<void (*)(void *)>(partner_main), &thddata[i] ); // NOTE(review): no delete is visible in this function - confirm who releases the Fibre
+			}
+			printf("Starting\n");
+
+			bool is_tty = isatty(STDOUT_FILENO);
+			start = getTimeNsec();
+
+			for(int i = 0; i < nthreads; i++) { // the first nthreads slots each belong to a different ring: one initial post per ring
+				thddata[i].self.post();
+			}
+			wait(start, is_tty);
+
+			stop = true; // ends the fibre loops in clock mode
+			end = getTimeNsec();
+			printf("\nDone\n");
+
+			for(int i = 0; i < tthreads; i++) {
+				fibre_join( threads[i], nullptr );
+				global_counter += thddata[i].count;
+				global_blocks  += thddata[i].blocks;
+			}
+		}
+
+		printf("Duration (ms)        : %'ld\n", to_miliseconds(end - start));
+		printf("Number of processors : %'d\n", nprocs);
+		printf("Number of threads    : %'d\n", tthreads);
+		printf("Cycle size (# thrds) : %'d\n", ring_size);
+		printf("Total Operations(ops): %'15llu\n", global_counter);
+		printf("Total blocks         : %'15llu\n", global_blocks);
+		printf("Ops per second       : %'18.2lf\n", ((double)global_counter) / to_fseconds(end - start));
+		printf("ns per ops           : %'18.2lf\n", ((double)(end - start)) / global_counter);
+		printf("Ops per threads      : %'15llu\n", global_counter / tthreads);
+		printf("Ops per procs        : %'15llu\n", global_counter / nprocs);
+		printf("Ops/sec/procs        : %'18.2lf\n", (((double)global_counter) / nprocs) / to_fseconds(end - start));
+		printf("ns per ops/procs     : %'18.2lf\n", ((double)(end - start)) / (global_counter / nprocs)); // inner division is integer division
+		fflush(stdout);
+	}
+
+	return 0;
+}
Index: benchmark/readyQ/cycle.go
===================================================================
--- benchmark/readyQ/cycle.go	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
+++ benchmark/readyQ/cycle.go	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -0,0 +1,81 @@
+package main
+
+import (
+	"flag"
+	"fmt"
+	"sync/atomic"
+	"time"
+	"golang.org/x/text/language"
+	"golang.org/x/text/message"
+)
+
+func partner(result chan uint64, mine chan int, next chan int) { // partner is one ring member: receive on mine, send on next, count the rounds.
+	count := uint64(0)
+	for true {
+		<- mine
+		next <- 0
+		count += 1
+		if  clock_mode && atomic.LoadInt32(&stop) == 1 { break } // timed run: leave once stop is 1
+		if !clock_mode && count >= stop_count { break } // iteration run: leave at the limit
+	}
+
+	atomic.AddInt64(&threads_left, -1);
+	result <- count // hand the final count to whoever receives on result
+}
+
+func main() { // main builds nthreads rings of ring_size goroutines, runs them and prints the statistics.
+	var ring_size int
+
+	ring_sizeOpt := flag.Int("r", 2, "The number of threads per cycles") // registered before bench_init, which calls flag.Parse
+
+	bench_init()
+
+	ring_size = *ring_sizeOpt
+
+	tthreads := nthreads * ring_size
+	threads_left = int64(tthreads)
+
+	result := make(chan uint64)
+	channels := make([]chan int, tthreads)
+	for i := range channels {
+		channels[i] = make(chan int, 1) // capacity 1: the last send of a ring completes even after its receiver has exited
+	}
+
+	for i := 0; i < tthreads; i++ {
+		pi := (i + nthreads) % tthreads // successor is nthreads slots ahead, so each ring has ring_size members
+		go partner(result, channels[i], channels[pi])
+	}
+	fmt.Printf("Starting\n");
+
+	atomic.StoreInt32(&stop, 0)
+	start := time.Now()
+	for i := 0; i < nthreads; i++ { // the first nthreads slots each belong to a different ring: one initial token per ring
+		channels[i] <- 0
+	}
+	wait(start, true);
+
+	atomic.StoreInt32(&stop, 1)
+	end := time.Now()
+	delta := end.Sub(start)
+
+	fmt.Printf("\nDone\n")
+
+	global_counter := uint64(0)
+	for i := 0; i < tthreads; i++ {
+		global_counter += <- result
+	}
+
+	p := message.NewPrinter(language.English)
+	p.Printf("Duration (ms)        : %d\n", int64(delta / time.Millisecond)); // report milliseconds, as the label says
+	p.Printf("Number of processors : %d\n", nprocs);
+	p.Printf("Number of threads    : %d\n", tthreads);
+	p.Printf("Cycle size (# thrds) : %d\n", ring_size);
+	p.Printf("Total Operations(ops): %15d\n", global_counter)
+	p.Printf("Ops per second       : %18.2f\n", float64(global_counter) / delta.Seconds()) // same label as the Cforall and C++ versions
+	p.Printf("ns per ops           : %18.2f\n", float64(delta.Nanoseconds()) / float64(global_counter))
+	p.Printf("Ops per threads      : %15d\n", global_counter / uint64(tthreads))
+	p.Printf("Ops per procs        : %15d\n", global_counter / uint64(nprocs))
+	p.Printf("Ops/sec/procs        : %18.2f\n", (float64(global_counter) / float64(nprocs)) / delta.Seconds())
+	p.Printf("ns per ops/procs     : %18.2f\n", float64(delta.Nanoseconds()) / (float64(global_counter) / float64(nprocs)))
+
+}
Index: benchmark/readyQ/rq_bench.hfa
===================================================================
--- benchmark/readyQ/rq_bench.hfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
+++ benchmark/readyQ/rq_bench.hfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -0,0 +1,137 @@
+#include <clock.hfa>
+#include <kernel.hfa>
+#include <parseargs.hfa>
+#include <stdio.h>
+#include <stdlib.hfa>
+#include <thread.hfa>
+#include <time.hfa>
+#include <unistd.h>
+
+volatile bool stop = false; // stop flag polled by benchmark threads in clock mode
+bool clock_mode; // true: timed run; false: iteration-count run (set by BENCH_OPT_PARSE)
+double duration = -1; // run length in seconds (-d); BENCH_OPT_PARSE defaults it to 5 when neither -d nor -i is positive
+unsigned long long stop_count = 0; // iteration limit (-i)
+unsigned nprocs = 1; // number of processors (-p)
+unsigned nthreads = 1; // number of threads (-t)
+
+volatile unsigned long long threads_left; // wait() returns in iteration mode once this is 0
+
+#define thread_loop for(this.count = 0;; this.count++) // endless loop header that counts iterations in this.count
+
+#define BENCH_OPT /* option-table entries shared by the benchmarks: -d, -i, -t, -p */ \
+	{'d', "duration",  "Duration of the experiments in seconds", duration }, \
+	{'i', "iterations",  "Number of iterations of the experiments", stop_count }, \
+	{'t', "nthreads",  "Number of threads to use", nthreads }, \
+	{'p', "nprocs",    "Number of processors to use", nprocs }
+
+#define BENCH_OPT_PARSE(name) /* parses argv into opt[] and picks the stop mode; needs opt, argc and argv in scope */ \
+	{ \
+		int opt_cnt = sizeof(opt) / sizeof(cfa_option); \
+		char **left; \
+		parse_args( argc, argv, opt, opt_cnt, "[OPTIONS]...\n" name, left ); \
+		if(duration > 0 && stop_count > 0) { \
+			fprintf(stderr, "--duration and --iterations cannot be used together\n"); \
+			print_args_usage(argc, argv, opt, opt_cnt, "[OPTIONS]...\n" name, true); \
+		} else if(duration > 0) { \
+			clock_mode = true; \
+			stop_count = 0xFFFFFFFFFFFFFFFF; /* iteration limit never triggers in clock mode */ \
+			printf("Running for %lf seconds\n", duration); \
+		} else if(stop_count > 0) { \
+			clock_mode = false; \
+			printf("Running for %llu iterations\n", stop_count); /* stop_count is unsigned long long */ \
+		} else { \
+			duration = 5; clock_mode = true;\
+			printf("Running for %lf seconds\n", duration); \
+		} \
+	}
+
+struct cluster & bench_cluster; // global reference to the benchmark cluster; bound by the BenchCluster constructor
+
+struct BenchCluster { // a cluster together with the processors created for it
+	cluster cl;
+	processor * procs; // array of nprocs processors, allocated by the constructor
+	unsigned nprocs;
+};
+
+void ?{}( BenchCluster & this, unsigned nprocs ) { // constructor: create the cluster and nprocs processors on it
+	(this.cl){ "Benchmark Cluster" };
+	&bench_cluster = &this.cl; // publish this cluster through the global reference
+	this.nprocs = nprocs;
+	this.procs  = alloc( this.nprocs );
+	for(i; this.nprocs){
+		processor * p = &this.procs[i];
+		(*p){ "Benchmark Processor", this.cl }; // construct each processor in the allocated storage
+	}
+}
+
+void ^?{}( BenchCluster & this ) { // destructor: release the processor array first, then the cluster
+	adelete( this.procs );
+	^(this.cl){};
+}
+
+void wait(const Time & start, bool is_tty) { // polls every 100ms until the duration elapses (clock mode) or threads_left is 0
+	for() {
+		sleep(100`ms);
+		Time end = getTimeNsec();
+		Duration delta = end - start;
+		if(is_tty) {
+			printf(" %.1f\r", delta`ds); // progress display; "\r" rewinds to the start of the line
+			fflush(stdout);
+		}
+		if( clock_mode && delta >= duration`s ) {
+			break;
+		}
+		else if( !clock_mode && threads_left == 0 ) {
+			break;
+		}
+	}
+}
+
+struct bench_sem { // single-slot semaphore; all state lives in ptr
+	struct $thread * volatile ptr; // 0p: empty, 1p: a post is pending, anything else: the thread parked in wait()
+};
+
+static inline { // constructor, destructor, wait and post for bench_sem
+	void  ?{}(bench_sem & this) {
+		this.ptr = 0p; // start empty
+	}
+
+	void ^?{}(bench_sem & this) {}
+
+	bool wait(bench_sem & this) { // returns true if the caller parked, false if a pending post was consumed
+		for() {
+			struct $thread * expected = this.ptr;
+			if(expected == 1p) {
+				if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+					return false;
+				}
+			}
+			else {
+				/* paranoid */ verify( expected == 0p );
+				if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+					park(); // our thread pointer is published in ptr; a later post() unparks it
+					return true;
+				}
+			}
+
+		}
+	}
+
+	bool post(bench_sem & this) { // returns true if a parked thread was woken, false otherwise
+		for() {
+			struct $thread * expected = this.ptr;
+			if(expected == 1p) return false; // a post is already pending; nothing is added
+			if(expected == 0p) {
+				if(__atomic_compare_exchange_n(&this.ptr, &expected, 1p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+					return false;
+				}
+			}
+			else {
+				if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+					unpark( expected ); // expected holds the waiter stored by wait()
+					return true;
+				}
+			}
+		}
+	}
+}
Index: benchmark/readyQ/rq_bench.hpp
===================================================================
--- benchmark/readyQ/rq_bench.hpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
+++ benchmark/readyQ/rq_bench.hpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -0,0 +1,386 @@
+#include <cassert>
+
+#include <time.h>										// timespec
+#include <sys/time.h>									// timeval
+
+enum { TIMEGRAN = 1000000000LL };					// nanoseconds per second; getTimeNsec and the to_/from_ helpers below count in nanoseconds
+
+
+#include <libfibre/fibre.h>
+
+
+
+volatile bool stop = false; // stop flag polled by benchmark fibres in clock mode
+bool clock_mode; // true: timed run; false: iteration-count run (set by BENCH_OPT_PARSE)
+double duration = -1; // run length in seconds (-d); BENCH_OPT_PARSE defaults it to 5 when neither -d nor -i is positive
+unsigned long long stop_count = 0; // iteration limit (-i)
+unsigned nprocs = 1; // number of processors (-p)
+unsigned nthreads = 1; // number of threads (-t)
+
+volatile unsigned long long threads_left; // wait() returns in iteration mode once this is 0
+
+#define BENCH_OPT /* option-table entries shared by the benchmarks: -d, -i, -t, -p */ \
+	{'d', "duration",  "Duration of the experiments in seconds", duration }, \
+	{'i', "iterations",  "Number of iterations of the experiments", stop_count }, \
+	{'t', "nthreads",  "Number of threads to use", nthreads }, \
+	{'p', "nprocs",    "Number of processors to use", nprocs }
+
+#define BENCH_OPT_PARSE(name) /* parses argv into opt[] and picks the stop mode; needs opt, argc and argv in scope */ \
+	{ \
+		int opt_cnt = sizeof(opt) / sizeof(option_t); \
+		char **left; \
+		parse_args( argc, argv, opt, opt_cnt, "[OPTIONS]...\n" name, &left ); \
+		if(duration > 0 && stop_count > 0) { \
+			fprintf(stderr, "--duration and --iterations cannot be used together\n"); \
+			print_args_usage(argc, argv, opt, opt_cnt, "[OPTIONS]...\n" name, true); /* does not return */ \
+		} else if(duration > 0) { \
+			clock_mode = true; \
+			stop_count = 0xFFFFFFFFFFFFFFFF; /* iteration limit never triggers in clock mode */ \
+			printf("Running for %lf seconds\n", duration); \
+		} else if(stop_count > 0) { \
+			clock_mode = false; \
+			printf("Running for %llu iterations\n", stop_count); \
+		} else { \
+			duration = 5; clock_mode = true;\
+			printf("Running for %lf seconds\n", duration); \
+		} \
+	}
+
+uint64_t getTimeNsec() { // current CLOCK_REALTIME reading as a nanosecond count
+	timespec curr;
+	clock_gettime( CLOCK_REALTIME, &curr ); // NOTE(review): the realtime clock can be stepped; callers here only take differences, so CLOCK_MONOTONIC may fit better - confirm
+	return (int64_t)curr.tv_sec * TIMEGRAN + curr.tv_nsec;
+}
+
+uint64_t to_miliseconds( uint64_t durtn ) { return durtn / (TIMEGRAN / 1000LL); } // nanoseconds -> whole milliseconds
+double to_fseconds(uint64_t durtn ) { return durtn / (double)TIMEGRAN; } // nanoseconds -> seconds as a double
+uint64_t from_fseconds(double sec) { return sec * TIMEGRAN; } // seconds -> nanoseconds, converted to uint64_t
+
+void wait(const uint64_t & start, bool is_tty) { // polls every 100000us until the duration elapses (clock mode) or threads_left is 0
+	for(;;) {
+		Fibre::usleep(100000);
+		uint64_t end = getTimeNsec();
+		uint64_t delta = end - start;
+		if(is_tty) {
+			printf(" %.1f\r", to_fseconds(delta)); // progress display; "\r" rewinds to the start of the line
+			fflush(stdout);
+		}
+		if( clock_mode && delta >= from_fseconds(duration) ) {
+			break;
+		}
+		else if( !clock_mode && threads_left == 0 ) {
+			break;
+		}
+	}
+}
+
+class bench_sem { // single-slot semaphore; all state lives in ptr
+	Fibre * volatile ptr = nullptr; // nullptr: empty, (Fibre*)1: a post is pending, anything else: the fibre parked in wait()
+public:
+	inline bool wait() { // returns true if the caller parked, false if a pending post was consumed
+		static Fibre * const ready  = reinterpret_cast<Fibre * const>(1ull);
+		for(;;) {
+			Fibre * expected = this->ptr;
+			if(expected == ready) {
+				if(__atomic_compare_exchange_n(&this->ptr, &expected, nullptr, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+					return false;
+				}
+			}
+			else {
+				/* paranoid */ assert( expected == nullptr );
+				if(__atomic_compare_exchange_n(&this->ptr, &expected, fibre_self(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+					fibre_park(); // our fibre pointer is published in ptr; a later post() unparks it
+					return true;
+				}
+			}
+
+		}
+	}
+
+	inline bool post() { // returns true if a parked fibre was woken, false otherwise
+		static Fibre * const ready  = reinterpret_cast<Fibre * const>(1ull);
+		for(;;) {
+			Fibre * expected = this->ptr;
+			if(expected == ready) return false; // a post is already pending; nothing is added
+			if(expected == nullptr) {
+				if(__atomic_compare_exchange_n(&this->ptr, &expected, ready, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+					return false;
+				}
+			}
+			else {
+				if(__atomic_compare_exchange_n(&this->ptr, &expected, nullptr, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+					fibre_unpark( expected ); // expected holds the waiter stored by wait()
+					return true;
+				}
+			}
+		}
+	}
+};
+
+// ==========================================================================================
+#include <cstring>
+
+//-----------------------------------------------------------------------------
+// Typed argument parsing
+bool parse_yesno(const char * arg, bool & value ) { // accepts exactly "yes" or "no"; returns false and leaves value alone otherwise
+	if(strcmp(arg, "yes") == 0) {
+		value = true;
+		return true;
+	}
+
+	if(strcmp(arg, "no") == 0) {
+		value = false;
+		return true;
+	}
+
+	return false;
+}
+
+bool parse_settrue (const char *, bool & value ) { // ignores the argument text and sets value to true; parse_args treats this parser as "takes no argument"
+	value = true;
+	return true;
+}
+
+bool parse_setfalse(const char *, bool & value )  { // ignores the argument text and sets value to false; parse_args treats this parser as "takes no argument"
+	value = false;
+	return true;
+}
+
+bool parse(const char * arg, const char * & value ) { // string option: stores the pointer itself, no copy is made
+	value = arg;
+	return true;
+}
+
+bool parse(const char * arg, int & value) { // base-10 int; returns false on trailing characters or a value outside [INT_MIN, INT_MAX]
+	char * end;
+	long long r = strtoll(arg, &end, 10); // keep the full width so the range check sees the real value
+	if(*end != '\0' || r < INT_MIN || r > INT_MAX) return false;
+
+	value = r;
+	return true;
+}
+
+bool parse(const char * arg, unsigned & value) { // base-10 unsigned; returns false on trailing characters or a value above UINT_MAX
+	char * end;
+	unsigned long long int r = strtoull(arg, &end, 10);
+	if(*end != '\0') return false;
+	if(r > UINT_MAX) return false;
+
+	value = r;
+	return true;
+}
+
+bool parse(const char * arg, unsigned long & value) { // base-10 unsigned long; returns false on trailing characters or a value above ULONG_MAX
+	char * end;
+	unsigned long long int r = strtoull(arg, &end, 10);
+	if(*end != '\0') return false;
+	if(r > ULONG_MAX) return false; // can only trigger where unsigned long is narrower than unsigned long long
+
+	value = r;
+	return true;
+}
+
+bool parse(const char * arg, unsigned long long & value) { // base-10 unsigned long long; returns false on trailing characters
+        char * end;
+        unsigned long long int r = strtoull(arg, &end, 10);
+        if(*end != '\0') return false;
+        if(r > ULLONG_MAX) return false; // never true: r has the same type as the limit, so overflow is not detected here
+
+        value = r;
+        return true;
+}
+
+bool parse(const char * arg, double & value) { // floating-point option via strtod; returns false on trailing characters
+	char * end;
+	double r = strtod(arg, &end);
+	if(*end != '\0') return false;
+
+	value = r;
+	return true;
+}
+
+//-----------------------------------------------------------------------------
+struct option_t { // one command-line option: names, help text, destination variable and its parser
+      char short_name;
+      const char * long_name;
+      const char * help;
+      void * variable; // type-erased address of the destination variable
+      bool (*parse_fun)(const char *, void * ); // type-erased parser; called with (argument text, variable)
+
+	template<typename T>
+	inline option_t( char short_name, const char * long_name, const char * help, T & variable ) { // selects the parse() overload that matches T
+		this->short_name = short_name;
+		this->long_name  = long_name;
+		this->help       = help;
+		this->variable   = reinterpret_cast<void*>(&variable);
+		this->parse_fun  = reinterpret_cast<bool (*)(const char *, void * )>(static_cast<bool (*)(const char *, T & )>(parse)); // NOTE(review): later called through the erased pointer type - relies on T& and void* being passed alike
+	}
+
+	template<typename T>
+	inline option_t( char short_name, const char * long_name, const char * help, T & variable, bool (*parse)(const char *, T & )) { // same, with an explicitly supplied parser
+		this->short_name = short_name;
+		this->long_name  = long_name;
+		this->help       = help;
+		this->variable   = reinterpret_cast<void*>(&variable);
+		this->parse_fun  = reinterpret_cast<bool (*)(const char *, void * )>(parse);
+	}
+};
+
+extern option_t last_option; // declaration only; this header neither defines nor uses it
+
+
+//-----------------------------------------------------------------------------
+#include <cstdint>
+#include <climits>
+#include <errno.h>
+#include <unistd.h>
+extern "C" { // C declarations used by the option parser below
+	#include <getopt.h>
+	#include <sys/ioctl.h>
+
+	extern FILE * stderr;
+	extern FILE * stdout;
+
+	extern int fileno(FILE *stream);
+
+	extern int fprintf ( FILE * stream, const char * format, ... );
+
+	extern          long long int strtoll (const char* str, char** endptr, int base);
+	extern unsigned long long int strtoull(const char* str, char** endptr, int base);
+	extern                 double strtod  (const char* str, char** endptr);
+}
+
+static void usage(char * cmd, option_t options[], size_t opt_count, const char * usage, FILE * out)  __attribute__ ((noreturn)); // forward declaration; the definition is at the end of this header
+
+//-----------------------------------------------------------------------------
+// getopt_long wrapping
+void parse_args( // runs getopt_long over argv and stores each value through the matching option's parser
+	int argc,
+	char * argv[],
+	option_t options[],
+	size_t opt_count,
+	const char * usage_msg,
+	char ** * left // if non-null, receives argv + optind (the first non-option argument)
+) {
+	struct option optarr[opt_count + 2]; // one slot per option, plus "help" and the zero terminator
+	{
+		int idx = 0;
+		for(int i = 0; i < opt_count; i++) {
+			if(options[i].long_name) {
+				optarr[idx].name = options[i].long_name;
+				optarr[idx].flag = nullptr;
+				optarr[idx].val  = options[i].short_name;
+				if(    ((intptr_t)options[i].parse_fun) == ((intptr_t)parse_settrue)
+				    || ((intptr_t)options[i].parse_fun) == ((intptr_t)parse_setfalse) ) {
+					optarr[idx].has_arg = no_argument;
+				} else {
+					optarr[idx].has_arg = required_argument;
+				}
+				idx++;
+			}
+		}
+		optarr[idx+0].name = "help";
+		optarr[idx+0].has_arg = no_argument;
+		optarr[idx+0].flag = 0;
+		optarr[idx+0].val = 'h';
+		optarr[idx+1].name = 0;
+		optarr[idx+1].has_arg = no_argument;
+		optarr[idx+1].flag = 0;
+		optarr[idx+1].val = 0;
+	}
+
+	char optstring[opt_count * 2 + 2]; // worst case: "x:" per option, then 'h' and the terminating '\0'
+	for(auto & o : optstring) {
+		o = '\0';
+	}
+	{
+		int idx = 0;
+		for(int i = 0; i < opt_count; i++) {
+			optstring[idx] = options[i].short_name;
+			idx++;
+			if(    ((intptr_t)options[i].parse_fun) != ((intptr_t)parse_settrue)
+			    && ((intptr_t)options[i].parse_fun) != ((intptr_t)parse_setfalse) ) {
+				optstring[idx] = ':';
+				idx++;
+			}
+		}
+		optstring[idx+0] = 'h';
+		optstring[idx+1] = '\0';
+	}
+
+	FILE * out = stderr;
+	for(;;) {
+		int idx = 0;
+		int opt = getopt_long(argc, argv, optstring, optarr, &idx);
+		switch(opt) {
+			case -1:
+				if(left != nullptr) *left = argv + optind;
+				return;
+			case 'h':
+				out = stdout; // deliberate fall-through: -h prints the usage on stdout
+			case '?':
+				usage(argv[0], options, opt_count, usage_msg, out); // does not return
+			default:
+				for(int i = 0; i < opt_count; i++) {
+					if(opt == options[i].short_name) {
+						const char * arg = optarg ? optarg : "";
+						bool success = options[i].parse_fun( arg, options[i].variable );
+						if(success) goto NEXT_ARG;
+
+						fprintf(out, "Argument '%s' for option %c could not be parsed\n\n", arg, (char)opt);
+						usage(argv[0], options, opt_count, usage_msg, out);
+					}
+				}
+				std::abort(); // getopt_long returned a value that matches no option
+		}
+		NEXT_ARG:;
+	}
+}
+
+//-----------------------------------------------------------------------------
+// Print usage
+static void printopt(FILE * out, int width, int max, char sn, const char * ln, const char * help) { // prints one option line, wrapping the help text
+	int hwidth = max - (11 + width); // columns left for help text; the continuation lines below indent by width + 11
+	if(hwidth <= 0) hwidth = max;
+
+	fprintf(out, "  -%c, --%-*s   %.*s\n", sn, width, ln, hwidth, help);
+	for(;;) {
+		help += std::min(strlen(help), (unsigned long)hwidth); // skip the chunk just printed
+		if('\0' == *help) break;
+		fprintf(out, "%*s%.*s\n", width + 11, "", hwidth, help); // continuation line, indented to the help column
+	}
+}
+
+__attribute__((noreturn)) void print_args_usage(int , char * argv[], option_t options[], size_t opt_count, const char * usage_msg, bool error) { // prints the usage to stderr if error is set, else to stdout; never returns
+	usage(argv[0], options, opt_count, usage_msg, error ? stderr : stdout);
+}
+
+static __attribute__((noreturn)) void usage(char * cmd, option_t options[], size_t opt_count, const char * help, FILE * out) { // prints the usage text and exits
+	int width = 0; // length of the longest long option name, used to align the help column
+	{
+		for(int i = 0; i < opt_count; i++) {
+			if(options[i].long_name) {
+				int w = strlen(options[i].long_name);
+				if(w > width) width = w;
+			}
+		}
+	}
+
+	int max_width = 1000000; // used when out is not a terminal
+	int outfd = fileno(out);
+	if(isatty(outfd)) {
+		struct winsize size;
+		int ret = ioctl(outfd, TIOCGWINSZ, &size); // ask the terminal for its column count
+		if(ret < 0) abort(); // "ioctl error: (%d) %s\n", (int)errno, strerror(errno)
+		max_width = size.ws_col;
+	}
+
+	fprintf(out, "Usage:\n  %s %s\n", cmd, help);
+
+	for(int i = 0; i < opt_count; i++) {
+		printopt(out, width, max_width, options[i].short_name, options[i].long_name, options[i].help);
+	}
+	fprintf(out, "  -%c, --%-*s   %s\n", 'h', width, "help", "print this help message");
+	exit(out == stdout ? 0 : 1); // status 0 when printed to stdout, 1 otherwise
+}
+
Index: benchmark/rmit.py
===================================================================
--- benchmark/rmit.py	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
+++ benchmark/rmit.py	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -0,0 +1,236 @@
+#!/usr/bin/python3
+"""
+Python Script to implement R.M.I.T. testing : Randomized Multiple Interleaved Trials
+
+./rmit.py run COMMAND CANDIDATES
+-t trials
+-o option:values
+"""
+
+
+import argparse
+import datetime
+import itertools
+import os
+import random
+import re
+import subprocess
+import sys
+
+
+def parse_range(x):  # expand comma-separated ints and inclusive "a-b" spans, e.g. "1,3-5" -> [1, 3, 4, 5]
+    result = []
+    for part in x.split(','):
+        if '-' in part:
+            a, b = part.split('-')
+            a, b = int(a), int(b)
+            result.extend(range(a, b + 1))  # b + 1 makes the upper bound inclusive
+        else:
+            a = int(part)
+            result.append(a)
+    return result
+
+class DependentOpt:  # an option whose value is an expression over other options
+	def __init__(self, key, value):
+		self.key = key  # option name
+		self.value = value  # expression text, evaluated later by eval_one
+		self.vars = re.findall("[a-zA-Z]", value)  # every single letter in the expression is treated as an option name
+
+def parse_option(opt):
+	match = re.search("^(.*):(.*)$", opt)
+	if not match :
+		print('ERROR: extra options should match pattern .*:.*, got {}'.format(opt), file=sys.stderr)
+		sys.exit(1)
+
+	key    = match.group(1)
+	values = match.group(2)
+
+	try:
+		num = int(values)
+		return key, [num]
+	except:
+		pass
+
+	if re.search("^[0-9-,]+$", values):
+		values = parse_range(values)
+		return key, [v for v in values]
+	else:
+		return key, DependentOpt(key, values)
+
+def eval_one(fmt, vals):  # substitute each (name, value) pair into fmt and evaluate the resulting arithmetic
+	orig = fmt
+	for k, v in vals:
+		fmt = fmt.replace(k, str(v))  # plain text replacement of every occurrence of k
+
+	if not re.search("^[0-9-/*+ ]+$", fmt):  # only digits, spaces and - / * + may remain
+		print('ERROR: pattern option {} (interpreted as {}) could not be evaluated'.format(orig, fmt), file=sys.stderr)
+		sys.exit(1)
+
+	return eval(fmt)  # NOTE(review): eval() on option text; the character whitelist above is the only guard, and '/' yields a float
+
+def eval_options(opts):  # turn {key: values or DependentOpt} into a list of alternatives, each a list of argv fragments
+	dependents = [d for d in opts.values() if type(d) is DependentOpt]
+	processed = []  # keys already emitted through a dependent option
+	nopts = []
+	for d in dependents:
+		processed.append(d.key)
+		lists = []
+		for dvar in d.vars:
+			if not dvar in opts.keys():
+				print('ERROR: extra pattern option {}:{} uses unknown key {}'.format(d.key,d.value,dvar), file=sys.stderr)
+				sys.exit(1)
+
+			lists.append([(dvar, o) for o in opts[dvar]])  # iterates opts[dvar], so it must be a value list
+			processed.append(dvar)
+
+		kopt = []
+		for vals in list(itertools.product(*lists)):  # every combination of the referenced options' values
+			res = ['-{}'.format(d.key), "{}".format(eval_one(d.value, vals))]
+			for k, v in vals:
+				res.extend(['-{}'.format(k), "{}".format(v)])  # emit the inputs next to the derived value
+			kopt.append(res)
+		nopts.append(kopt)
+
+
+	for k, vals in opts.items():  # options not covered above: one ['-k', value] fragment per value
+		if k not in processed:
+			kopt = []
+			for v in vals:
+				kopt.append(['-{}'.format(k), "{}".format(v)])
+			nopts.append(kopt)
+
+	return nopts
+
+def actions_eta(actions):  # sum the integer that follows every '-d' across all action lists
+	time = 0
+	for a in actions:
+		i = 0
+		while i < len(a):
+			if a[i] == '-d':
+				i += 1  # step onto the value of '-d'
+				if i != len(a):  # a trailing '-d' without a value adds nothing
+					time += int(a[i])
+			i += 1
+	return time
+
+if __name__ == "__main__":
+	# ================================================================================
+	# parse command line arguments
+	formats = ['raw', 'csv']
+	parser = argparse.ArgumentParser(description='Python Script to implement R.M.I.T. testing : Randomized Multiple Interleaved Trials')
+	parser.add_argument('--list', help='List all the commands that would be run', action='store_true')
+	parser.add_argument('--format', help='How to print the result', choices=formats, default='csv')
+	parser.add_argument('--file', nargs='?', type=argparse.FileType('w'), default=sys.stdout)
+	parser.add_argument('-t', '--trials', help='Number of trials to run per combinaison', type=int, default=3)
+	parser.add_argument('-o','--option',action='append')
+	parser.add_argument('command', metavar='command', type=str, nargs=1, help='the command prefix to run')
+	parser.add_argument('candidates', metavar='candidates', type=str, nargs='*', help='the candidate suffix to run')
+
+	try:
+		options =  parser.parse_args()
+	except:
+		print('ERROR: invalid arguments', file=sys.stderr)
+		parser.print_help(sys.stderr)
+		sys.exit(1)
+
+	# ================================================================================
+	# Identify the commands to run
+	commands = ["./" + options.command[0] + "-" + c for c in options.candidates]
+	for c in commands:
+		if not os.path.isfile(c):
+			print('ERROR: invalid command {}, file does not exist'.format(c), file=sys.stderr)
+			sys.exit(1)
+
+		if not os.access(c, os.X_OK):
+			print('ERROR: invalid command {}, file not executable'.format(c), file=sys.stderr)
+			sys.exit(1)
+
+
+	# ================================================================================
+	# Identify the options to run
+	opts = dict([parse_option(o) for o in options.option])
+
+	# Evaluate the options (options can depend on the value of other options)
+	opts = eval_options(opts)
+
+	# ================================================================================
+	# Figure out all the combinations to run
+	actions = []
+	for p in itertools.product(range(options.trials), commands, *opts):
+		act = [p[1]]
+		for o in p[2:]:
+			act.extend(o)
+		actions.append(act)
+
+	# ================================================================================
+	# Figure out all the combinations to run
+	if options.list:
+		for a in actions:
+			print(" ".join(a))
+		sys.exit(0)
+
+
+	# ================================================================================
+	# Prepare to run
+	print(actions)
+
+	# find expected time
+	time = actions_eta(actions)
+	print("Running {} trials{}".format(len(actions), "" if time == 0 else " (expecting to take {}".format(str(datetime.timedelta(seconds=int(time)))) ))
+
+	random.shuffle(actions)
+	result = []
+
+	# ================================================================================
+	# Run
+	for i, a in enumerate(actions):
+		sa = " ".join(a)
+		print("{}/{} : {}          \r".format(i, len(actions), sa), end = '')
+		fields = {}
+		with subprocess.Popen( a, stdout  = subprocess.PIPE, stderr  = subprocess.PIPE) as proc:
+			out, err = proc.communicate()
+			if proc.returncode != 0:
+				print("ERROR: command '{}' encountered error, returned code {}".format(sa, proc.returncode), file=sys.stderr)
+				print(err.decode("utf-8"))
+				sys.exit(1)
+			for s in out.decode("utf-8").splitlines():
+				match = re.search("^(.*):(.*)$", s)
+				if match:
+					fields[match.group(1).strip()] = float(match.group(2).strip().replace(',',''))
+
+		result.append([a[0][2:], sa, fields])
+
+	print("Done                                                                                ")
+
+	# ================================================================================
+	# Print csv
+	if options.format == 'raw':
+		for r in result:
+			print(r, file=options.file)
+		sys.exit(0)
+
+	# ================================================================================
+	# Print csv
+	if options.format == 'csv':
+		# Clean result
+		headers = ["series", "command"]
+		data = []
+		first = True
+		for r in result:
+			if first:
+				first = False
+				headers.extend(r[2].keys())
+			else :
+				pass
+
+			d = [r[0], r[1]]
+			for k in headers[2:]:
+				d.append(r[2][k])
+
+			data.append(d)
+
+		# Print csv
+		print(",\t".join(headers), file=options.file)
+		for d in data:
+			print(",\t".join(["{}".format(dd) for dd in d]), file=options.file)
+		sys.exit(0)
Index: configure.ac
===================================================================
--- configure.ac	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ configure.ac	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -28,12 +28,14 @@
 # New AST toggling support
 AH_TEMPLATE([CFA_USE_NEW_AST],[Sets whether or not to use the new-ast, this is adefault value and can be overrided by --old-ast and --new-ast])
+DEFAULT_NEW_AST="False"
 AC_ARG_ENABLE(new-ast,
 	[  --enable-new-ast     whether or not to use new ast as the default AST algorithm],
 	[case "${enableval}" in
-		yes) newast=true ;;
-		no)  newast=false ;;
+		yes) newast=true ; DEFAULT_NEW_AST="True"  ;;
+		no)  newast=false; DEFAULT_NEW_AST="False" ;;
 		*) AC_MSG_ERROR([bad value ${enableval} for --enable-new-ast]) ;;
 	esac],[newast=false])
 AC_DEFINE_UNQUOTED([CFA_USE_NEW_AST], $newast)
+AC_SUBST(DEFAULT_NEW_AST)
 
 #==============================================================================
Index: doc/papers/concurrency/mail2
===================================================================
--- doc/papers/concurrency/mail2	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ doc/papers/concurrency/mail2	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -1,2 +1,3 @@
+
 Date: Wed, 26 Jun 2019 20:12:38 +0000
 From: Aaron Thomas <onbehalfof@manuscriptcentral.com>
@@ -1074,2 +1075,213 @@
 Software: Practice and Experience Editorial Office
 
+
+
+Date: Thu, 15 Oct 2020 13:48:52 +0000
+From: Richard Jones <onbehalfof@manuscriptcentral.com>
+Reply-To: R.E.Jones@kent.ac.uk
+To: tdelisle@uwaterloo.ca, pabuhr@uwaterloo.ca
+Subject: Software: Practice and Experience - Decision on Manuscript ID
+ SPE-19-0219.R3
+
+15-Oct-2020
+
+Dear Dr Buhr,
+
+It is a pleasure to accept your manuscript entitled "Advanced Control-flow and Concurrency in Cforall" in its current form for publication in Software: Practice and Experience.  
+
+Please note although the manuscript is accepted the files will now be checked to ensure that everything is ready for publication, and you may be contacted if final versions of files for publication are required.
+
+Your article cannot be published until the publisher has received the appropriate signed license agreement. Within the next few days the corresponding author will receive an email from Wiley's Author Services system which will ask them to log in and will present them with the appropriate license for completion.
+
+Thank you for your fine contribution.
+
+Sincerely,
+Richard
+
+Prof. Richard Jones
+Editor, Software: Practice and Experience
+R.E.Jones@kent.ac.uk
+
+P.S. - You can help your research get the attention it deserves! Check out Wiley's free Promotion Guide for best-practice recommendations for promoting your work at www.wileyauthors.com/eeo/guide. And learn more about Wiley Editing Services which offers professional video, design, and writing services to create shareable video abstracts, infographics, conference posters, lay summaries, and research news stories for your research at www.wileyauthors.com/eeo/promotion.
+
+This journal accepts artwork submissions for Cover Images. This is an optional service you can use to help increase article exposure and showcase your research. For more information, including artwork guidelines, pricing, and submission details, please visit the Journal Cover Image page at www.wileyauthors.com/eeo/covers. If you want help creating an image, Wiley Editing Services offers a professional cover image design service that creates eye-catching images, ready to be showcased on the journal cover at www.wileyauthors.com/eeo/design.
+
+
+
+Date: Fri, 16 Oct 2020 12:44:42 +0000
+From: Mayank Roy Chowdhury <onbehalfof@manuscriptcentral.com>
+Reply-To: speoffice@wiley.com
+To: pabuhr@uwaterloo.ca
+Subject: Manuscript Accepted - Please submit final updates to SPE-19-0219.R3 [email ref: ENR-AW-1-c]
+
+16-Oct-2020
+
+Dear Dr. Buhr,
+
+Manuscript id: SPE-19-0219.R3
+Manuscript title: Advanced Control-flow and Concurrency in Cforall
+
+Although your manuscript has been accepted for publication it is now being returned to your author center for you to review and make any final adjustments or corrections prior to production and publication.
+
+Any special instructions will be listed below:
+1) Funding Information added in ScholorOne but missing in main document, Kindly add the Funding information in main document.
+2) Please provide the clean version of the manuscript without any highlights or tracked changes.
+3) Kindly check and make sure citations for all figures and Tables are present in the main document
+
+Please now log back into your Scholar One Author Center and click on the "Manuscripts Accepted for First Look" queue. In order to update the submission, click on the "submit updated manuscript" link in the "Actions" column and follow the steps as you would during a manuscript submission process.
+
+On the File Upload screen please upload the FINAL versions of all the files, including print quality image files. For information about image quality requirements, please refer to the guidelines at https://authorservices.wiley.com/asset/photos/electronic_artwork_guidelines.pdf.
+
+Instructions for uploading replacement files:
+1. On the "File Upload" step, click on the "edit" button for the file you wish to replace.
+2. In the "Upload a later version" section, browse to locate the replacement final version.
+3. Add any comments concerning the replacement (e.g. "high res image").
+4. Select whether the new file is a minor or major version (we suggest you select minor version)
+5. Click upload.
+6. Click 'Submit' when all the files have been uploaded and you will receive an automated email to say that submission is successful.
+
+Please submit your updates within the next 7 days to ensure there are no unnecessary delays in production.
+
+Sincerely,
+Software: Practice and Experience Editorial Office
+
+
+
+From: SPE Office <speoffice@wiley.com>
+To: "Peter A. Buhr" <pabuhr@uwaterloo.ca>
+Subject: Re: Manuscript Accepted - Please submit final updates to SPE-19-0219.R3 [email ref: ENR-AW-1-c]
+Date: Mon, 19 Oct 2020 17:04:24 +0000
+
+Dear Dr. Buhr,
+
+Thank you very much for contacting the Editorial Office.
+
+I would like to let you know that the files has been found in order and moved to production.
+
+Plesae let me know for further assistance in this regard.
+
+Best Regards
+
+Mayank Roy Chowdhury
+Editorial Assistant
+Software practice and Experience
+________________________________
+From: Peter A. Buhr <pabuhr@uwaterloo.ca>
+Sent: Sunday, October 18, 2020 2:00 PM
+To: SPE Office <speoffice@wiley.com>
+Cc: Thierry Delisle <tdelisle@uwaterloo.ca>
+Subject: Re: Manuscript Accepted - Please submit final updates to SPE-19-0219.R3 [email ref: ENR-AW-1-c]
+
+       This is an external email.
+
+    Mayank Roy Chowdhury <onbehalfof@manuscriptcentral.com> writes:
+
+    Instructions for uploading replacement files:
+    1. On the "File Upload" step, click on the "edit" button for the file you wish to replace.
+    2. In the "Upload a later version" section, browse to locate the replacement final version.
+    3. Add any comments concerning the replacement (e.g. "high res image").
+    4. Select whether the new file is a minor or major version (we suggest you select minor version)
+    5. Click upload.
+    6. Click 'Submit' when all the files have been uploaded and you will receive an automated email to say that submission is successful.
+
+There was no "edit" button on the "File Upload" page, so I just upload the
+final version of the PDF and source files using the mechanism on the "File
+Upload" page and submitted that.
+
+
+
+Date: Tue, 20 Oct 2020 13:28:37 +0530
+To: "Dr. Peter Buhr" <pabuhr@uwaterloo.ca>
+From: jpcms@spi-global.com
+Subject: Information: Production Editor Contact Software:Practice and Experience  | Advanced Control-flow and Concurrency in C A
+
+Dear Dr. Peter Buhr,
+
+We are in the process of preparing "Advanced Control-flow and Concurrency in C A" for publication. Your production editor, Joel Pacaanas, will support you and your article throughout the process.
+
+Please get in touch with your Production Editor at SPEproofs@wiley.com;EllaMae.Navor@spi-global.com if you have any questions.
+		
+Sincerely,
+Booking-in Team,
+On behalf of Wiley
+
+Article ID: SPE_2925
+Article DOI: 10.1002/SPE.2925
+
+
+
+Date: Tue, 20 Oct 2020 10:33:04 +0000
+From: <cs-author@wiley.com>
+To: <pabuhr@uwaterloo.ca>
+Subject: In Production: Your article accepted in Software: Practice and Experience
+
+Dear Peter Buhr,
+
+Article ID: SPE2925
+Article DOI: 10.1002/spe.2925
+Internal Article ID: 16922213
+Article: Advanced Control-flow and Concurrency in C A
+Journal: Software: Practice and Experience
+
+Congratulations on the acceptance of your article for publication in Software: Practice and Experience. 
+
+Your article has been received and the production process is now underway. We look forward to working with you and publishing your article. Using Wiley Author Services, you can track your article's progress.
+
+Please click below to login - if you are using a different email address than this one, you will need to manually assign this article to your Dashboard (see https://hub.wiley.com/docs/support/assigning-a-missing-article-to-my-dashboard-DOC-11871?utm_source=new%20user%20invitation&utm_medium=email How do I assign a missing article to My Dashboard?): 
+
+https://authorservices.wiley.com/index.html#login?campaign=email_invitation-new
+
+If applicable, a list of available actions will appear below - check out your Author Services Dashboard for all actions related to your articles.
+
+Sign your license agreement (REQUIRED)  -- you will receive an email when this task is ready on your dashboard. Track your article's progress to publication. Access your published article. Invite colleagues to view your published article.
+If you need any assistance, please click http://www.wileyauthors.com/help?utm_source=new%20user%20invitation&utm_medium=email here to view our Help section.
+
+Sincerely,
+Wiley Author Services
+
+P.S. - Some journals accept artwork submissions for Cover Images. This is an optional service you can use to help increase article exposure and showcase your research. Pricing and placement options vary by journal. For more information, including artwork guidelines, pricing, and submission details, please visit the https://authorservices.wiley.com/author-resources/Journal-Authors/Promotion/journal-cover-image.html?utm_source=as&utm_medium=email&utm_term=invitation_msg&utm_content=covers&utm_campaign=2019feb?campaign=email_invitation-new" target=_blank">Journal Cover Image page. If you want help creating an image, Wiley Editing Services offers a professional https://wileyeditingservices.com/en/article-promotion/cover-image-design.html?utm_source=as&utm_medium=email&utm_term=ie&utm_content=cid&utm_campaign=prodops" target=_blank">Cover Image Design service that creates eye-catching images, ready to be showcased on the journal cover.
+
+
+
+Date: Thu, 22 Oct 2020 20:21:49 +0000
+From: <cs-author@wiley.com>
+To: <pabuhr@uwaterloo.ca>
+Subject: You have actions to complete in Author Services
+
+Dear Peter Buhr,
+
+Article ID: SPE2925
+Article DOI: 10.1002/spe.2925
+Internal Article ID: 16922213
+Article: Advanced Control-flow and Concurrency in C A
+Journal: Software: Practice and Experience
+
+For the above article, you have the following open tasks:
+
+Sign your license agreement in order to publish your article. Simply click the Sign License button on your Wiley Author Services Dashboard (https://authorservices.wiley.com?campaign=email_license-notice1).
+
+Need any help? Please visit our Author Support Center (https://authorsupport.wiley.com/s/).
+
+Sincerely,
+Wiley Author Services
+
+
+
+Date: Thu, 22 Oct 2020 23:13:07 +0000
+From: <cs-author@wiley.com>
+To: <pabuhr@uwaterloo.ca>
+Subject: License was successfully submitted! Thank you!
+
+Dear Peter Buhr,                                                                  
+
+Article ID: SPE2925
+Article DOI: 10.1002/spe.2925
+Internal Article ID: 16922213
+Article: Advanced Control-flow and Concurrency in C A  
+Journal: Software: Practice and Experience                                                                      
+                                                                          
+You've successfully completed license signing for your article - thank you! You can view your signed agreement at any time by visiting your Wiley Author Services Dashboard (https://authorservices.wiley.com?campaign=email_license-confirm).
+
+Sincerely,                                                                                 
+
+Wiley Author Services
Index: doc/proposals/vtable.md
===================================================================
--- doc/proposals/vtable.md	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ doc/proposals/vtable.md	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -512,4 +512,28 @@
 possibly like the one used to create the assertion.
 
+### Extension: Associated Types Use
+If the `associated_types.md` proposal is accepted the following trait could
+be added:
+
+    trait is_virtual(dtype T) {
+        dtype table;
+        // An example assertion:
+        const table & get_virtual_table(T &);
+    }
+
+There may be more assertions but there has to be at least one way to find
+the (possibly default) virtual table. It is required to construct instances
+of the type.
+
+Without the associated type it would look like this:
+
+    trait is_virtual(dtype T, dtype table) {
+        const table & get_virtual_table(T &);
+    }
+
+This is just a little bit longer to use, but it becomes more problematic if the
+user has to explicitly provide the table's name, as the table doesn't really
+have its own type name. If it does, it is probably mangled.
+
 ### Virtual Tables as Types
 Here we consider encoding plus the implementation of functions on it to be a
Index: doc/theses/andrew_beach_MMath/thesis-frontpgs.tex
===================================================================
--- doc/theses/andrew_beach_MMath/thesis-frontpgs.tex	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ doc/theses/andrew_beach_MMath/thesis-frontpgs.tex	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -5,4 +5,12 @@
 % TITLE PAGE
 %----------------------------------------------------------------------
+
+% Slowly generalizing.
+\ethesissetup{
+  author=Pat Neugraad,%
+  title={University of Waterloo E-Thesis Template for \LaTeX},
+  degree=phd,%
+  program=Zoology,%
+}
 
 \pagestyle{empty}
@@ -15,27 +23,23 @@
         \vspace*{1.0cm}
 
-        \Huge
-        {\bf University of Waterloo E-Thesis Template for \LaTeX }
-
-        \vspace*{1.0cm}
-
-        \normalsize
+        {\Huge\bf \eprint{title}}
+
+        \vspace*{1.0cm}
+
         by \\
 
         \vspace*{1.0cm}
 
-        \Large
-        Pat Neugraad \\
+        {\Large \eprint{author}} \\
 
         \vspace*{3.0cm}
 
-        \normalsize
         A thesis \\
         presented to the University of Waterloo \\ 
         in fulfillment of the \\
         thesis requirement for the degree of \\
-        Doctor of Philosophy \\
+        \eprint{degree} \\
         in \\
-        Zoology \\
+        \eprint{program} \\
 
         \vspace*{2.0cm}
@@ -45,5 +49,5 @@
         \vspace*{1.0cm}
 
-        \copyright\ Pat Neugraad 2017 \\
+        \copyright{} \eprint{author} 2017 \\
         \end{center}
 \end{titlepage}
Index: doc/theses/andrew_beach_MMath/uw-ethesis.cls
===================================================================
--- doc/theses/andrew_beach_MMath/uw-ethesis.cls	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ doc/theses/andrew_beach_MMath/uw-ethesis.cls	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -19,4 +19,9 @@
 %
 % Exported Names:
+%   \ethesissetup{<key-value-pairs>}
+%     Performs set-up (or a reconfiguration) of the document class. See the
+%     Set-Up Keys section for the keys that may be passed in. Use commas to
+%     separate key-value-pairs.
+%
 %   \ifformat{<format>}{<true>}{<false>}
 %     If the document's format is <format> than expands to <true> otherwise
@@ -27,13 +32,27 @@
 %     initial setup depends on the document format but they can be overriden
 %     with options in <setup> (set hyperref's \hypersetup for details).
+%
+%   \eprint{<key>}
+%     Expands to a human readable value tracked by the ethesis class. This
+%     can be used to retrieve and format values set during set up.
+%
+% Set-Up Keys:
+%   author=<text>
+%   title=<text>
+%   program=<text>
+%   subject=<text>
+%   keywords=<text>
+%   degree=masters|phd
+%   faculty=ahs|arts|eng|env|math|sci
 \NeedsTeXFormat{LaTeX2e}
-\ProvidesClass{uw-ethesis}[2020/03/24 v0.1 UW-eThesis Template Document Class]
+\ProvidesClass{uw-ethesis}[2020/10/25 v0.2 UW-eThesis Template Document Class]
 
 \RequirePackage{etoolbox}
+\RequirePackage{xkeyval}
 
 % Requested Format:
-\newrobustcmd*{\ethesis@format}{digital}
-\DeclareOption{print}{\renewrobustcmd*{\ethesis@format}{print}}
-\DeclareOption{digital}{\renewrobustcmd*{\ethesis@format}{digital}}
+\newrobustcmd*{\ethesis@@format}{digital}
+\DeclareOption{print}{\renewrobustcmd*{\ethesis@@format}{print}}
+\DeclareOption{digital}{\renewrobustcmd*{\ethesis@@format}{digital}}
 
 \ProcessOptions\relax
@@ -81,7 +100,7 @@
 % a recto page. This will often require an empty verso (left-hand side) page
 % that should not have the page number printed on it.
-\let\origdoublepage\cleardoublepage
+\let\ethesis@origdoublepage\cleardoublepage
 \newcommand{\clearemptydoublepage}{%
-  \clearpage{\pagestyle{empty}\origdoublepage}}
+  \clearpage{\pagestyle{empty}\ethesis@origdoublepage}}
 \let\cleardoublepage\clearemptydoublepage
 
@@ -89,9 +108,24 @@
 \renewcommand*{\bibname}{References}
 
-% Configurations
-\def\setThesisTitle#1{\newrobustcmd*{\ethesis@title}{#1}}
-\def\setThesisAuthor#1{\newrobustcmd*{\ethesis@author}{#1}}
-\def\setThesisSubject#1{\newrobustcmd*{\ethesis@subject}{#1}}
-\def\setThesisKeywords#1{\newrobustcmd*{\ethesis@keywords}{#1}}
+\newrobustcmd*\ethesissetup[1]{\setkeys{ethesis}{#1}}
+
+\define@cmdkeys{ethesis}[ethesis@@]{%
+  author,title,program,subject,keywords}
+
+\define@choicekey{ethesis}{degree}{masters,phd}{\def\ethesis@@degree{#1}}
+\define@choicekey{ethesis}{faculty}{ahs,arts,eng,env,math,sci}%
+  {\def\ethesis@@faculty{#1}}
+
+\newrobustcmd*\eprint[1]{% Print key #1: the long form (ethesis@long<key>) if defined, else the raw value (ethesis@@<key>).
+  \ifcsdef{ethesis@long#1}{\csuse{ethesis@long#1}}{%
+    \ifcsdef{ethesis@@#1}{\csuse{ethesis@@#1}}{%
+      \ClassError{uw-ethesis}{Unknown key `#1' passed to eprint}{Set the key with ethesissetup or check its spelling.}%
+      [UW-eThesis doesn't know how to print: #1 ]%
+    }%
+  }% every line end is commented out so no stray space surrounds the printed value
+}
+
+\newrobustcmd*\ethesis@longdegree{%
+  \ifdefstring{\ethesis@@degree}{phd}{Doctor of Philosophy}{Masters}}
 
 % Includes the hyperref package loading a number of defaults.
@@ -106,8 +140,8 @@
     pdfstartview={FitH},    % Fits the width of the page to the window.
   }
-  \ifdef{\ethesis@title}{\hypersetup{pdftitle={\ethesis@title}}}{}
-  \ifdef{\ethesis@author}{\hypersetup{pdfauthor={\ethesis@author}}}{}
-  \ifdef{\ethesis@subject}{\hypersetup{pdfsubject={\ethesis@subject}}}{}
-  \ifdef{\ethesis@keywords}{\hypersetup{pdfkeywords={\ethesis@keywords}}}{}
+  \ifdef{\ethesis@@title}{\hypersetup{pdftitle={\ethesis@@title}}}{}
+  \ifdef{\ethesis@@author}{\hypersetup{pdfauthor={\ethesis@@author}}}{}
+  \ifdef{\ethesis@@subject}{\hypersetup{pdfsubject={\ethesis@@subject}}}{}
+  \ifdef{\ethesis@@keywords}{\hypersetup{pdfkeywords={\ethesis@@keywords}}}{}
   \ifformat{print}{
     \hypersetup{
Index: doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/cforall.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/cforall.cpp	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ 	(revision )
@@ -1,43 +1,0 @@
-#include <cassert>
-#include <clib/cfathread.h>
-
-typedef cfathread_t thread_t;
-static_assert(sizeof(thread_t) == sizeof(void*), "thread_t musst be of same size as void*");
-
-#if !defined(__cplusplus)
-#error no __cplusplus define!
-#endif
-
-extern "C" {
-	//--------------------
-	// Basic thread support
-	thread_t thrdlib_create( void (*the_main)( thread_t ) ) {
-		return cfathread_create( the_main );
-	}
-
-	void thrdlib_join( thread_t handle ) {
-		cfathread_join( handle );
-	}
-
-	void thrdlib_park( thread_t ) {
-		cfathread_park();
-	}
-
-	void thrdlib_unpark( thread_t handle ) {
-		cfathread_unpark( handle );
-	}
-
-	void thrdlib_yield( void ) {
-		cfathread_yield();
-	}
-
-	//--------------------
-	// Basic kernel features
-	void thrdlib_init( int procs ) {
-		cfathread_setproccnt(procs);
-	}
-
-	void thrdlib_clean( void ) {
-		cfathread_setproccnt(1);
-	}
-}
Index: doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/cforall.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/cforall.hpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
+++ doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/cforall.hpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -0,0 +1,43 @@
+#include <cassert>
+#include <clib/cfathread.h>
+
+typedef cfathread_t thread_t;
+static_assert(sizeof(thread_t) == sizeof(void*), "thread_t musst be of same size as void*");
+
+#if !defined(__cplusplus)
+#error no __cplusplus define!
+#endif
+
+extern "C" {  // thrdlib_* entry points implemented by forwarding to the cfathread_* calls
+	//--------------------
+	// Basic thread support
+	thread_t thrdlib_create( void (*the_main)( thread_t ) ) {
+		return cfathread_create( the_main );
+	}
+
+	void thrdlib_join( thread_t handle ) {
+		cfathread_join( handle );
+	}
+
+	void thrdlib_park( thread_t ) {  // the handle is unused: cfathread_park() takes no argument
+		cfathread_park();
+	}
+
+	void thrdlib_unpark( thread_t handle ) {
+		cfathread_unpark( handle );
+	}
+
+	void thrdlib_yield( void ) {
+		cfathread_yield();
+	}
+
+	//--------------------
+	// Basic kernel features
+	void thrdlib_init( int procs ) {
+		cfathread_setproccnt(procs);  // set the processor count to the requested value
+	}
+
+	void thrdlib_clean( void ) {
+		cfathread_setproccnt(1);  // set the processor count back to 1
+	}
+}
Index: doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/fibre.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/fibre.cpp	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ 	(revision )
@@ -1,48 +1,0 @@
-#include <cassert>
-#include <libfibre/cfibre.h>
-
-typedef cfibre_t thread_t;
-static_assert(sizeof(thread_t) == sizeof(void*), "thread_t musst be of same size as void*");
-
-void * fibre_runner(void * arg) {
-	auto the_main = (void (*)( thread_t ))arg;
-	the_main( cfibre_self() );
-	return nullptr;
-}
-
-extern "C" {
-	//--------------------
-	// Basic thread support
-	thread_t thrdlib_create( void (*the_main)( thread_t ) ) {
-		thread_t fibre;
-		cfibre_create( &fibre, nullptr, fibre_runner, (void*)the_main );
-		return fibre;
-	}
-
-	void thrdlib_join( thread_t handle ) {
-		cfibre_join( handle, nullptr );
-	}
-
-	void thrdlib_park( thread_t handle ) {
-		assert( handle == cfibre_self() );
-		cfibre_park();
-	}
-
-	void thrdlib_unpark( thread_t handle ) {
-		cfibre_unpark( handle );
-	}
-
-	void thrdlib_yield( void ) {
-		cfibre_yield();
-	}
-
-	//--------------------
-	// Basic kernel features
-	void thrdlib_init( int procs ) {
-		cfibre_init_n(1, procs );
-	}
-
-	void thrdlib_clean( void ) {
-
-	}
-}
Index: doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/fibre.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/fibre.hpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
+++ doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/fibre.hpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -0,0 +1,48 @@
+#include <cassert>
+#include <libfibre/cfibre.h>
+
+typedef cfibre_t thread_t;
+static_assert(sizeof(thread_t) == sizeof(void*), "thread_t musst be of same size as void*");
+
+void * fibre_runner(void * arg) {  // adapter passed to cfibre_create: arg carries the user's main function
+	auto the_main = (void (*)( thread_t ))arg;  // recover the function pointer that thrdlib_create cast to void*
+	the_main( cfibre_self() );  // run the user's main with the current fibre as its handle
+	return nullptr;
+}
+
+extern "C" {  // thrdlib_* entry points implemented by forwarding to the cfibre_* calls
+	//--------------------
+	// Basic thread support
+	thread_t thrdlib_create( void (*the_main)( thread_t ) ) {
+		thread_t fibre;
+		cfibre_create( &fibre, nullptr, fibre_runner, (void*)the_main );  // fibre_runner casts the pointer back and calls it
+		return fibre;
+	}
+
+	void thrdlib_join( thread_t handle ) {
+		cfibre_join( handle, nullptr );
+	}
+
+	void thrdlib_park( thread_t handle ) {
+		assert( handle == cfibre_self() );  // only the calling fibre itself may be parked
+		cfibre_park();
+	}
+
+	void thrdlib_unpark( thread_t handle ) {
+		cfibre_unpark( handle );
+	}
+
+	void thrdlib_yield( void ) {
+		cfibre_yield();
+	}
+
+	//--------------------
+	// Basic kernel features
+	void thrdlib_init( int procs ) {
+		cfibre_init_n(1, procs );
+	}
+
+	void thrdlib_clean( void ) {  // intentionally empty: no clean-up is performed for this backend
+
+	}
+}
Index: doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/pthread.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/pthread.cpp	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ 	(revision )
@@ -1,99 +1,0 @@
-#include <pthread.h>
-#include <errno.h>
-#include <cstring>
-#include <cstdio>
-#include <iostream>
-
-#define CHECKED(x) { int err = x; if( err != 0 ) { std::cerr << "KERNEL ERROR: Operation \"" #x "\" return error " << err << " - " << strerror(err) << std::endl; std::abort(); } }
-
-struct __bin_sem_t {
-	pthread_mutex_t 	lock;
-	pthread_cond_t  	cond;
-	int     		val;
-
-	__bin_sem_t() {
-		// Create the mutex with error checking
-		pthread_mutexattr_t mattr;
-		pthread_mutexattr_init( &mattr );
-		pthread_mutexattr_settype( &mattr, PTHREAD_MUTEX_ERRORCHECK_NP);
-		pthread_mutex_init(&lock, &mattr);
-
-		pthread_cond_init (&cond, nullptr);
-		val = 0;
-	}
-
-	~__bin_sem_t() {
-		CHECKED( pthread_mutex_destroy(&lock) );
-		CHECKED( pthread_cond_destroy (&cond) );
-	}
-
-	void wait() {
-		CHECKED( pthread_mutex_lock(&lock) );
-			while(val < 1) {
-				pthread_cond_wait(&cond, &lock);
-			}
-			val -= 1;
-		CHECKED( pthread_mutex_unlock(&lock) );
-	}
-
-	bool post() {
-		bool needs_signal = false;
-
-		CHECKED( pthread_mutex_lock(&lock) );
-			if(val < 1) {
-				val += 1;
-				pthread_cond_signal(&cond);
-				needs_signal = true;
-			}
-		CHECKED( pthread_mutex_unlock(&lock) );
-
-		return needs_signal;
-	}
-};
-
-#undef CHECKED
-
-//--------------------
-// Basic types
-struct pthread_runner_t {
-	pthread_t handle;
-	__bin_sem_t sem;
-};
-typedef pthread_runner_t * thread_t;
-
-static_assert(sizeof(thread_t) == sizeof(void*), "thread_t musst be of same size as void*");
-
-extern "C" {
-	//--------------------
-	// Basic thread support
-	thread_t thrdlib_create( void (*main)( thread_t ) ) {
-		thread_t thrd = new pthread_runner_t();
-		int r = pthread_create( &thrd->handle, nullptr, (void *(*)(void *))main, thrd );
-		if( r != 0 ) std::abort();
-		return thrd;
-	}
-
-	void thrdlib_join( thread_t handle ) {
-		void * ret;
-		int r = pthread_join( handle->handle, &ret );
-		if( r != 0 ) std::abort();
-		delete handle;
-	}
-
-	void thrdlib_park( thread_t handle ) {
-		handle->sem.wait();
-	}
-
-	void thrdlib_unpark( thread_t handle ) {
-		handle->sem.post();
-	}
-
-	void thrdlib_yield( void ) {
-		int r = pthread_yield();
-		if( r != 0 ) std::abort();
-	}
-
-	//--------------------
-	// Basic kernel features
-	void thrdlib_init( int ) {}
-}
Index: doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/pthread.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/pthread.hpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
+++ doc/theses/thierry_delisle_PhD/code/readQ_example/thrdlib/pthread.hpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -0,0 +1,99 @@
+#include <pthread.h>
+#include <errno.h>
+#include <cstring>
+#include <cstdio>
+#include <iostream>
+
+#define CHECKED(x) { int err = x; if( err != 0 ) { std::cerr << "KERNEL ERROR: Operation \"" #x "\" return error " << err << " - " << strerror(err) << std::endl; std::abort(); } }
+
+struct __bin_sem_t {
+	pthread_mutex_t 	lock;
+	pthread_cond_t  	cond;
+	int     		val;
+
+	__bin_sem_t() {
+		// Create the mutex with error checking
+		pthread_mutexattr_t mattr;
+		pthread_mutexattr_init( &mattr );
+		pthread_mutexattr_settype( &mattr, PTHREAD_MUTEX_ERRORCHECK_NP);
+		pthread_mutex_init(&lock, &mattr);
+
+		pthread_cond_init (&cond, nullptr);
+		val = 0;
+	}
+
+	~__bin_sem_t() {
+		CHECKED( pthread_mutex_destroy(&lock) );
+		CHECKED( pthread_cond_destroy (&cond) );
+	}
+
+	void wait() {
+		CHECKED( pthread_mutex_lock(&lock) );
+			while(val < 1) {
+				pthread_cond_wait(&cond, &lock);
+			}
+			val -= 1;
+		CHECKED( pthread_mutex_unlock(&lock) );
+	}
+
+	bool post() {
+		bool needs_signal = false;
+
+		CHECKED( pthread_mutex_lock(&lock) );
+			if(val < 1) {
+				val += 1;
+				pthread_cond_signal(&cond);
+				needs_signal = true;
+			}
+		CHECKED( pthread_mutex_unlock(&lock) );
+
+		return needs_signal;
+	}
+};
+
+#undef CHECKED
+
+//--------------------
+// Basic types
+struct pthread_runner_t {
+	pthread_t handle;
+	__bin_sem_t sem;
+};
+typedef pthread_runner_t * thread_t;
+
+static_assert(sizeof(thread_t) == sizeof(void*), "thread_t musst be of same size as void*");
+
+extern "C" {
+	//--------------------
+	// Basic thread support
+	thread_t thrdlib_create( void (*main)( thread_t ) ) {
+		thread_t thrd = new pthread_runner_t();
+		int r = pthread_create( &thrd->handle, nullptr, (void *(*)(void *))main, thrd );
+		if( r != 0 ) std::abort();
+		return thrd;
+	}
+
+	void thrdlib_join( thread_t handle ) {
+		void * ret;
+		int r = pthread_join( handle->handle, &ret );
+		if( r != 0 ) std::abort();
+		delete handle;
+	}
+
+	void thrdlib_park( thread_t handle ) {
+		handle->sem.wait();
+	}
+
+	void thrdlib_unpark( thread_t handle ) {
+		handle->sem.post();
+	}
+
+	void thrdlib_yield( void ) {
+		int r = pthread_yield();
+		if( r != 0 ) std::abort();
+	}
+
+	//--------------------
+	// Basic kernel features
+	void thrdlib_init( int ) {}
+}
Index: doc/theses/thierry_delisle_PhD/comp_II/presentation.tex
===================================================================
--- doc/theses/thierry_delisle_PhD/comp_II/presentation.tex	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ doc/theses/thierry_delisle_PhD/comp_II/presentation.tex	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -36,6 +36,22 @@
 	\miniframeson
 }
-\section{\CFA and Concurrency}
+\section{Concurrency and \CFA}
+\begin{frame}{Project}
+	\begin{center}
+		{\large Produce a scheduler for \CFA that is simple for programmers to understand and offers good general performance.}
+	\end{center}
+\end{frame}
+%------------------------------
 \begin{frame}{\CFA}
+	\CFA is a modern extension of C.
+	It extends C with overloading, constructors/destructors, polymorphism, and much more.
+
+	~\newline
+	For this project, the relevant aspects are:
+	\begin{itemize}
+		\item Fast and safe system language.
+		\item Threading.
+		\item Manual memory management.
+	\end{itemize}
 
 \end{frame}
@@ -104,5 +120,5 @@
 \begin{frame}{Priority Scheduling}
 	\begin{center}
-	{\large
+		{\large
 			Runs all ready threads in group \textit{A} before any ready threads in group \textit{B}.
 		}
@@ -136,4 +152,31 @@
 
 	Processors begin busy for long periods can mean starvation.
+\end{frame}
+%------------------------------
+\begin{frame}{Scheduling in Practice: Summary}
+	\begin{columns}
+		\begin{column}{0.5\textwidth}
+			\textbf{Feedback Scheduling}
+			\newline
+
+			\begin{itemize}
+				\item Inappropriate for short-lived threads.
+				\item Overkill for cooperating threads.\newline
+			\end{itemize}
+		\end{column}
+		\begin{column}{0.5\textwidth}
+			\textbf{Priority Scheduling}
+			\newline
+
+			\begin{itemize}
+				\item Allows lasting starvation.\newline
+				\item Hard to reason about.\newline~\newline
+			\end{itemize}
+		\end{column}
+	\end{columns}
+
+	~\newline
+	~\newline
+	\CFA would benefit from something different.
 \end{frame}
 %==============================
@@ -190,5 +233,5 @@
 	\begin{itemize}
 		\item Acquire for reading for normal scheduling operations.
-		\item Acquire for right when resizing the array and creating/deleting internal queues.
+		\item Acquire for writing when resizing the array and creating/deleting internal queues.
 	\end{itemize}
 \end{frame}
@@ -314,6 +357,8 @@
 	Runtime system and scheduling are still open topics.
 	\newline
+	\newline
 
 	This work offers a novel runtime and scheduling package.
+	\newline
 	\newline
 
@@ -336,5 +381,5 @@
 
 %------------------------------
-\begin{frame}{Timeline}
+\begin{frame}{}
 	\begin{center}
 		{\large Questions?}
Index: doc/theses/thierry_delisle_PhD/comp_II/presentationstyle.sty
===================================================================
--- doc/theses/thierry_delisle_PhD/comp_II/presentationstyle.sty	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ doc/theses/thierry_delisle_PhD/comp_II/presentationstyle.sty	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -20,4 +20,5 @@
 \setbeamertemplate{blocks}[rounded][shadow=false]
 \newcommand\xrowht[2][0]{\addstackgap[.5\dimexpr#2\relax]{\vphantom{#1}}}
+\setbeamertemplate{sections/subsections in toc}{\inserttocsectionnumber.~\inserttocsection}
 
 %==============================
@@ -36,4 +37,6 @@
 \setbeamercolor{palette primary}{bg=colbg}
 \setbeamercolor{palette tertiary}{fg=red}
+\setbeamercolor{section in toc}{fg=white}
+\setbeamercolor{subsection in toc}{fg=gray}
 
 %==============================
Index: doc/theses/thierry_delisle_PhD/thesis/Makefile
===================================================================
--- doc/theses/thierry_delisle_PhD/thesis/Makefile	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ doc/theses/thierry_delisle_PhD/thesis/Makefile	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -15,4 +15,5 @@
 	front \
 	intro \
+	existing \
 	runtime \
 	core \
@@ -27,4 +28,5 @@
 	base \
 	empty \
+	system \
 }
 
@@ -37,5 +39,5 @@
 ## Define the documents that need to be made.
 all: thesis.pdf
-thesis.pdf: ${TEXTS} ${FIGURES} ${PICTURES} glossary.tex
+thesis.pdf: ${TEXTS} ${FIGURES} ${PICTURES} glossary.tex local.bib
 
 DOCUMENT = thesis.pdf
Index: doc/theses/thierry_delisle_PhD/thesis/fig/system.fig
===================================================================
--- doc/theses/thierry_delisle_PhD/thesis/fig/system.fig	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ doc/theses/thierry_delisle_PhD/thesis/fig/system.fig	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -1,3 +1,3 @@
-#FIG 3.2  Produced by xfig version 3.2.5c
+#FIG 3.2  Produced by xfig version 3.2.7b
 Landscape
 Center
@@ -36,21 +36,4 @@
 1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 4500 3600 15 15 4500 3600 4515 3615
 -6
-6 3225 4125 4650 4425
-6 4350 4200 4650 4350
-1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 4425 4275 15 15 4425 4275 4440 4290
-1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 4500 4275 15 15 4500 4275 4515 4290
-1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 4575 4275 15 15 4575 4275 4590 4290
--6
-1 1 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3450 4275 225 150 3450 4275 3675 4425
-1 1 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4050 4275 225 150 4050 4275 4275 4425
--6
-6 6675 4125 7500 4425
-6 7200 4200 7500 4350
-1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 7275 4275 15 15 7275 4275 7290 4290
-1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 7350 4275 15 15 7350 4275 7365 4290
-1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 7425 4275 15 15 7425 4275 7440 4290
--6
-1 1 0 1 -1 -1 0 0 -1 0.000 1 0.0000 6900 4275 225 150 6900 4275 7125 4425
--6
 6 6675 3525 8025 3975
 2 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 2
@@ -79,9 +62,8 @@
 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3975 2850 150 150 3975 2850 4125 2850
 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 7200 2775 150 150 7200 2775 7350 2775
-1 3 0 1 0 0 0 0 0 0.000 1 0.0000 2250 4830 30 30 2250 4830 2280 4860
+1 3 0 1 0 0 0 0 0 0.000 1 0.0000 2250 4830 30 30 2250 4830 2280 4830
 1 3 0 1 0 0 0 0 0 0.000 1 0.0000 7200 2775 30 30 7200 2775 7230 2805
 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3525 3600 150 150 3525 3600 3675 3600
-1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3875 4800 100 100 3875 4800 3975 4800
-1 1 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4650 4800 150 75 4650 4800 4800 4875
+1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4625 4838 100 100 4625 4838 4725 4838
 2 2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 5
 	 2400 4200 2400 3750 1950 3750 1950 4200 2400 4200
@@ -153,19 +135,18 @@
 	1 1 1.00 45.00 90.00
 	 7875 3750 7875 2325 7200 2325 7200 2550
+2 2 1 1 -1 -1 0 0 -1 3.000 0 0 0 0 0 5
+	 6975 4950 6750 4950 6750 4725 6975 4725 6975 4950
 2 2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 5
 	 5850 4950 5850 4725 5625 4725 5625 4950 5850 4950
-2 2 1 1 -1 -1 0 0 -1 3.000 0 0 0 0 0 5
-	 6975 4950 6750 4950 6750 4725 6975 4725 6975 4950
-4 1 -1 0 0 0 10 0.0000 2 105 720 5550 4425 Processors\001
-4 1 -1 0 0 0 10 0.0000 2 120 1005 4200 3225 Blocked Tasks\001
-4 1 -1 0 0 0 10 0.0000 2 150 870 4200 3975 Ready Tasks\001
-4 1 -1 0 0 0 10 0.0000 2 135 1095 7350 1725 Other Cluster(s)\001
-4 1 -1 0 0 0 10 0.0000 2 105 840 4650 1725 User Cluster\001
-4 1 -1 0 0 0 10 0.0000 2 150 615 2175 3675 Manager\001
-4 1 -1 0 0 0 10 0.0000 2 105 990 2175 3525 Discrete-event\001
-4 1 -1 0 0 0 10 0.0000 2 135 795 2175 4350 preemption\001
-4 0 -1 0 0 0 10 0.0000 2 150 1290 2325 4875 generator/coroutine\001
-4 0 -1 0 0 0 10 0.0000 2 120 270 4050 4875 task\001
-4 0 -1 0 0 0 10 0.0000 2 105 450 7050 4875 cluster\001
-4 0 -1 0 0 0 10 0.0000 2 105 660 5925 4875 processor\001
-4 0 -1 0 0 0 10 0.0000 2 105 555 4875 4875 monitor\001
+4 1 -1 0 0 0 10 0.0000 2 135 900 5550 4425 Processors\001
+4 1 -1 0 0 0 10 0.0000 2 165 1170 4200 3975 Ready Threads\001
+4 1 -1 0 0 0 10 0.0000 2 165 1440 7350 1725 Other Cluster(s)\001
+4 1 -1 0 0 0 10 0.0000 2 135 1080 4650 1725 User Cluster\001
+4 1 -1 0 0 0 10 0.0000 2 165 630 2175 3675 Manager\001
+4 1 -1 0 0 0 10 0.0000 2 135 1260 2175 3525 Discrete-event\001
+4 1 -1 0 0 0 10 0.0000 2 150 900 2175 4350 preemption\001
+4 0 -1 0 0 0 10 0.0000 2 135 630 7050 4875 cluster\001
+4 1 -1 0 0 0 10 0.0000 2 135 1350 4200 3225 Blocked Threads\001
+4 0 -1 0 0 0 10 0.0000 2 135 540 4800 4875 thread\001
+4 0 -1 0 0 0 10 0.0000 2 120 810 5925 4875 processor\001
+4 0 -1 0 0 0 10 0.0000 2 165 1710 2325 4875 generator/coroutine\001
Index: doc/theses/thierry_delisle_PhD/thesis/glossary.tex
===================================================================
--- doc/theses/thierry_delisle_PhD/thesis/glossary.tex	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ doc/theses/thierry_delisle_PhD/thesis/glossary.tex	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -1,18 +1,52 @@
 \makeglossaries
+
+% ----------------------------------
+% Acronyms
+\newacronym{api}{API}{Application Programming Interface}
+\newacronym{fifo}{FIFO}{First-In, First-Out}
+\newacronym{io}{I/O}{Input and Output}
+\newacronym{numa}{NUMA}{Non-Uniform Memory Access}
+\newacronym{raii}{RAII}{Resource Acquisition Is Initialization}
+\newacronym{tls}{TLS}{Thread Local Storage}
+
+% ----------------------------------
+% Definitions
+
+\longnewglossaryentry{thrd}
+{name={thread}}
+{
+Threads created and managed inside user-space. Each thread has its own stack and its own thread of execution. User-level threads are invisible to the underlying operating system.
+
+\textit{Synonyms : User threads, Lightweight threads, Green threads, Virtual threads, Tasks.}
+}
+
+\longnewglossaryentry{proc}
+{name={processor}}
+{
+
+}
+
+\longnewglossaryentry{rQ}
+{name={ready-queue}}
+{
+
+}
+
+\longnewglossaryentry{uthrding}
+{name={user-level threading}}
+{
+
+
+\textit{Synonyms : User threads, Lightweight threads, Green threads, Virtual threads, Tasks.}
+}
+
+% ----------------------------------
 
 \longnewglossaryentry{hthrd}
 {name={hardware thread}}
 {
-Threads representing the underlying hardware directly.
+Threads representing the underlying hardware directly, \eg the CPU core, or hyper-thread if the hardware supports multiple threads of execution per core. The number of hardware threads is considered fixed, at a value determined by the hardware.
 
-\textit{Synonyms : User threads, Lightweight threads, Green threads, Virtual threads, Tasks.}
-}
-
-\longnewglossaryentry{thrd}
-{name={threads}}
-{
-Threads created and managed inside user-space. Each thread has its own stack and its own thread of execution. User-level threads are invisible to the underlying operating system.
-
-\textit{Synonyms : User threads, Lightweight threads, Green threads, Virtual threads, Tasks.}
+\textit{Synonyms : }
 }
 
@@ -57,15 +91,5 @@
 }
 
-\longnewglossaryentry{proc}
-{name={virtual processor}}
-{
 
-}
-
-\longnewglossaryentry{Q}
-{name={work-queue}}
-{
-
-}
 
 \longnewglossaryentry{at}
@@ -131,7 +155,2 @@
 }
 
-
-\newacronym{tls}{TLS}{Thread Local Storage}
-\newacronym{api}{API}{Application Program Interface}
-\newacronym{raii}{RAII}{Resource Acquisition Is Initialization}
-\newacronym{numa}{NUMA}{Non-Uniform Memory Access}
Index: doc/theses/thierry_delisle_PhD/thesis/local.bib
===================================================================
--- doc/theses/thierry_delisle_PhD/thesis/local.bib	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
+++ doc/theses/thierry_delisle_PhD/thesis/local.bib	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -0,0 +1,610 @@
+% --------------------------------------------------
+% Cforall
+@misc{cfa:frontpage,
+  url = {https://cforall.uwaterloo.ca/}
+}
+@article{cfa:typesystem,
+  author    = {Aaron Moss and Robert Schluntz and Peter A. Buhr},
+  title     = {{\CFA} : Adding modern programming language features to {C}},
+  journal   = {Softw. Pract. Exp.},
+  volume    = {48},
+  number    = {12},
+  pages     = {2111--2146},
+  year      = {2018},
+  url       = {https://doi.org/10.1002/spe.2624},
+  doi       = {10.1002/spe.2624},
+  timestamp = {Thu, 09 Apr 2020 17:14:14 +0200},
+  biburl    = {https://dblp.org/rec/journals/spe/MossSB18.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+
+
+% --------------------------------------------------
+% old survey mostly about job scheduling
+% talks about the literature mostly centering on upfront/static scheduling with task graphs as inputs
+% already mentions multi-core
+@article{DBLP:journals/csur/Gonzalez77,
+  author    = {Mario J. Gonzalez Jr.},
+  title     = {Deterministic Processor Scheduling},
+  journal   = {{ACM} Comput. Surv.},
+  volume    = {9},
+  number    = {3},
+  pages     = {173--204},
+  year      = {1977},
+  url       = {https://doi.org/10.1145/356698.356700},
+  doi       = {10.1145/356698.356700},
+  timestamp = {Tue, 06 Nov 2018 12:50:48 +0100},
+  biburl    = {https://dblp.org/rec/journals/csur/Gonzalez77.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% very short survey, highlights more interesting surveys such as:
+% Casavant and Kuhl [1988], Chapin [1993], Shirazi et al. [1995], and Singhal and Shivaratri [1994]
+% still seems to mention static or partially static scheduling as a dominating trend
+@article{DBLP:journals/csur/Chapin96,
+  author    = {Steve J. Chapin},
+  title     = {Distributed and Multiprocessor Scheduling},
+  journal   = {{ACM} Comput. Surv.},
+  volume    = {28},
+  number    = {1},
+  pages     = {233--235},
+  year      = {1996},
+  url       = {https://doi.org/10.1145/234313.234410},
+  doi       = {10.1145/234313.234410},
+  timestamp = {Tue, 06 Nov 2018 12:50:49 +0100},
+  biburl    = {https://dblp.org/rec/journals/csur/Chapin96.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% more comprehensive survey that discusses many algorithms
+% still exclusively static scheduling
+@article{DBLP:journals/csur/KwokA99,
+  author    = {Yu{-}Kwong Kwok and Ishfaq Ahmad},
+  title     = {Static scheduling algorithms for allocating directed task graphs to multiprocessors},
+  journal   = {{ACM} Comput. Surv.},
+  volume    = {31},
+  number    = {4},
+  pages     = {406--471},
+  year      = {1999},
+  url       = {https://doi.org/10.1145/344588.344618},
+  doi       = {10.1145/344588.344618},
+  timestamp = {Fri, 30 Nov 2018 12:48:46 +0100},
+  biburl    = {https://dblp.org/rec/journals/csur/KwokA99.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% recent survey on kernel-thread scheduling
+% specifically focusing on schedulers that try to optimize to
+% reduce contention or reduce cache conflicts or improve some other resource-sharing metric
+@article{DBLP:journals/csur/ZhuravlevSBFP12,
+  author    = {Sergey Zhuravlev and Juan Carlos Saez and Sergey Blagodurov and Alexandra Fedorova and Manuel Prieto},
+  title     = {Survey of scheduling techniques for addressing shared resources in multicore processors},
+  journal   = {{ACM} Comput. Surv.},
+  volume    = {45},
+  number    = {1},
+  pages     = {4:1--4:28},
+  year      = {2012},
+  url       = {https://doi.org/10.1145/2379776.2379780},
+  doi       = {10.1145/2379776.2379780},
+  timestamp = {Tue, 06 Nov 2018 12:50:49 +0100},
+  biburl    = {https://dblp.org/rec/journals/csur/ZhuravlevSBFP12.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% great survey on work-stealing
+% highlights much of the recent work, both theoretical and otherwise
+@article{DBLP:journals/ijpp/YangH18,
+  author    = {Jixiang Yang and Qingbi He},
+  title     = {Scheduling Parallel Computations by Work Stealing: {A} Survey},
+  journal   = {Int. J. Parallel Program.},
+  volume    = {46},
+  number    = {2},
+  pages     = {173--197},
+  year      = {2018},
+  url       = {https://doi.org/10.1007/s10766-016-0484-8},
+  doi       = {10.1007/s10766-016-0484-8},
+  timestamp = {Wed, 01 Apr 2020 08:50:06 +0200},
+  biburl    = {https://dblp.org/rec/journals/ijpp/YangH18.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% --------------------------------------------------
+% introduction of work stealing
+@inproceedings{DBLP:conf/fpca/BurtonS81,
+  author    = {F. Warren Burton and M. Ronan Sleep},
+  editor    = {Arvind and Jack B. Dennis},
+  title     = {Executing functional programs on a virtual tree of processors},
+  booktitle = {Proceedings of the 1981 conference on Functional programming languages and computer architecture, {FPCA} 1981, Wentworth, New Hampshire, USA, October 1981},
+  pages     = {187--194},
+  publisher = {{ACM}},
+  year      = {1981},
+  url       = {https://doi.org/10.1145/800223.806778},
+  doi       = {10.1145/800223.806778},
+  timestamp = {Tue, 06 Nov 2018 11:07:48 +0100},
+  biburl    = {https://dblp.org/rec/conf/fpca/BurtonS81.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% introduction of randomized work stealing
+@inproceedings{DBLP:conf/focs/Blumofe94,
+  author    = {Robert D. Blumofe},
+  title     = {Scheduling Multithreaded Computations by Work Stealing},
+  booktitle = {35th Annual Symposium on Foundations of Computer Science, Santa Fe, New Mexico, USA, 20-22 November 1994},
+  pages     = {356--368},
+  publisher = {{IEEE} Computer Society},
+  year      = {1994},
+  url       = {https://doi.org/10.1109/SFCS.1994.365680},
+  doi       = {10.1109/SFCS.1994.365680},
+  timestamp = {Wed, 16 Oct 2019 14:14:54 +0200},
+  biburl    = {https://dblp.org/rec/conf/focs/Blumofe94.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% migration cost
+@inproceedings{DBLP:conf/sigmetrics/SquillanteN91,
+  author    = {Mark S. Squillante and Randolph D. Nelson},
+  editor    = {Tom W. Keller},
+  title     = {Analysis of Task Migration in Shared-Memory Multiprocessor Scheduling},
+  booktitle = {Proceedings of the 1991 {ACM} {SIGMETRICS} conference on Measurement and modeling of computer systems, San Diego, California, USA, May 21-24, 1991},
+  pages     = {143--155},
+  publisher = {{ACM}},
+  year      = {1991},
+  url       = {https://doi.org/10.1145/107971.107987},
+  doi       = {10.1145/107971.107987},
+  timestamp = {Sat, 07 Sep 2019 11:59:22 +0200},
+  biburl    = {https://dblp.org/rec/conf/sigmetrics/SquillanteN91.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/pe/EagerLZ86,
+  author    = {Derek L. Eager and Edward D. Lazowska and John Zahorjan},
+  title     = {A Comparison of Receiver-Initiated and Sender-Initiated Adaptive Load Sharing},
+  journal   = {Perform. Evaluation},
+  volume    = {6},
+  number    = {1},
+  pages     = {53--68},
+  year      = {1986},
+  url       = {https://doi.org/10.1016/0166-5316(86)90008-8},
+  doi       = {10.1016/0166-5316(86)90008-8},
+  timestamp = {Sat, 22 Feb 2020 19:26:16 +0100},
+  biburl    = {https://dblp.org/rec/journals/pe/EagerLZ86.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% affinity for work-stealing
+@article{DBLP:journals/tpds/SquillanteL93,
+  author    = {Mark S. Squillante and Edward D. Lazowska},
+  title     = {Using Processor-Cache Affinity Information in Shared-Memory Multiprocessor Scheduling},
+  journal   = {{IEEE} Trans. Parallel Distributed Syst.},
+  volume    = {4},
+  number    = {2},
+  pages     = {131--143},
+  year      = {1993},
+  url       = {https://doi.org/10.1109/71.207589},
+  doi       = {10.1109/71.207589},
+  timestamp = {Fri, 02 Oct 2020 14:40:30 +0200},
+  biburl    = {https://dblp.org/rec/journals/tpds/SquillanteL93.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% systems with affinity scheduling
+@inproceedings{squillante2001threshold,
+  title={Threshold-based priority policies for parallel-server systems with affinity scheduling},
+  author={Squillante, Mark S and Xia, Cathy H and Yao, David D and Zhang, Li},
+  booktitle={Proceedings of the 2001 American Control Conference.(Cat. No. 01CH37148)},
+  volume={4},
+  pages={2992--2999},
+  year={2001},
+  organization={IEEE}
+}
+
+@article{DBLP:journals/mst/AcarBB02,
+  author    = {Umut A. Acar and Guy E. Blelloch and Robert D. Blumofe},
+  title     = {The Data Locality of Work Stealing},
+  journal   = {Theory Comput. Syst.},
+  volume    = {35},
+  number    = {3},
+  pages     = {321--347},
+  year      = {2002},
+  url       = {https://doi.org/10.1007/s00224-002-1057-3},
+  doi       = {10.1007/s00224-002-1057-3},
+  timestamp = {Sun, 28 May 2017 13:18:25 +0200},
+  biburl    = {https://dblp.org/rec/journals/mst/AcarBB02.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/tcs/NarangS11,
+  author    = {Ankur Narang and Rudrapatna K. Shyamasundar},
+  title     = {Performance driven distributed scheduling of parallel hybrid computations},
+  journal   = {Theor. Comput. Sci.},
+  volume    = {412},
+  number    = {32},
+  pages     = {4212--4225},
+  year      = {2011},
+  url       = {https://doi.org/10.1016/j.tcs.2010.11.044},
+  doi       = {10.1016/j.tcs.2010.11.044},
+  timestamp = {Sun, 28 May 2017 13:20:06 +0200},
+  biburl    = {https://dblp.org/rec/journals/tcs/NarangS11.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% Optimization via reflection on work stealing in TBB
+@inproceedings{DBLP:conf/ipps/RobisonVK08,
+  author    = {Arch Robison and Michael Voss and Alexey Kukanov},
+  title     = {Optimization via Reflection on Work Stealing in {TBB}},
+  booktitle = {22nd {IEEE} International Symposium on Parallel and Distributed Processing, {IPDPS} 2008, Miami, Florida USA, April 14-18, 2008},
+  pages     = {1--8},
+  publisher = {{IEEE}},
+  year      = {2008},
+  url       = {https://doi.org/10.1109/IPDPS.2008.4536188},
+  doi       = {10.1109/IPDPS.2008.4536188},
+  timestamp = {Wed, 16 Oct 2019 14:14:51 +0200},
+  biburl    = {https://dblp.org/rec/conf/ipps/RobisonVK08.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/ipl/SuksompongLS16,
+  author    = {Warut Suksompong and Charles E. Leiserson and Tao B. Schardl},
+  title     = {On the efficiency of localized work stealing},
+  journal   = {Inf. Process. Lett.},
+  volume    = {116},
+  number    = {2},
+  pages     = {100--106},
+  year      = {2016},
+  url       = {https://doi.org/10.1016/j.ipl.2015.10.002},
+  doi       = {10.1016/j.ipl.2015.10.002},
+  timestamp = {Fri, 26 May 2017 22:54:40 +0200},
+  biburl    = {https://dblp.org/rec/journals/ipl/SuksompongLS16.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+%theory
+@article{DBLP:journals/jpdc/MirchandaneyTS90,
+  author    = {Ravi Mirchandaney and Donald F. Towsley and John A. Stankovic},
+  title     = {Adaptive Load Sharing in Heterogeneous Distributed Systems},
+  journal   = {J. Parallel Distributed Comput.},
+  volume    = {9},
+  number    = {4},
+  pages     = {331--346},
+  year      = {1990},
+  url       = {https://doi.org/10.1016/0743-7315(90)90118-9},
+  doi       = {10.1016/0743-7315(90)90118-9},
+  timestamp = {Sat, 22 Feb 2020 19:36:31 +0100},
+  biburl    = {https://dblp.org/rec/journals/jpdc/MirchandaneyTS90.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/mst/BenderR02,
+  author    = {Michael A. Bender and Michael O. Rabin},
+  title     = {Online Scheduling of Parallel Programs on Heterogeneous Systems with Applications to Cilk},
+  journal   = {Theory Comput. Syst.},
+  volume    = {35},
+  number    = {3},
+  pages     = {289--304},
+  year      = {2002},
+  url       = {https://doi.org/10.1007/s00224-002-1055-5},
+  doi       = {10.1007/s00224-002-1055-5},
+  timestamp = {Sun, 28 May 2017 13:18:24 +0200},
+  biburl    = {https://dblp.org/rec/journals/mst/BenderR02.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{DBLP:conf/sigmetrics/GastG10,
+  author    = {Nicolas Gast and Bruno Gaujal},
+  editor    = {Vishal Misra and Paul Barford and Mark S. Squillante},
+  title     = {A mean field model of work stealing in large-scale systems},
+  booktitle = {{SIGMETRICS} 2010, Proceedings of the 2010 {ACM} {SIGMETRICS} International Conference on Measurement and Modeling of Computer Systems, New York, New York, USA, 14-18 June 2010},
+  pages     = {13--24},
+  publisher = {{ACM}},
+  year      = {2010},
+  url       = {https://doi.org/10.1145/1811039.1811042},
+  doi       = {10.1145/1811039.1811042},
+  timestamp = {Tue, 06 Nov 2018 11:07:18 +0100},
+  biburl    = {https://dblp.org/rec/conf/sigmetrics/GastG10.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/jacm/BlellochGM99,
+  author    = {Guy E. Blelloch and Phillip B. Gibbons and Yossi Matias},
+  title     = {Provably Efficient Scheduling for Languages with Fine-Grained Parallelism},
+  journal   = {J. {ACM}},
+  volume    = {46},
+  number    = {2},
+  pages     = {281--321},
+  year      = {1999},
+  url       = {https://doi.org/10.1145/301970.301974},
+  doi       = {10.1145/301970.301974},
+  timestamp = {Tue, 06 Nov 2018 12:51:45 +0100},
+  biburl    = {https://dblp.org/rec/journals/jacm/BlellochGM99.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/siamcomp/BerenbrinkFG03,
+  author    = {Petra Berenbrink and Tom Friedetzky and Leslie Ann Goldberg},
+  title     = {The Natural Work-Stealing Algorithm is Stable},
+  journal   = {{SIAM} J. Comput.},
+  volume    = {32},
+  number    = {5},
+  pages     = {1260--1279},
+  year      = {2003},
+  url       = {https://doi.org/10.1137/S0097539701399551},
+  doi       = {10.1137/S0097539701399551},
+  timestamp = {Sat, 27 May 2017 14:22:58 +0200},
+  biburl    = {https://dblp.org/rec/journals/siamcomp/BerenbrinkFG03.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/mst/AroraBP01,
+  author    = {Nimar S. Arora and Robert D. Blumofe and C. Greg Plaxton},
+  title     = {Thread Scheduling for Multiprogrammed Multiprocessors},
+  journal   = {Theory Comput. Syst.},
+  volume    = {34},
+  number    = {2},
+  pages     = {115--144},
+  year      = {2001},
+  url       = {https://doi.org/10.1007/s00224-001-0004-z},
+  doi       = {10.1007/s00224-001-0004-z},
+  timestamp = {Sun, 28 May 2017 13:18:24 +0200},
+  biburl    = {https://dblp.org/rec/journals/mst/AroraBP01.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/anor/TchiboukdjianGT13,
+  author    = {Marc Tchiboukdjian and Nicolas Gast and Denis Trystram},
+  title     = {Decentralized list scheduling},
+  journal   = {Ann. Oper. Res.},
+  volume    = {207},
+  number    = {1},
+  pages     = {237--259},
+  year      = {2013},
+  url       = {https://doi.org/10.1007/s10479-012-1149-7},
+  doi       = {10.1007/s10479-012-1149-7},
+  timestamp = {Thu, 13 Aug 2020 12:41:25 +0200},
+  biburl    = {https://dblp.org/rec/journals/anor/TchiboukdjianGT13.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{DBLP:conf/isaac/TchiboukdjianGTRB10,
+  author    = {Marc Tchiboukdjian and Nicolas Gast and Denis Trystram and Jean{-}Louis Roch and Julien Bernard},
+  editor    = {Otfried Cheong and Kyung{-}Yong Chwa and Kunsoo Park},
+  title     = {A Tighter Analysis of Work Stealing},
+  booktitle = {Algorithms and Computation - 21st International Symposium, {ISAAC} 2010, Jeju Island, Korea, December 15-17, 2010, Proceedings, Part {II}},
+  series    = {Lecture Notes in Computer Science},
+  volume    = {6507},
+  pages     = {291--302},
+  publisher = {Springer},
+  year      = {2010},
+  url       = {https://doi.org/10.1007/978-3-642-17514-5\_25},
+  doi       = {10.1007/978-3-642-17514-5\_25},
+  timestamp = {Fri, 13 Dec 2019 13:08:09 +0100},
+  biburl    = {https://dblp.org/rec/conf/isaac/TchiboukdjianGTRB10.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{DBLP:conf/ppopp/AgrawalLS10,
+  author    = {Kunal Agrawal and Charles E. Leiserson and Jim Sukha},
+  editor    = {R. Govindarajan and David A. Padua and Mary W. Hall},
+  title     = {Helper locks for fork-join parallel programming},
+  booktitle = {Proceedings of the 15th {ACM} {SIGPLAN} Symposium on Principles and Practice of Parallel Programming, {PPOPP} 2010, Bangalore, India, January 9-14, 2010},
+  pages     = {245--256},
+  publisher = {{ACM}},
+  year      = {2010},
+  url       = {https://doi.org/10.1145/1693453.1693487},
+  doi       = {10.1145/1693453.1693487},
+  timestamp = {Tue, 06 Nov 2018 16:57:27 +0100},
+  biburl    = {https://dblp.org/rec/conf/ppopp/AgrawalLS10.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{DBLP:conf/spaa/AgrawalFLSSU14,
+  author    = {Kunal Agrawal and Jeremy T. Fineman and Kefu Lu and Brendan Sheridan and Jim Sukha and Robert Utterback},
+  editor    = {Guy E. Blelloch and Peter Sanders},
+  title     = {Provably good scheduling for parallel programs that use data structures through implicit batching},
+  booktitle = {26th {ACM} Symposium on Parallelism in Algorithms and Architectures, {SPAA} '14, Prague, Czech Republic - June 23 - 25, 2014},
+  pages     = {84--95},
+  publisher = {{ACM}},
+  year      = {2014},
+  url       = {https://doi.org/10.1145/2612669.2612688},
+  doi       = {10.1145/2612669.2612688},
+  timestamp = {Wed, 21 Nov 2018 11:18:43 +0100},
+  biburl    = {https://dblp.org/rec/conf/spaa/AgrawalFLSSU14.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{DBLP:conf/ipps/ColeR13,
+  author    = {Richard Cole and
+               Vijaya Ramachandran},
+  title     = {Analysis of Randomized Work Stealing with False Sharing},
+  booktitle = {27th {IEEE} International Symposium on Parallel and Distributed Processing,
+               {IPDPS} 2013, Cambridge, MA, USA, May 20-24, 2013},
+  pages     = {985--998},
+  publisher = {{IEEE} Computer Society},
+  year      = {2013},
+  url       = {https://doi.org/10.1109/IPDPS.2013.86},
+  doi       = {10.1109/IPDPS.2013.86},
+  timestamp = {Wed, 16 Oct 2019 14:14:51 +0200},
+  biburl    = {https://dblp.org/rec/conf/ipps/ColeR13.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% --------------------------------------------------
+% ULE FreeBSD scheduler
+@inproceedings{DBLP:conf/bsdcon/Roberson03,
+  author    = {Jeff Roberson},
+  editor    = {Gregory Neil Shapiro},
+  title     = {{ULE:} {A} Modern Scheduler for FreeBSD},
+  booktitle = {Proceedings of BSDCon 2003, San Mateo, California, USA, September 8-12, 2003},
+  pages     = {17--28},
+  publisher = {{USENIX}},
+  year      = {2003},
+  url       = {http://www.usenix.org/publications/library/proceedings/bsdcon03/tech/roberson.html},
+  timestamp = {Wed, 04 Jul 2018 13:06:34 +0200},
+  biburl    = {https://dblp.org/rec/conf/bsdcon/Roberson03.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% --------------------------------------------------
+% Martin's LibFibre
+@article{DBLP:journals/pomacs/KarstenB20,
+  author    = {Martin Karsten and Saman Barghi},
+  title     = {User-level Threading: Have Your Cake and Eat It Too},
+  journal   = {Proc. {ACM} Meas. Anal. Comput. Syst.},
+  volume    = {4},
+  number    = {1},
+  pages     = {17:1--17:30},
+  year      = {2020},
+  url       = {https://doi.org/10.1145/3379483},
+  doi       = {10.1145/3379483},
+  timestamp = {Thu, 09 Jul 2020 22:58:54 +0200},
+  biburl    = {https://dblp.org/rec/journals/pomacs/KarstenB20.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% --------------------------------------------------
+% Linux CFS
+@inproceedings{DBLP:conf/eurosys/LoziLFGQF16,
+  author    = {Jean{-}Pierre Lozi and Baptiste Lepers and Justin R. Funston and Fabien Gaud and Vivien Qu{\'{e}}ma and Alexandra Fedorova},
+  editor    = {Cristian Cadar and Peter R. Pietzuch and Kimberly Keeton and Rodrigo Rodrigues},
+  title     = {The Linux scheduler: a decade of wasted cores},
+  booktitle = {Proceedings of the Eleventh European Conference on Computer Systems, EuroSys 2016, London, United Kingdom, April 18-21, 2016},
+  pages     = {1:1--1:16},
+  publisher = {{ACM}},
+  year      = {2016},
+  url       = {https://doi.org/10.1145/2901318.2901326},
+  doi       = {10.1145/2901318.2901326},
+  timestamp = {Tue, 06 Nov 2018 16:58:31 +0100},
+  biburl    = {https://dblp.org/rec/conf/eurosys/LoziLFGQF16.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@misc{MAN:linux/cfs,
+  title = {{CFS} Scheduler - The Linux Kernel documentation},
+  url = {https://www.kernel.org/doc/html/latest/scheduler/sched-design-CFS.html}
+}
+
+@misc{MAN:linux/cfs2,
+  title = {{CFS}: Completely fair process scheduling in Linux},
+  author = {Marty Kalin},
+  year = {2019},
+  month = {February},
+  url = {https://opensource.com/article/19/2/fair-scheduling-linux}
+}
+
+@article{MAN:linux/cfs/pelt,
+  title={Per-entity load tracking},
+  author={Corbet, Jonathan},
+  journal={LWN article, available at: https://lwn.net/Articles/531853},
+  year={2013}
+}
+
+@article{MAN:linux/cfs/balancing,
+  title={Reworking {CFS} load balancing},
+  journal={LWN article, available at: https://lwn.net/Articles/793427/},
+  year={2019}
+}
+
+@manual{MAN:linux/sched,
+  title = {SCHED(7) - Linux Programmer's Manual},
+  url   = {https://man7.org/linux/man-pages/man7/sched.7.html},
+  year  = {2019},
+  month = {august}
+}
+
+% Apple's MAC OS X
+@manual{MAN:apple/scheduler,
+  title = {Mach Scheduling and Thread Interfaces - Kernel Programming Guide},
+  organization = {Apple Inc.},
+  url = {https://developer.apple.com/library/archive/documentation/Darwin/Conceptual/KernelProgramming/scheduler/scheduler.html}
+}
+
+% Windows's Scheduler
+@inbook{MAN:windows/scheduler,
+  author = {Kate Chase and Mark E. Russinovich},
+  title = {Windows Internals},
+  chapter = {Processes, Threads, and Jobs in the Windows Operating System},
+  edition = {5th Edition},
+  publisher = {Microsoft Press},
+  year = {2009},
+  month = {June},
+  series = {Developer Reference},
+  url = {https://www.microsoftpressstore.com/articles/article.aspx?p=2233328&seqNum=7#:~:text=Overview\%20of\%20Windows\%20Scheduling,a\%20phenomenon\%20called\%20processor\%20affinity}
+}
+
+@online{GITHUB:go,
+  title = {GitHub - The Go Programming Language},
+  author = {The Go Programming Language},
+  url = {https://github.com/golang/go},
+  version = {Change-Id: If07f40b1d73b8f276ee28ffb8b7214175e56c24d}
+}
+
+@inproceedings{YTUBE:go,
+  author = {Dmitry Vyukov},
+  title = {Go scheduler: Implementing language with lightweight concurrency},
+  year = {2019},
+  booktitle = {Hydra},
+  url = {https://www.youtube.com/watch?v=-K11rY57K7k&ab_channel=Hydra}
+}
+
+@inproceedings{:erlang,
+  author = {Kenneth Lundin}, organization = {Ericsson AB},
+  title = {Inside the Erlang VM},
+  year = {2008},
+  booktitle = {Erlang User Conference},
+  url = {http://www.erlang.se/euc/08/euc_smp.pdf}
+}
+
+
+
+@manual{MAN:tbb/scheduler,
+  title = {Scheduling Algorithm - Intel{\textregistered} Threading Building Blocks Developer Reference},
+  organization = {Intel{\textregistered}},
+  url = {https://www.threadingbuildingblocks.org/docs/help/reference/task_scheduler/scheduling_algorithm.html}
+}
+
+@manual{MAN:quasar,
+  title = {Quasar Core - Quasar User Manual},
+  organization = {Parallel Universe},
+  url = {https://docs.paralleluniverse.co/quasar/}
+}
+@misc{MAN:project-loom,
+  url = {https://www.baeldung.com/openjdk-project-loom}
+}
+
+@misc{MAN:java/fork-join,
+  url = {https://www.baeldung.com/java-fork-join}
+}
+
+% --------------------------------------------------
+% Wikipedia Entries
+@misc{wiki:taskparallel,
+  author = "{Wikipedia contributors}",
+  title = "Task parallelism --- {W}ikipedia{,} The Free Encyclopedia",
+  year = "2020",
+  url = "https://en.wikipedia.org/wiki/Task_parallelism",
+  note = "[Online; accessed 22-October-2020]"
+}
+
+@misc{wiki:controltheory,
+  author = "{Wikipedia contributors}",
+  title = "Control theory --- {W}ikipedia{,} The Free Encyclopedia",
+  year = "2020",
+  url = "https://en.wikipedia.org/wiki/Control_theory",
+  note = "[Online; accessed 22-October-2020]"
+}
+
+@misc{wiki:implicitpar,
+  author = "{Wikipedia contributors}",
+  title = "Implicit parallelism --- {W}ikipedia{,} The Free Encyclopedia",
+  year = "2020",
+  url = "https://en.wikipedia.org/wiki/Implicit_parallelism",
+  note = "[Online; accessed 23-October-2020]"
+}
+
+@misc{wiki:explicitpar,
+  author = "{Wikipedia contributors}",
+  title = "Explicit parallelism --- {W}ikipedia{,} The Free Encyclopedia",
+  year = "2017",
+  url = "https://en.wikipedia.org/wiki/Explicit_parallelism",
+  note = "[Online; accessed 23-October-2020]"
+}
Index: doc/theses/thierry_delisle_PhD/thesis/text/core.tex
===================================================================
--- doc/theses/thierry_delisle_PhD/thesis/text/core.tex	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ doc/theses/thierry_delisle_PhD/thesis/text/core.tex	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -1,15 +1,34 @@
 \chapter{Scheduling Core}\label{core}
 
-This chapter addresses the need of scheduling on a somewhat ideal scenario
+Before discussing scheduling in general, where it is important to address systems that are changing states, this document discusses scheduling in a somewhat ideal scenario, where the system has reached a steady state. For this purpose, a steady state is loosely defined as a state where there are always \glspl{thrd} ready to run but the system has the resources necessary to accomplish the work. In short, the system is neither overloaded nor underloaded.
 
-\section{Existing Schedulers}
-\subsection{Feedback Scheduling}
+I believe it is important to discuss the steady state first because it is the easiest case to handle and, relatedly, the case in which the best performance is to be expected. As such, when the system is either overloaded or underloaded, a common approach is to try to adapt the system to the new load and return to the steady state. Flaws in the scheduling in the steady state tend therefore to be pervasive in all states.
 
-\subsection{Priority Scheduling}\label{priority}
+\section{Design Goals}
+As with most of the design decisions behind \CFA, the main goal is to match the expectation of the programmer, according to their probable mental model. To match these expectations, the design must offer the programmers sufficient guarantees so that, as long as the programmer respects the mental model, the system will also respect this model.
 
-\subsection{Work Stealing}
+For threading, a simple and common mental model is the ``Ideal multi-tasking CPU'' :
+
+\begin{displayquote}[Linux CFS\cit{https://www.kernel.org/doc/Documentation/scheduler/sched-design-CFS.txt}]
+	{[The]} ``Ideal multi-tasking CPU'' is a (non-existent  :-)) CPU that has 100\% physical power and which can run each task at precise equal speed, in parallel, each at [an equal fraction of the] speed.  For example: if there are 2 tasks running, then it runs each at 50\% physical power --- i.e., actually in parallel.
+\end{displayquote}
+
+Applied to threads, this model states that every ready \gls{thrd} immediately runs in parallel with all other ready \glspl{thrd}. While a strict implementation of this model is not feasible, programmers still have expectations about scheduling that come from this model.
+
+In general, the expectation at the center of this model is that ready \glspl{thrd} do not interfere with each other but simply share the hardware. This makes it easier to reason about threading because ready \glspl{thrd} can be taken in isolation and the effect of the scheduler can be virtually ignored. This expectation of \gls{thrd} independence means the scheduler is expected to offer 2 guarantees:
+\begin{enumerate}
+	\item A fairness guarantee: a \gls{thrd} that is ready to run will not be prevented from doing so by another thread.
+	\item A performance guarantee: a \gls{thrd} that wants to start or stop running will not be slowed down by other threads wanting to do the same.
+\end{enumerate}
+
+It is important to note that these guarantees are expected only up to a point. \Glspl{thrd} that are ready to run should not be prevented from doing so, but they still need to share a limited amount of hardware. Therefore, the guarantee is considered respected if a \gls{thrd} gets access to a \emph{fair share} of the hardware, even if that share is very small.
+
+Similarly for the performance guarantee, the lack of interference between threads is only relevant up to a point. Ideally the cost of running and blocking would be constant regardless of contention, but the guarantee is considered satisfied if the cost is not \emph{too high} with or without contention. How much is an acceptable cost is obviously highly variable. For this document the performance experimentation will attempt to show that the cost of scheduling is not a major factor in application performance. This demonstration can be made by comparing applications built in \CFA to applications built with other languages or other models. If the performance of an application built in \CFA is not meaningfully different than one built with a different runtime, then the scheduler has a negligible impact on performance, \ie its impact can be ignored. Recall from a few paragraphs ago that the expectation of programmers is that the impact of the scheduler can be ignored. Therefore, if the cost of scheduling is not a significant portion of the runtime of several different applications, I will consider the guarantee achieved.
+
+\todo{This paragraph should be moved later}
+% The next step is then to decide what is considered a \emph{fair share}, \ie what metric is used to measure fairness. Since \CFA is intended to allow numerous short lived threads, I decided to avoid total CPU time as the measure of fairness. Total CPU time inherently favors new \glspl{thrd} over older ones which isn't necessarily a good thing. Instead, fairness is measured in terms of opportunities to run. This metric is more appropriate for a mix of short and long lived \glspl{thrd}.
 
 \section{Design}
-While avoiding the pitfalls of Feedback Scheduling is fairly easy, scheduling does not innately require feedback, avoiding prioritization of \glspl{thrd} is more difficult because of implicitly priorities, see Subsection~\ref{priority}.
+While avoiding the pitfalls of Feedback Scheduling is fairly easy, scheduling does not innately require feedback, avoiding prioritization of \glspl{thrd} is more difficult because of implicit priorities, see Subsection~\ref{priority}. A strictly \glsxtrshort{fifo} rea
 
 \subsection{Sharding}
@@ -19,7 +38,8 @@
 		\input{base.pstex_t}
 	\end{center}
-	\caption{Relaxed FIFO list at the base of the scheduler: an array of strictly FIFO lists.
-	The timestamp is in all nodes and cell arrays.}
+	\caption{Relaxed FIFO list}
 	\label{fig:base}
+	List at the base of the scheduler: an array of strictly FIFO lists.
+	The timestamp is in all nodes and cell arrays.
 \end{figure}
 
@@ -28,12 +48,4 @@
 Indeed, if the number of \glspl{thrd} does not far exceed the number of queues, it is probable that several of these queues are empty.
 Figure~\ref{fig:empty} shows an example with 2 \glspl{thrd} running on 8 queues, where the chances of getting an empty queue is 75\% per pick, meaning two random picks yield a \gls{thrd} only half the time.
-This can lead to performance problems since picks that do not yield a \gls{thrd} are not useful and do not necessarily help make more informed guesses.
-
-Solutions to this problem can take many forms, but they ultimately all have to encode where the threads are in some form. My results show that the density and locality of this encoding is generally the dominating factor in these scheme.
-
-\paragraph{Dense Information}
-
-
-
 
 
@@ -42,5 +54,12 @@
 		\input{empty.pstex_t}
 	\end{center}
-	\caption{``More empty'' state of the queue: the array contains many empty cells.}
+	\caption{``More empty'' Relaxed FIFO list}
 	\label{fig:empty}
+	Emptier state of the queue: the array contains many empty cells, that is strictly FIFO lists containing no elements.
 \end{figure}
+
+This can lead to performance problems since picks that do not yield a \gls{thrd} are not useful and do not necessarily help make more informed guesses.
+
+Solutions to this problem can take many forms, but they ultimately all have to encode where the threads are in some form. My results show that the density and locality of this encoding is generally the dominating factor in these schemes.
+
+\paragraph{Dense Information}
Index: doc/theses/thierry_delisle_PhD/thesis/text/existing.tex
===================================================================
--- doc/theses/thierry_delisle_PhD/thesis/text/existing.tex	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
+++ doc/theses/thierry_delisle_PhD/thesis/text/existing.tex	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -0,0 +1,137 @@
+\chapter{Previous Work}\label{existing}
+Scheduling is a topic with a very long history, predating its use in computer science. As such, early work in computer science was inspired by other fields and focused principally on solving scheduling upfront rather than as the system is running.
+
+\section{Naming Convention}
+Scheduling has been studied by various different communities concentrating on different incarnations of the same problems. As a result, there is no real naming convention for scheduling that is respected across these communities. For this document, I will use the term \newterm{task} to refer to the abstract objects being scheduled and the term \newterm{worker} to refer to the objects which will execute these tasks.
+
+\section{Static Scheduling}
+Static schedulers require that programmers explicitly and exhaustively specify dependencies among tasks in order to schedule them. The scheduler then processes this input ahead of time and produces a \newterm{schedule} to which the system can later adhere. An example application for these schedulers
+
+In general, static schedulers are less relevant to this project since they require input from the programmers that \CFA does not have as part of its concurrency semantics.
+\todo{Rate-monotonic scheduling}
+
+
+\section{Dynamic Scheduling}
+It may be difficult to fulfill the requirements of a static scheduler if dependencies are conditional. In this case, it may be preferable to detect dependencies at runtime. This detection effectively takes the form of halting or suspending a task with unfulfilled dependencies and adding one or more new task(s) to the system. The new task(s) have the responsibility of adding the dependent task back in the system once completed. As a consequence, the scheduler may have an incomplete view of the system, seeing only tasks with no pending dependencies. Schedulers that support this detection at runtime are referred to as \newterm{Dynamic Schedulers}.
+
+\subsection{Explicitly Informed Dynamic Schedulers}
+While dynamic schedulers do not have access to an exhaustive list of dependencies for a task, they may require programmers to provide more or less information about each task, including for example: expected duration, required resources, relative importance, etc. The scheduler can then use this information to direct the scheduling decisions. \cit{Examples of schedulers with more information} Precisely providing this information can be difficult for programmers, especially \emph{predicted} behaviour, and the scheduler may need to support some amount of imprecision in the provided information. For example, specifying that a task takes approximately 5 seconds to complete, rather than exactly 5 seconds. User provided information can also become a significant burden depending on how the effort to provide the information scales with the number of tasks and their complexity. For example, providing an exhaustive list of files read by 5 tasks is an easier requirement than providing an exhaustive list of memory addresses accessed by 10'000 distinct tasks.
+
+Since the goal of this thesis is to provide a scheduler as a replacement for \CFA's existing \emph{uninformed} scheduler, Explicitly Informed schedulers are less relevant to this project. Nevertheless, some strategies are worth mentioning.
+
+\subsubsection{Priority Scheduling}
+A common piece of information that schedulers use to direct the algorithm is priorities. Each task is given a priority and higher-priority tasks are preferred to lower-priority ones. The simplest priority scheduling algorithm is to simply require that every task have a distinct pre-established priority and always run the available task with the highest priority. Asking programmers to provide an exhaustive set of unique priorities can be prohibitive when the system has a large number of tasks. It can therefore be desirable for schedulers to support tasks with identical priorities and/or automatically setting and adjusting priorities for tasks.
+
+\subsection{Uninformed and Self-Informed Dynamic Schedulers}
+Several scheduling algorithms do not require programmers to provide additional information on each task, and instead make scheduling decisions based solely on internal state and/or information implicitly gathered by the scheduler.
+
+
+\subsubsection{Feedback Scheduling}
+As mentioned, schedulers may also gather information about each task to direct their decisions. This design effectively moves the scheduler to some extent into the realm of \newterm{Control Theory}\cite{wiki:controltheory}. This gathering does not generally involve programmers and as such does not increase programmer burden the same way explicitly provided information may. However, some feedback schedulers do offer the option to programmers to offer additional information on certain tasks, in order to direct scheduling decisions. The important distinction being whether or not the scheduler can function without this additional information.
+
+Feedback scheduler
+
+
+\section{Work Stealing}
+One of the most popular scheduling algorithms in practice (see~\ref{existing:prod}) is work-stealing. This idea, introduced by \cite{DBLP:conf/fpca/BurtonS81}, effectively has each worker work on its local tasks first, but allows the possibility for other workers to steal local tasks if they run out of tasks. \cite{DBLP:conf/focs/Blumofe94} introduced the more familiar incarnation of this, where each worker has a queue of tasks to accomplish and workers without tasks steal tasks from random workers. (The Burton and Sleep algorithm had trees of tasks and stole only among neighbours.) Blumofe and Leiserson also prove worst case space and time requirements for well-structured computations.
+
+Many variations of this algorithm have been proposed over the years\cite{DBLP:journals/ijpp/YangH18}, both optimizations of existing implementations and approaches that account for new metrics.
+
+\paragraph{Granularity} A significant portion of early Work Stealing research was concentrating on \newterm{Implicit Parallelism}\cite{wiki:implicitpar}. Since the system was responsible to split the work, granularity is a challenge that cannot be left to the programmers (as opposed to \newterm{Explicit Parallelism}\cite{wiki:explicitpar} where the burden can be left to programmers). In general, fine granularity is better for load balancing and coarse granularity reduces communication overhead. The best performance generally means finding a middle ground between the two. Several methods can be employed, but I believe these are less relevant for threads, which are generally explicit and more coarse grained.
+
+\paragraph{Task Placement} Since modern computers rely heavily on cache hierarchies\cit{Do I need a citation for this}, migrating tasks from one core to another can be expensive~\cite{DBLP:journals/tpds/SquillanteL93}.
+
+\todo{The survey is not great on this subject}
+
+\paragraph{Complex Machine Architecture} Another aspect that has been looked at is how well Work Stealing is applicable to different machine architectures.
+
+\subsection{Theoretical Results}
+There is also a large body of research on the theoretical aspects of work stealing. These evaluate, for example, the cost of migration\cite{DBLP:conf/sigmetrics/SquillanteN91,DBLP:journals/pe/EagerLZ86}, how affinity affects performance\cite{DBLP:journals/tpds/SquillanteL93,DBLP:journals/mst/AcarBB02,DBLP:journals/ipl/SuksompongLS16} and theoretical models for heterogeneous systems\cite{DBLP:journals/jpdc/MirchandaneyTS90,DBLP:journals/mst/BenderR02,DBLP:conf/sigmetrics/GastG10}. \cite{DBLP:journals/jacm/BlellochGM99} examine the space bounds of Work Stealing and \cite{DBLP:journals/siamcomp/BerenbrinkFG03} show that for underloaded systems, the scheduler will complete computations in finite time, \ie is \newterm{stable}. Others show that Work-Stealing is applicable to various scheduling contexts\cite{DBLP:journals/mst/AroraBP01,DBLP:journals/anor/TchiboukdjianGT13,DBLP:conf/isaac/TchiboukdjianGTRB10,DBLP:conf/ppopp/AgrawalLS10,DBLP:conf/spaa/AgrawalFLSSU14}. \cite{DBLP:conf/ipps/ColeR13} also studied how Randomized Work Stealing affects false sharing among tasks.
+
+However, as \cite{DBLP:journals/ijpp/YangH18} highlights, it is worth mentioning that this theoretical research has mainly focused on ``fully-strict'' computations, \ie workloads that can be fully represented with a Directed Acyclic Graph. It is unclear how well these distributions represent workloads in real world scenarios.
+
+\section{Preemption}
+One last aspect of scheduling worth mentioning is preemption since many schedulers rely on it for some of their guarantees. Preemption is the idea of interrupting tasks that have been running for too long, effectively injecting suspend points in the applications. There are multiple techniques to achieve this but they all aim to have the effect of guaranteeing that suspend points in a task are never further apart than some fixed duration. While this helps schedulers guarantee that no tasks will unfairly monopolize a worker, preemption can effectively be added to any scheduler. Therefore, the only interesting aspect of preemption for the design of scheduling is whether or not to require it.
+
+\section{Schedulers in Production}\label{existing:prod}
+This section will show a quick overview of several schedulers which are generally available at the time of writing. While these schedulers don't necessarily represent the most recent advances in scheduling, they are what is generally accessible to programmers. As such, I believe that these schedulers are at least as relevant as those presented in published work. I chose both schedulers that operate in kernel space and in user space, as both can offer relevant insight for this project. However, I did not list any schedulers aimed for real-time applications, as these have constraints that are much stricter than what is needed for this project.
+
+\subsection{Operating System Schedulers}
+Operating System Schedulers tend to be fairly complex schedulers, they generally support some amount of real-time, aim to balance interactive and non-interactive tasks and support multiple users sharing hardware without requiring these users to cooperate. Here are more details on a few schedulers used in the common operating systems: Linux, FreeBSD, Microsoft Windows and Apple's OS X. The information is less complete for operating systems behind closed source.
+
+\paragraph{Linux's CFS}
+The default scheduler used by Linux (the Completely Fair Scheduler)\cite{MAN:linux/cfs,MAN:linux/cfs2} is a feedback scheduler based on CPU time. For each processor, it constructs a Red-Black tree of tasks waiting to run, ordering them by amount of CPU time spent. The scheduler schedules the task that has spent the least CPU time. It also supports the concept of \newterm{Nice values}, which are effectively multiplicative factors on the CPU time spent. The ordering of tasks is also impacted by a group based notion of fairness, where tasks belonging to groups having spent less CPU time are preferred to tasks belonging to groups having spent more CPU time. Linux achieves load-balancing by regularly monitoring the system state\cite{MAN:linux/cfs/balancing} and using some heuristic on the load (currently CPU time spent in the last millisecond plus a decayed version of the previous time slots\cite{MAN:linux/cfs/pelt}).
+
+\cite{DBLP:conf/eurosys/LoziLFGQF16} shows that Linux's CFS also does work-stealing to balance the workload of each processor, but the paper argues this aspect can be improved significantly. The issues highlighted seem to stem from Linux's need to support fairness across tasks \emph{and} across users\footnote{Enforcing fairness across users means, for example, that given two users: one with a single task and the other with one thousand tasks, the user with a single task does not receive one one thousandth of the CPU time.}, increasing the complexity.
+
+Linux also offers a FIFO scheduler, a real-time scheduler which runs the highest-priority task, and a round-robin scheduler, which is an extension of the FIFO scheduler that adds fixed time slices. \cite{MAN:linux/sched}
+
+\paragraph{FreeBSD}
+The ULE scheduler used in FreeBSD\cite{DBLP:conf/bsdcon/Roberson03} is a feedback scheduler similar to Linux's CFS. It uses different data structures and heuristics but also schedules according to some combination of CPU time spent and niceness values. It also periodically balances the load of the system (according to a different heuristic), but uses a simpler Work Stealing approach.
+
+\paragraph{Windows(OS)}
+Microsoft's Operating System's Scheduler\cite{MAN:windows/scheduler} is a feedback scheduler with priorities. It supports 32 levels of priorities, some of which are reserved for real-time and privileged applications. It schedules tasks based on the highest priorities (lowest number) and how much CPU time each task has used. The scheduler may also temporarily adjust priorities after certain effects like the completion of I/O requests.
+
+\todo{load balancing}
+
+\paragraph{Apple OS X}
+Apple programming documentation describes its kernel scheduler as follows:
+\begin{displayquote}
+	OS X is based on Mach and BSD. [...]. It contains an advanced scheduler based on the CMU Mach 3 scheduler. [...] Mach scheduling is based on a system of run queues at various priorities.
+
+	\begin{flushright}
+		-- Kernel Programming Guide \cite{MAN:apple/scheduler}
+	\end{flushright}
+\end{displayquote}
+
+\todo{load balancing}
+
+\subsection{User-Level Schedulers}
+By comparison, user level schedulers tend to be simpler, gathering fewer metrics and avoiding complex notions of fairness. Part of the simplicity is due to the fact that all tasks have the same user, and therefore cooperation is both feasible and probable.
+\paragraph{Go}
+Go's scheduler uses a Randomized Work Stealing algorithm that has a global runqueue(\emph{GRQ}) and each processor(\emph{P}) has both a fixed-size runqueue(\emph{LRQ}) and a high-priority next ``chair'' holding a single element.\cite{GITHUB:go,YTUBE:go} Preemption is present, but only at function call boundaries.
+
+The algorithm is as follows :
+\begin{enumerate}
+	\item Once out of 61 times, directly pick 1 element from the \emph{GRQ}.
+	\item If there is an item in the ``chair'' pick it.
+	\item Else pick an item from the \emph{LRQ}.
+	\item If it was empty steal (len(\emph{GRQ}) / \#of\emph{P}) + 1 items (max 256) from the \emph{GRQ}.
+	\item If it was empty steal \emph{half} the \emph{LRQ} of another \emph{P} chosen randomly.
+\end{enumerate}
+
+\paragraph{Erlang}
+Erlang is a functional language that supports concurrency in the form of processes, threads that share no data. It seems to be some kind of Round-Robin Scheduler. It currently uses some mix of Work Sharing and Work Stealing to achieve load balancing\cite{:erlang}, where underloaded workers steal from other workers, but overloaded workers also push work to other workers. This migration logic seems to be directed by monitoring logic that evaluates the load a few times per second.
+
+\paragraph{Intel\textregistered ~Threading Building Blocks}
+\newterm{Threading Building Blocks}(TBB) is Intel's task parallelism\cite{wiki:taskparallel} framework. It runs tasks or \newterm{jobs}, schedulable objects that must always run to completion, on a pool of worker threads. TBB's scheduler is a variation of Randomized Work Stealing that also supports higher-priority graph-like dependencies\cite{MAN:tbb/scheduler}. It schedules tasks as follows (where \textit{t} is the last task completed):
+\begin{displayquote}
+	\begin{enumerate}
+		\item The task returned by \textit{t}\texttt{.execute()}
+		\item The successor of \textit{t} if \textit{t} was its last completed predecessor.
+		\item A task popped from the end of the thread’s own deque.
+		\item A task with affinity for the thread.
+		\item A task popped from approximately the beginning of the shared queue.
+		\item A task popped from the beginning of another randomly chosen thread’s deque.
+	\end{enumerate}
+
+	\begin{flushright}
+		-- Intel\textregistered ~TBB documentation \cite{MAN:tbb/scheduler}
+	\end{flushright}
+\end{displayquote}
+
+\paragraph{Quasar/Project Loom}
+Java has two projects that are attempting to introduce lightweight threading into Java in the form of Fibers, Quasar\cite{MAN:quasar} and Project Loom\cite{MAN:project-loom}\footnote{It is unclear to me if these are distinct projects or not}. Both projects seem to be based on the \texttt{ForkJoinPool} in Java which appears to be a simple incarnation of Randomized Work Stealing\cite{MAN:java/fork-join}.
+
+\paragraph{Grand Central Dispatch}
+This is an API produced by Apple\cit{Official GCD source} that offers task parallelism\cite{wiki:taskparallel}. Its distinctive aspect is that it uses multiple ``Dispatch Queues'', some of which are created by programmers. These queues each have their own local ordering guarantees, \eg tasks on queue $A$ are executed in \emph{FIFO} order.
+
+\todo{load balancing and scheduling}
+
+% http://web.archive.org/web/20090920043909/http://images.apple.com/macosx/technology/docs/GrandCentral_TB_brief_20090903.pdf
+
+In terms of semantics, the Dispatch Queues seem to be very similar to Intel\textregistered ~TBB \texttt{execute()} and predecessor semantics, to the point where it would be possible to convert from one to the other.
+
+\paragraph{LibFibre}
+LibFibre\cite{DBLP:journals/pomacs/KarstenB20} is a light-weight user-level threading framework developed at the University of Waterloo. Similarly to Go, it uses a variation of Work Stealing with a global queue that is higher priority than stealing. Unlike Go, it does not have the high-priority next ``chair'' and does not use Randomized Work Stealing.
+
Index: doc/theses/thierry_delisle_PhD/thesis/text/front.tex
===================================================================
--- doc/theses/thierry_delisle_PhD/thesis/text/front.tex	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ doc/theses/thierry_delisle_PhD/thesis/text/front.tex	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -184,6 +184,10 @@
 \phantomsection		% allows hyperref to link to the correct page
 
+% TODOs and missing citations
+% -----------------------------
 \listofcits
 \listoftodos
+\cleardoublepage
+\phantomsection		% allows hyperref to link to the correct page
 
 
Index: doc/theses/thierry_delisle_PhD/thesis/text/intro.tex
===================================================================
--- doc/theses/thierry_delisle_PhD/thesis/text/intro.tex	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ doc/theses/thierry_delisle_PhD/thesis/text/intro.tex	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -1,1 +1,9 @@
-\chapter{Introduction}
+\chapter*{Introduction}\label{intro}
+\todo{A proper intro}
+
+The C programming language\cit{C}
+
+The \CFA programming language\cite{cfa:frontpage,cfa:typesystem} extends the C programming language to add modern safety and productivity features while maintaining backwards compatibility. Among its productivity features, \CFA introduces support for threading\cit{CFA Concurrency}, to allow programmers to write modern concurrent and parallel programs.
+While previous work on the concurrent package of \CFA focused on features and interfaces, this thesis focuses on performance, introducing \glsxtrshort{api} changes only when required by performance considerations. More specifically, this thesis concentrates on scheduling and \glsxtrshort{io}. Prior to this work, the \CFA runtime used a strictly \glsxtrshort{fifo} \gls{rQ}.
+
+This work exclusively concentrates on Linux as its operating system since the existing \CFA runtime and compiler do not already support other operating systems. Furthermore, as \CFA is yet to be released, supporting versions of Linux older than the latest version is not a goal of this work.
Index: doc/theses/thierry_delisle_PhD/thesis/text/io.tex
===================================================================
--- doc/theses/thierry_delisle_PhD/thesis/text/io.tex	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ doc/theses/thierry_delisle_PhD/thesis/text/io.tex	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -1,5 +1,7 @@
-\chapter{I/O}
+\chapter{User Level \glsxtrshort{io}}
+As mentioned in Section~\ref{prev:io}, User-Level \glsxtrshort{io} requires multiplexing the \glsxtrshort{io} operations of many \glspl{thrd} onto fewer \glspl{proc} using asynchronous \glsxtrshort{io} operations. Various operating systems offer various forms of asynchronous operations and as mentioned in Chapter~\ref{intro}, this work is exclusively focused on Linux.
 
 \section{Existing options}
+Since \glsxtrshort{io} operations are generally handled by the
 
 \subsection{\texttt{epoll}, \texttt{poll} and \texttt{select}}
@@ -7,7 +9,32 @@
 \subsection{Linux's AIO}
 
+
+
+\begin{displayquote}
+	AIO is a horrible ad-hoc design, with the main excuse being "other,
+	less gifted people, made that design, and we are implementing it for
+	compatibility because database people - who seldom have any shred of
+	taste - actually use it".
+
+	But AIO was always really really ugly.
+
+	\begin{flushright}
+		-- Linus Torvalds\cit{https://lwn.net/Articles/671657/}
+	\end{flushright}
+\end{displayquote}
+
+Interestingly, in this e-mail answer, Linus goes on to describe
+``a true \textit{asynchronous system call} interface''
+that does
+``[an] arbitrary system call X with arguments A, B, C, D asynchronously using a kernel thread''
+in
+``some kind of arbitrary \textit{queue up asynchronous system call} model''.
+This description is actually quite close to the interface described in the next section.
+
 \subsection{\texttt{io\_uring}}
+A very recent addition to Linux, \texttt{io\_uring}\cit{io\_uring} is a framework that aims to solve many of the problems listed with the above mentioned solutions.
 
-\subsection{Extra Kernel Threads}
+\subsection{Extra Kernel Threads}\label{io:morethreads}
+Finally, if the operating system does not offer any satisfying forms of asynchronous \glsxtrshort{io} operations, a solution is to fake it by creating a pool of \glspl{kthrd} and delegating operations to them in order to avoid blocking \glspl{proc}.
 
 \subsection{Discussion}
Index: doc/theses/thierry_delisle_PhD/thesis/text/practice.tex
===================================================================
--- doc/theses/thierry_delisle_PhD/thesis/text/practice.tex	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ doc/theses/thierry_delisle_PhD/thesis/text/practice.tex	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -2,9 +2,5 @@
 The scheduling algorithm discribed in Chapter~\ref{core} addresses scheduling in a stable state.
 However, it does not address problems that occur when the system changes state.
-Indeed the \CFA runtime, supports expanding and shrinking
-
-the number of KTHREAD\_place
-
-, both manually and, to some extent automatically.
+Indeed, the \CFA runtime supports expanding and shrinking the number of KTHREAD\_place \todo{add kthrd to glossary}, both manually and, to some extent, automatically.
 This entails that the scheduling algorithm must support these transitions.
 
Index: doc/theses/thierry_delisle_PhD/thesis/text/runtime.tex
===================================================================
--- doc/theses/thierry_delisle_PhD/thesis/text/runtime.tex	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ doc/theses/thierry_delisle_PhD/thesis/text/runtime.tex	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -1,7 +1,44 @@
 \chapter{\CFA Runtime}
+This chapter offers an overview of the capabilities of the \CFA runtime prior to this work.
 
-\section{M:N Threading}
+Threading in \CFA is based on \Gls{uthrding}, where \glspl{thrd} are the representation of a unit of work. As such, \CFA programmers should expect these units to be fairly inexpensive, that is: programmers should be able to create a large number of \glspl{thrd} and switch between \glspl{thrd} liberally without many concerns for performance.
+
+\section{M:N Threading}\label{prev:model}
+
+C traditionally uses a 1:1 threading model. This model uses \glspl{kthrd} to achieve parallelism and concurrency. In this model, every thread of computation maps to an object in the kernel. The kernel then has the responsibility of managing these threads, \eg creating, scheduling, blocking. This also entails that the kernel has a perfect view of every thread executing in the system\footnote{This is not completely true due to primitives like \texttt{futex}es, which have a significant portion of their logic in user space.}.
+
+By contrast \CFA uses an M:N threading model, where concurrency is achieved using many user-level threads mapped onto fewer \glspl{kthrd}. The user-level threads have the same semantic meaning as \glspl{kthrd} in the 1:1 model: they represent an independent thread of execution with its own stack. The difference is that user-level threads do not have a corresponding object in the kernel; they are handled by the runtime in user space and scheduled onto \glspl{kthrd}, referred to as \glspl{proc} in this document. \Glspl{proc} run a \gls{thrd} until it context switches out, then choose a different \gls{thrd} to run.
 
 \section{Clusters}
+\begin{figure}
+	\begin{center}
+		\input{system.pstex_t}
+	\end{center}
+	\caption{Overview of the \CFA runtime}
+	\label{fig:system}
+	\Glspl{thrd} are scheduled inside a particular cluster, where they only run on the \glspl{proc} which belong to the cluster. The discrete-event manager, which handles preemption and timeout, is a \gls{kthrd} which lives outside any cluster and does not run \glspl{thrd}.
+\end{figure}
+\CFA allows the option to group user-level threading, in the form of clusters. Both \glspl{thrd} and \glspl{proc} belong to a specific cluster. \Glspl{thrd} will only be scheduled onto \glspl{proc} in the same cluster and scheduling is done independently of other clusters. Figure~\ref{fig:system} shows an overview of this system. This allows programmers to control parallelism more tightly. It also opens the door to handling effects like NUMA, by pinning clusters to specific NUMA nodes\footnote{This is not currently implemented in \CFA, but the only hurdle left is creating a generic interface for cpu masks.}.
+
+\section{Scheduling}
+The \CFA runtime was previously using a strictly \glsxtrshort{fifo} ready queue with a single lock. This setup offers perfect fairness in terms of opportunities to run. However, it offers poor scalability, since the performance of the ready queue can never be improved by adding more \glspl{hthrd}, but the contention can cause significant performance degradation.
+
+\section{\glsxtrshort{io}}\label{prev:io}
+Prior to this work, the \CFA runtime did not add any particular support for \glsxtrshort{io} operations. \CFA being built on C, this means that, while all the operations available in C are available in \CFA, \glsxtrshort{io} operations are designed for the POSIX threading model\cit{pthreads}. Using these operations in an M:N threading model, when they are built for 1:1 threading, means that operations block \glspl{proc} instead of \glspl{thrd}. While this can work in certain cases, it limits the number of concurrent operations to the number of \glspl{proc} rather than \glspl{thrd}. This also means that deadlocks can occur because all \glspl{proc} are blocked even if at least one \gls{thrd} is ready to run. A simple example of this type of deadlock would be as follows:
+
+Given a simple network program with 2 \glspl{thrd} and a single \gls{proc}, one \gls{thrd} sends network requests to a server and the other \gls{thrd} waits for responses from the server. If the second \gls{thrd} races ahead, it may wait for responses to requests that have not been sent yet. In theory, this should not be a problem, even if the second \gls{thrd} waits, the first \gls{thrd} is still ready to run and should just be able to get CPU time and send the request. In practice with M:N threading, while the first \gls{thrd} is ready, the lone \gls{proc} in this example will \emph{not} try to run the first \gls{thrd} if it is blocked in the \glsxtrshort{io} operation of the second \gls{thrd}. If this happens, the system is effectively deadlocked\footnote{In this example, the deadlock could be resolved if the server sends unprompted messages to the client. However, this solution is not general and may not be appropriate even in this simple case.}.
+
+One of the objectives of this work is to introduce \emph{User-Level \glsxtrshort{io}} which, as a parallel to \glslink{uthrding}{User-Level \emph{Threading}}, blocks \glspl{thrd} rather than \glspl{proc} when doing \glsxtrshort{io} operations. This entails multiplexing the \glsxtrshort{io} operations of many \glspl{thrd} onto fewer \glspl{proc}. This multiplexing requires that a single \gls{proc} be able to execute multiple operations in parallel. This cannot be done with operations that block \glspl{proc}, \ie \glspl{kthrd}, since the first operation would prevent starting new operations for its duration. Executing operations in parallel requires \emph{asynchronous} \glsxtrshort{io}, sometimes referred to as \emph{non-blocking}, since the \gls{kthrd} is not blocked.
 
 \section{Interoperating with \texttt{C}}
+While \glsxtrshort{io} operations are the classical example of operations that block \glspl{kthrd}, the challenges mentioned in the previous section do not require \glsxtrshort{io} to be involved. These challenges are a product of blocking system calls rather than \glsxtrshort{io}. C offers no tools to identify whether or not a library function will lead to a blocking system call. This fact means interoperability with C becomes a challenge in an M:N threading model.
+
+Languages like Go and Java, which have strict interoperability with C\cit{JNI, GoLang with C}, can control operations in C by ``sandboxing'' them. They can, for example, delegate C operations to \glspl{kthrd} that are not \glspl{proc}. Sandboxing may help towards guaranteeing that the deadlocks mentioned in the previous section do not occur.
+
+As mentioned in Section~\cit{\CFA intro}, \CFA is binary compatible with C and, as such, trivially supports calls to and from C libraries. Furthermore, interoperability can happen within a single library, through inline code or simply C and \CFA translation units archived together. The fine-grained interoperability between C and \CFA has two consequences:
+\begin{enumerate}
+	\item Precisely identifying C calls that could block is difficult.
+	\item Introducing code where interoperability occurs could have a significant impact on general performance.
+\end{enumerate}
+
+Because of these consequences, this work does not attempt to ``sandbox'' calls to C. It is possible that conflicting calls to C could lead to deadlocks on \CFA's M:N threading model where they would not in the traditional 1:1 threading model. However, I judge that solving this problem in general, in a way that is composable and flexible, is too complex in itself and would add too much work to this thesis. Therefore it is outside the scope of this thesis.
Index: doc/theses/thierry_delisle_PhD/thesis/thesis.tex
===================================================================
--- doc/theses/thierry_delisle_PhD/thesis/thesis.tex	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ doc/theses/thierry_delisle_PhD/thesis/thesis.tex	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -121,4 +121,7 @@
 % installation instructions there.
 
+\usepackage{csquotes}
+\usepackage{indentfirst} % as any self-respecting frenchman would
+
 % Setting up the page margins...
 % uWaterloo thesis requirements specify a minimum of 1 inch (72pt) margin at the
@@ -218,9 +221,17 @@
 % separate documents, they would each start with the \chapter command, i.e,
 % do not contain \documentclass or \begin{document} and \end{document} commands.
+\part{Introduction}
 \input{text/intro.tex}
+\input{text/existing.tex}
 \input{text/runtime.tex}
+\part{Design}
 \input{text/core.tex}
 \input{text/practice.tex}
 \input{text/io.tex}
+\part{Evaluation}
+\chapter{Theoretical and Existence Proofs}
+\chapter{Micro-Benchmarks}
+\chapter{Larger-Scale applications}
+\part{Conclusion \& Annexes}
 
 %----------------------------------------------------------------------
@@ -245,11 +256,11 @@
 \addcontentsline{toc}{chapter}{\textbf{References}}
 
-\bibliography{uw-ethesis}
+\bibliography{local}
 % Tip 5: You can create multiple .bib files to organize your references.
 % Just list them all in the \bibliogaphy command, separated by commas (no spaces).
 
-% The following statement causes the specified references to be added to the bibliography% even if they were not
-% cited in the text. The asterisk is a wildcard that causes all entries in the bibliographic database to be included (optional).
-\nocite{*}
+% % The following statement causes the specified references to be added to the bibliography% even if they were not
+% % cited in the text. The asterisk is a wildcard that causes all entries in the bibliographic database to be included (optional).
+% \nocite{*}
 
 % The \appendix statement indicates the beginning of the appendices.
Index: libcfa/prelude/builtins.c
===================================================================
--- libcfa/prelude/builtins.c	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ libcfa/prelude/builtins.c	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -9,8 +9,10 @@
 // Author           : Peter A. Buhr
 // Created On       : Fri Jul 21 16:21:03 2017
-// Last Modified By : Peter A. Buhr
-// Last Modified On : Fri Oct  9 18:26:19 2020
-// Update Count     : 110
+// Last Modified By : Andrew Beach
+// Last Modified On : Tue Oct 27 14:42:00 2020
+// Update Count     : 111
 //
+
+#define __cforall_builtins__
 
 // type that wraps a pointer and a destructor-like function - used in generating implicit destructor calls for struct members in user-defined functions
Index: libcfa/src/concurrency/clib/cfathread.cfa
===================================================================
--- libcfa/src/concurrency/clib/cfathread.cfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ libcfa/src/concurrency/clib/cfathread.cfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -59,5 +59,5 @@
 	void cfathread_setproccnt( int ncnt ) {
 		assert( ncnt >= 1 );
-		adelete(proc_cnt, procs);
+		adelete( procs );
 
 		proc_cnt = ncnt - 1;
Index: libcfa/src/concurrency/coroutine.cfa
===================================================================
--- libcfa/src/concurrency/coroutine.cfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ libcfa/src/concurrency/coroutine.cfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -10,6 +10,6 @@
 // Created On       : Mon Nov 28 12:27:26 2016
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Tue May 26 22:06:09 2020
-// Update Count     : 21
+// Last Modified On : Fri Oct 23 23:05:24 2020
+// Update Count     : 22
 //
 
@@ -24,12 +24,8 @@
 #include <unistd.h>
 #include <sys/mman.h>									// mprotect
-extern "C" {
-// use this define to make unwind.h play nice, definitely a hack
-#define HIDE_EXPORTS
 #include <unwind.h>
-#undef HIDE_EXPORTS
-}
 
 #include "kernel_private.hfa"
+#include "exception.hfa"
 
 #define __CFA_INVOKE_PRIVATE__
@@ -49,10 +45,4 @@
 FORALL_DATA_INSTANCE(CoroutineCancelled, (dtype coroutine_t), (coroutine_t))
 
-struct __cfaehm_node {
-	struct _Unwind_Exception unwind_exception;
-	struct __cfaehm_node * next;
-	int handler_index;
-};
-
 forall(dtype T)
 void mark_exception(CoroutineCancelled(T) *) {}
@@ -60,4 +50,5 @@
 forall(dtype T)
 void copy(CoroutineCancelled(T) * dst, CoroutineCancelled(T) * src) {
+	dst->virtual_table = src->virtual_table;
 	dst->the_coroutine = src->the_coroutine;
 	dst->the_exception = src->the_exception;
@@ -74,5 +65,5 @@
 	verify( desc->cancellation );
 	desc->state = Cancelled;
-	exception_t * except = (exception_t *)(1 + (__cfaehm_node *)desc->cancellation);
+	exception_t * except = __cfaehm_cancellation_exception( desc->cancellation );
 
 	// TODO: Remove explitate vtable set once trac#186 is fixed.
@@ -92,5 +83,5 @@
 
 // minimum feasible stack size in bytes
-#define MinStackSize 1000
+static const size_t MinStackSize = 1000;
 extern size_t __page_size;				// architecture pagesize HACK, should go in proper runtime singleton
 
@@ -217,5 +208,5 @@
 		size = libFloor(create_size - stack_data_size - diff, libAlign());
 	} // if
-	assertf( size >= MinStackSize, "Stack size %zd provides less than minimum of %d bytes for a stack.", size, MinStackSize );
+	assertf( size >= MinStackSize, "Stack size %zd provides less than minimum of %zd bytes for a stack.", size, MinStackSize );
 
 	this->storage = (__stack_t *)((intptr_t)storage + size);
Index: libcfa/src/concurrency/exception.cfa
===================================================================
--- libcfa/src/concurrency/exception.cfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ libcfa/src/concurrency/exception.cfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -10,20 +10,18 @@
 // Created On       : Mon Aug 17 10:41:00 2020
 // Last Modified By : Andrew Beach
-// Last Modified On : Tue Aug 25 14:41:00 2020
-// Update Count     : 0
+// Last Modified On : Wed Oct 28 14:34:00 2020
+// Update Count     : 1
 //
 
-extern "C" {
-// use this define to make unwind.h play nice, definitely a hack
-#define HIDE_EXPORTS
-#include <unwind.h>
-#undef HIDE_EXPORTS
-}
+#define __cforall_thread__
 
-#include "invoke.h"
 #include "exception.hfa"
+
 #include "coroutine.hfa"
 
 extern struct $thread * mainThread;
+extern "C" {
+extern void __cfactx_thrd_leave();
+}
 
 // Common pattern for all the stop functions, wait until the end then act.
@@ -52,6 +50,6 @@
 
 STOP_AT_END_FUNCTION(thread_cancelstop,
-	// TODO: Instead pass information to the joiner.
-	abort();
+	__cfactx_thrd_leave();
+	__cabi_abort( "Resumed cancelled thread" );
 )
 
@@ -85,4 +83,6 @@
 		stop_param = (void *)0x22;
 	} else {
+		this_thread->self_cor.cancellation = unwind_exception;
+
 		stop_func = thread_cancelstop;
 		stop_param = this_thread;
Index: libcfa/src/concurrency/exception.hfa
===================================================================
--- libcfa/src/concurrency/exception.hfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ libcfa/src/concurrency/exception.hfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -16,13 +16,14 @@
 #pragma once
 
+// This is an internal bridge between the two modes and must be C compatible.
+
+#include <unwind.h>
 #include "bits/defs.hfa"
 #include "invoke.h"
+#include "exception.h"
 
 #ifdef __cforall
 extern "C" {
-
-#define HIDE_EXPORTS
 #endif
-#include "unwind.h"
 
 struct exception_context_t * this_exception_context(void) OPTIONAL_THREAD;
@@ -32,5 +33,4 @@
 
 #ifdef __cforall
-#undef HIDE_EXPORTS
 }
 #endif
Index: libcfa/src/concurrency/invoke.h
===================================================================
--- libcfa/src/concurrency/invoke.h	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ libcfa/src/concurrency/invoke.h	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -157,4 +157,8 @@
 
 		// current execution status for coroutine
+		// Possible values are:
+		//    - TICKET_BLOCKED (-1) thread is blocked
+		//    - TICKET_RUNNING ( 0) thread is running
+		//    - TICKET_UNBLOCK ( 1) thread should ignore next block
 		volatile int ticket;
 		enum __Coroutine_State state:8;
Index: libcfa/src/concurrency/io/call.cfa.in
===================================================================
--- libcfa/src/concurrency/io/call.cfa.in	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ libcfa/src/concurrency/io/call.cfa.in	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -47,37 +47,30 @@
 	#include "kernel/fwd.hfa"
 
-	#if defined(CFA_HAVE_IOSQE_FIXED_FILE) && defined(CFA_HAVE_IOSQE_IO_DRAIN) && defined(CFA_HAVE_IOSQE_ASYNC)
-		#define REGULAR_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_DRAIN | IOSQE_ASYNC)
-	#elif defined(CFA_HAVE_IOSQE_FIXED_FILE) && defined(CFA_HAVE_IOSQE_ASYNC)
-		#define REGULAR_FLAGS (IOSQE_FIXED_FILE | IOSQE_ASYNC)
-	#elif defined(CFA_HAVE_IOSQE_FIXED_FILE) && defined(CFA_HAVE_IOSQE_IO_DRAIN)
-		#define REGULAR_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_DRAIN)
-	#elif defined(CFA_HAVE_IOSQE_IO_DRAIN) && defined(CFA_HAVE_IOSQE_ASYNC)
-		#define REGULAR_FLAGS (IOSQE_IO_DRAIN | IOSQE_ASYNC)
-	#elif defined(CFA_HAVE_IOSQE_FIXED_FILE)
-		#define REGULAR_FLAGS (IOSQE_FIXED_FILE)
-	#elif defined(CFA_HAVE_IOSQE_IO_DRAIN)
-		#define REGULAR_FLAGS (IOSQE_IO_DRAIN)
-	#elif defined(CFA_HAVE_IOSQE_ASYNC)
-		#define REGULAR_FLAGS (IOSQE_ASYNC)
-	#else
-		#define REGULAR_FLAGS (0)
-	#endif
-
-	#if defined(CFA_HAVE_IOSQE_IO_LINK) && defined(CFA_HAVE_IOSQE_IO_HARDLINK)
-		#define LINK_FLAGS (IOSQE_IO_LINK | IOSQE_IO_HARDLINK)
-	#elif defined(CFA_HAVE_IOSQE_IO_LINK)
-		#define LINK_FLAGS (IOSQE_IO_LINK)
-	#elif defined(CFA_HAVE_IOSQE_IO_HARDLINK)
-		#define LINK_FLAGS (IOSQE_IO_HARDLINK)
-	#else
-		#define LINK_FLAGS (0)
-	#endif
-
-	#if defined(CFA_HAVE_SPLICE_F_FD_IN_FIXED)
-		#define SPLICE_FLAGS (SPLICE_F_FD_IN_FIXED)
-	#else
-		#define SPLICE_FLAGS (0)
-	#endif
+	static const __u8 REGULAR_FLAGS = 0
+		#if defined(CFA_HAVE_IOSQE_FIXED_FILE)
+			| IOSQE_FIXED_FILE
+		#endif
+		#if defined(CFA_HAVE_IOSQE_IO_DRAIN)
+			| IOSQE_IO_DRAIN
+		#endif
+		#if defined(CFA_HAVE_IOSQE_ASYNC)
+			| IOSQE_ASYNC
+		#endif
+	;
+
+	static const __u32 LINK_FLAGS = 0
+		#if defined(CFA_HAVE_IOSQE_IO_LINK)
+			| IOSQE_IO_LINK
+		#endif
+		#if defined(CFA_HAVE_IOSQE_IO_HARDLINK)
+			| IOSQE_IO_HARDLINK
+		#endif
+	;
+
+	static const __u32 SPLICE_FLAGS = 0
+		#if defined(CFA_HAVE_SPLICE_F_FD_IN_FIXED)
+			| SPLICE_F_FD_IN_FIXED
+		#endif
+	;
 
 	extern [* struct io_uring_sqe, __u32] __submit_alloc( struct __io_data & ring, __u64 data );
Index: libcfa/src/concurrency/io/setup.cfa
===================================================================
--- libcfa/src/concurrency/io/setup.cfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ libcfa/src/concurrency/io/setup.cfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -149,4 +149,5 @@
 		id.full_proc = false;
 		id.id = doregister(&id);
+		kernelTLS.this_proc_id = &id;
 		__cfaabi_dbg_print_safe( "Kernel : IO poller thread starting\n" );
 
@@ -180,5 +181,5 @@
 					kernelTLS.this_stats = io_ctx->self.curr_cluster->stats;
 				#endif
-				__post( io_ctx->sem, &id );
+				post( io_ctx->sem );
 			}
 		}
@@ -235,5 +236,5 @@
 			if( thrd.state == Ready || thrd.preempted != __NO_PREEMPTION ) {
 
-				ready_schedule_lock( (struct __processor_id_t *)active_processor() );
+				ready_schedule_lock();
 
 					// This is the tricky case
@@ -250,8 +251,8 @@
 					// Fixup the thread state
 					thrd.state = Blocked;
-					thrd.ticket = 0;
+					thrd.ticket = TICKET_BLOCKED;
 					thrd.preempted = __NO_PREEMPTION;
 
-				ready_schedule_unlock( (struct __processor_id_t *)active_processor() );
+				ready_schedule_unlock();
 
 				// Pretend like the thread was blocked all along
@@ -275,5 +276,5 @@
 			}
 		} else {
-			unpark( &thrd );
+			post( this.thrd.sem );
 		}
 
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ libcfa/src/concurrency/kernel.cfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -108,5 +108,5 @@
 static $thread * __next_thread_slow(cluster * this);
 static void __run_thread(processor * this, $thread * dst);
-static void __wake_one(struct __processor_id_t * id, cluster * cltr);
+static void __wake_one(cluster * cltr);
 
 static void push  (__cluster_idles & idles, processor & proc);
@@ -252,4 +252,5 @@
 		/* paranoid */ verify( kernelTLS.this_thread == thrd_dst );
 		/* paranoid */ verify( thrd_dst->context.SP );
+		/* paranoid */ verify( thrd_dst->state != Halted );
 		/* paranoid */ verifyf( ((uintptr_t)thrd_dst->context.SP) < ((uintptr_t)__get_stack(thrd_dst->curr_cor)->base ) || thrd_dst->curr_cor == proc_cor, "ERROR : Destination $thread %p has been corrupted.\n StackPointer too small.\n", thrd_dst ); // add escape condition if we are setting up the processor
 		/* paranoid */ verifyf( ((uintptr_t)thrd_dst->context.SP) > ((uintptr_t)__get_stack(thrd_dst->curr_cor)->limit) || thrd_dst->curr_cor == proc_cor, "ERROR : Destination $thread %p has been corrupted.\n StackPointer too large.\n", thrd_dst ); // add escape condition if we are setting up the processor
@@ -281,5 +282,5 @@
 		if(unlikely(thrd_dst->preempted != __NO_PREEMPTION)) {
 			// The thread was preempted, reschedule it and reset the flag
-			__schedule_thread( (__processor_id_t*)this, thrd_dst );
+			__schedule_thread( thrd_dst );
 			break RUNNING;
 		}
@@ -287,7 +288,6 @@
 		if(unlikely(thrd_dst->state == Halted)) {
 			// The thread has halted, it should never be scheduled/run again
-			// We may need to wake someone up here since
-			unpark( this->destroyer );
-			this->destroyer = 0p;
+			// finish the thread
+			__thread_finish( thrd_dst );
 			break RUNNING;
 		}
@@ -299,8 +299,8 @@
 		int old_ticket = __atomic_fetch_sub(&thrd_dst->ticket, 1, __ATOMIC_SEQ_CST);
 		switch(old_ticket) {
-			case 1:
+			case TICKET_RUNNING:
 				// This is case 1, the regular case, nothing more is needed
 				break RUNNING;
-			case 2:
+			case TICKET_UNBLOCK:
 				// This is case 2, the racy case, someone tried to run this thread before it finished blocking
 				// In this case, just run it again.
@@ -358,8 +358,9 @@
 // Scheduler routines
 // KERNEL ONLY
-void __schedule_thread( struct __processor_id_t * id, $thread * thrd ) {
+void __schedule_thread( $thread * thrd ) {
 	/* paranoid */ verify( thrd );
 	/* paranoid */ verify( thrd->state != Halted );
 	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( kernelTLS.this_proc_id );
 	/* paranoid */ #if defined( __CFA_WITH_VERIFY__ )
 	/* paranoid */ 	if( thrd->state == Blocked || thrd->state == Start ) assertf( thrd->preempted == __NO_PREEMPTION,
@@ -374,8 +375,8 @@
 	if (thrd->preempted == __NO_PREEMPTION) thrd->state = Ready;
 
-	ready_schedule_lock  ( id );
+	ready_schedule_lock();
 		push( thrd->curr_cluster, thrd );
-		__wake_one(id, thrd->curr_cluster);
-	ready_schedule_unlock( id );
+		__wake_one(thrd->curr_cluster);
+	ready_schedule_unlock();
 
 	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
@@ -384,11 +385,13 @@
 // KERNEL ONLY
 static inline $thread * __next_thread(cluster * this) with( *this ) {
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-
-	ready_schedule_lock  ( (__processor_id_t*)kernelTLS.this_processor );
+	/* paranoid */ verify( kernelTLS.this_proc_id );
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+
+	ready_schedule_lock();
 		$thread * thrd = pop( this );
-	ready_schedule_unlock( (__processor_id_t*)kernelTLS.this_processor );
-
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	ready_schedule_unlock();
+
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( kernelTLS.this_proc_id );
 	return thrd;
 }
@@ -396,40 +399,44 @@
 // KERNEL ONLY
 static inline $thread * __next_thread_slow(cluster * this) with( *this ) {
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-
-	ready_schedule_lock  ( (__processor_id_t*)kernelTLS.this_processor );
+	/* paranoid */ verify( kernelTLS.this_proc_id );
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+
+	ready_schedule_lock();
 		$thread * thrd = pop_slow( this );
-	ready_schedule_unlock( (__processor_id_t*)kernelTLS.this_processor );
-
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	ready_schedule_unlock();
+
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( kernelTLS.this_proc_id );
 	return thrd;
 }
 
-// KERNEL ONLY unpark with out disabling interrupts
-void __unpark(  struct __processor_id_t * id, $thread * thrd ) {
+void unpark( $thread * thrd ) {
+	if( !thrd ) return;
+
+	/* paranoid */ verify( kernelTLS.this_proc_id );
+	bool full = kernelTLS.this_proc_id->full_proc;
+	if(full) disable_interrupts();
+
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
 	int old_ticket = __atomic_fetch_add(&thrd->ticket, 1, __ATOMIC_SEQ_CST);
 	switch(old_ticket) {
-		case 1:
+		case TICKET_RUNNING:
 			// Wake won the race, the thread will reschedule/rerun itself
 			break;
-		case 0:
+		case TICKET_BLOCKED:
 			/* paranoid */ verify( ! thrd->preempted != __NO_PREEMPTION );
 			/* paranoid */ verify( thrd->state == Blocked );
 
 			// Wake lost the race,
-			__schedule_thread( id, thrd );
+			__schedule_thread( thrd );
 			break;
 		default:
 			// This makes no sense, something is wrong abort
-			abort();
-	}
-}
-
-void unpark( $thread * thrd ) {
-	if( !thrd ) return;
-
-	disable_interrupts();
-	__unpark( (__processor_id_t*)kernelTLS.this_processor, thrd );
-	enable_interrupts( __cfaabi_dbg_ctx );
+			abort("Thread %p (%s) has mismatch park/unpark\n", thrd, thrd->self_cor.name);
+	}
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+
+	if(full) enable_interrupts( __cfaabi_dbg_ctx );
+	/* paranoid */ verify( kernelTLS.this_proc_id );
 }
 
@@ -448,9 +455,28 @@
 }
 
-// KERNEL ONLY
-void __leave_thread() {
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-	returnToKernel();
-	abort();
+extern "C" {
+	// Leave the thread monitor
+	// last routine called by a thread.
+	// Should never return
+	void __cfactx_thrd_leave() {
+		$thread * thrd = TL_GET( this_thread );
+		$monitor * this = &thrd->self_mon;
+
+		// Lock the monitor now
+		lock( this->lock __cfaabi_dbg_ctx2 );
+
+		disable_interrupts();
+
+		thrd->state = Halted;
+		if( TICKET_RUNNING != thrd->ticket ) { abort( "Thread terminated with pending unpark" ); }
+		if( thrd != this->owner || this->recursion != 1) { abort( "Thread internal monitor has unbalanced recursion" ); }
+
+		// Leave the thread
+		/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+		returnToKernel();
+		abort();
+
+		// Control flow should never reach here!
+	}
 }
 
@@ -486,7 +512,7 @@
 //=============================================================================================
 // Wake a thread from the front if there are any
-static void __wake_one(struct __processor_id_t * id, cluster * this) {
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-	/* paranoid */ verify( ready_schedule_islocked( id ) );
+static void __wake_one(cluster * this) {
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( ready_schedule_islocked() );
 
 	// Check if there is a sleeping processor
@@ -506,5 +532,5 @@
 	#endif
 
-	/* paranoid */ verify( ready_schedule_islocked( id ) );
+	/* paranoid */ verify( ready_schedule_islocked() );
 	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
 
@@ -709,4 +735,8 @@
 		this.print_halts = true;
 	}
+
+	void print_stats_now( cluster & this, int flags ) {
+		__print_stats( this.stats, this.print_stats, true, this.name, (void*)&this );
+	}
 #endif
 // Local Variables: //
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ libcfa/src/concurrency/kernel.hfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -79,8 +79,4 @@
 	// Handle to pthreads
 	pthread_t kernel_thread;
-
-	// RunThread data
-	// Action to do after a thread is ran
-	$thread * destroyer;
 
 	// Preemption data
Index: libcfa/src/concurrency/kernel/fwd.hfa
===================================================================
--- libcfa/src/concurrency/kernel/fwd.hfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ libcfa/src/concurrency/kernel/fwd.hfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -35,7 +35,8 @@
 	extern "Cforall" {
 		extern __attribute__((aligned(128))) thread_local struct KernelThreadData {
-			struct $thread    * volatile this_thread;
-			struct processor  * volatile this_processor;
-			struct __stats_t  * volatile this_stats;
+			struct $thread          * volatile this_thread;
+			struct processor        * volatile this_processor;
+			struct __processor_id_t * volatile this_proc_id;
+			struct __stats_t        * volatile this_stats;
 
 			struct {
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -122,4 +122,5 @@
 	NULL,
 	NULL,
+	NULL,
 	{ 1, false, false },
 };
@@ -212,4 +213,5 @@
 	//initialize the global state variables
 	kernelTLS.this_processor = mainProcessor;
+	kernelTLS.this_proc_id   = (__processor_id_t*)mainProcessor;
 	kernelTLS.this_thread    = mainThread;
 
@@ -227,5 +229,5 @@
 	// Add the main thread to the ready queue
 	// once resume is called on mainProcessor->runner the mainThread needs to be scheduled like any normal thread
-	__schedule_thread((__processor_id_t *)mainProcessor, mainThread);
+	__schedule_thread(mainThread);
 
 	// SKULLDUGGERY: Force a context switch to the main processor to set the main thread's context to the current UNIX
@@ -324,4 +326,5 @@
 	processor * proc = (processor *) arg;
 	kernelTLS.this_processor = proc;
+	kernelTLS.this_proc_id   = (__processor_id_t*)proc;
 	kernelTLS.this_thread    = 0p;
 	kernelTLS.preemption_state.[enabled, disable_count] = [false, 1];
@@ -441,5 +444,5 @@
 
 static void ?{}( $thread & this, current_stack_info_t * info) with( this ) {
-	ticket = 1;
+	ticket = TICKET_RUNNING;
 	state = Start;
 	self_cor{ info };
@@ -474,5 +477,4 @@
 	this.cltr = &_cltr;
 	full_proc = true;
-	destroyer = 0p;
 	do_terminate = false;
 	preemption_alarm = 0p;
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -33,12 +33,12 @@
 }
 
-void __schedule_thread( struct __processor_id_t *, $thread * )
+void __schedule_thread( $thread * )
 #if defined(NDEBUG) || (!defined(__CFA_DEBUG__) && !defined(__CFA_VERIFY__))
-	__attribute__((nonnull (2)))
+	__attribute__((nonnull (1)))
 #endif
 ;
 
-//Block current thread and release/wake-up the following resources
-void __leave_thread() __attribute__((noreturn));
+// Finish a halted thread: release its monitor and wake up the next owner, if any
+void __thread_finish( $thread * thrd );
 
 //-----------------------------------------------------------------------------
@@ -63,24 +63,7 @@
 )
 
-// KERNEL ONLY unpark with out disabling interrupts
-void __unpark( struct __processor_id_t *, $thread * thrd );
-
-static inline bool __post(single_sem & this, struct __processor_id_t * id) {
-	for() {
-		struct $thread * expected = this.ptr;
-		if(expected == 1p) return false;
-		if(expected == 0p) {
-			if(__atomic_compare_exchange_n(&this.ptr, &expected, 1p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-				return false;
-			}
-		}
-		else {
-			if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-				__unpark( id, expected );
-				return true;
-			}
-		}
-	}
-}
+#define TICKET_BLOCKED (-1) // thread is blocked
+#define TICKET_RUNNING ( 0) // thread is running
+#define TICKET_UNBLOCK ( 1) // thread should ignore next block
 
 //-----------------------------------------------------------------------------
@@ -197,7 +180,9 @@
 // Reader side : acquire when using the ready queue to schedule but not
 //  creating/destroying queues
-static inline void ready_schedule_lock( struct __processor_id_t * proc) with(*__scheduler_lock) {
-	unsigned iproc = proc->id;
-	/*paranoid*/ verify(data[iproc].handle == proc);
+static inline void ready_schedule_lock(void) with(*__scheduler_lock) {
+	/*paranoid*/ verify( kernelTLS.this_proc_id );
+
+	unsigned iproc = kernelTLS.this_proc_id->id;
+	/*paranoid*/ verify(data[iproc].handle == kernelTLS.this_proc_id);
 	/*paranoid*/ verify(iproc < ready);
 
@@ -221,7 +206,9 @@
 }
 
-static inline void ready_schedule_unlock( struct __processor_id_t * proc) with(*__scheduler_lock) {
-	unsigned iproc = proc->id;
-	/*paranoid*/ verify(data[iproc].handle == proc);
+static inline void ready_schedule_unlock(void) with(*__scheduler_lock) {
+	/*paranoid*/ verify( kernelTLS.this_proc_id );
+
+	unsigned iproc = kernelTLS.this_proc_id->id;
+	/*paranoid*/ verify(data[iproc].handle == kernelTLS.this_proc_id);
 	/*paranoid*/ verify(iproc < ready);
 	/*paranoid*/ verify(data[iproc].lock);
@@ -235,5 +222,7 @@
 
 #ifdef __CFA_WITH_VERIFY__
-	static inline bool ready_schedule_islocked( struct __processor_id_t * proc) {
+	static inline bool ready_schedule_islocked(void) {
+		/*paranoid*/ verify( kernelTLS.this_proc_id );
+		__processor_id_t * proc = kernelTLS.this_proc_id;
 		return __scheduler_lock->data[proc->id].owned;
 	}
Index: libcfa/src/concurrency/monitor.cfa
===================================================================
--- libcfa/src/concurrency/monitor.cfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ libcfa/src/concurrency/monitor.cfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -281,58 +281,29 @@
 }
 
-extern "C" {
-	// Leave the thread monitor
-	// last routine called by a thread.
-	// Should never return
-	void __cfactx_thrd_leave() {
-		$thread * thrd = TL_GET( this_thread );
-		$monitor * this = &thrd->self_mon;
-
-		// Lock the monitor now
-		lock( this->lock __cfaabi_dbg_ctx2 );
-
-		disable_interrupts();
-
-		thrd->state = Halted;
-
-		/* paranoid */ verifyf( thrd == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", thrd, this->owner, this->recursion, this );
-
-		// Leaving a recursion level, decrement the counter
-		this->recursion -= 1;
-
-		// If we haven't left the last level of recursion
-		// it must mean there is an error
-		if( this->recursion != 0) { abort( "Thread internal monitor has unbalanced recursion" ); }
-
-		// Fetch the next thread, can be null
-		$thread * new_owner = next_thread( this );
-
-		// Release the monitor lock
-		unlock( this->lock );
-
-		// Unpark the next owner if needed
-		/* paranoid */ verifyf( !new_owner || new_owner == this->owner, "Expected owner to be %p, got %p (m: %p)", new_owner, this->owner, this );
-		/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-		/* paranoid */ verify( ! kernelTLS.this_processor->destroyer );
-		/* paranoid */ verify( thrd->state == Halted );
-
-		kernelTLS.this_processor->destroyer = new_owner;
-
-		// Leave the thread
-		__leave_thread();
-
-		// Control flow should never reach here!
-	}
-}
-
-// Join a thread
-forall( dtype T | is_thread(T) )
-T & join( T & this ) {
-	$monitor *    m = get_monitor(this);
-	void (*dtor)(T& mutex this) = ^?{};
-	monitor_dtor_guard_t __guard = { &m, (fptr_t)dtor, true };
-	{
-		return this;
-	}
+void __thread_finish( $thread * thrd ) {
+	$monitor * this = &thrd->self_mon;
+
+	// The monitor lock must already be held on entry
+	/* paranoid */ verify( this->lock.lock );
+	/* paranoid */ verifyf( thrd == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", thrd, this->owner, this->recursion, this );
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( thrd->state == Halted );
+	/* paranoid */ verify( this->recursion == 1 );
+
+	// Leaving a recursion level, decrement the counter
+	this->recursion -= 1;
+	this->owner = 0p;
+
+	// Fetch the next thread, can be null
+	$thread * new_owner = next_thread( this );
+
+	// Release the monitor lock
+	unlock( this->lock );
+
+	// Unpark the next owner if needed
+	/* paranoid */ verifyf( !new_owner || new_owner == this->owner, "Expected owner to be %p, got %p (m: %p)", new_owner, this->owner, this );
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( thrd->state == Halted );
+	unpark( new_owner );
 }
 
Index: libcfa/src/concurrency/preemption.cfa
===================================================================
--- libcfa/src/concurrency/preemption.cfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ libcfa/src/concurrency/preemption.cfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -38,5 +38,5 @@
 // FwdDeclarations : timeout handlers
 static void preempt( processor   * this );
-static void timeout( struct __processor_id_t * id, $thread * this );
+static void timeout( $thread * this );
 
 // FwdDeclarations : Signal handlers
@@ -91,5 +91,5 @@
 
 // Tick one frame of the Discrete Event Simulation for alarms
-static void tick_preemption( struct __processor_id_t * id ) {
+static void tick_preemption(void) {
 	alarm_node_t * node = 0p;							// Used in the while loop but cannot be declared in the while condition
 	alarm_list_t * alarms = &event_kernel->alarms;		// Local copy for ease of reading
@@ -109,9 +109,9 @@
 		}
 		else if( node->type == User ) {
-			timeout( id, node->thrd );
+			timeout( node->thrd );
 		}
 		else {
 			bool unpark_thd = node->callback(*node);
-			if (unpark_thd) timeout( id, node->thrd );
+			if (unpark_thd) timeout( node->thrd );
 		}
 
@@ -274,9 +274,9 @@
 
 // reserved for future use
-static void timeout( struct __processor_id_t * id, $thread * this ) {
+static void timeout( $thread * this ) {
 	#if !defined( __CFA_NO_STATISTICS__ )
 		kernelTLS.this_stats = this->curr_cluster->stats;
 	#endif
-	__unpark( id, this );
+	unpark( this );
 }
 
@@ -417,4 +417,5 @@
 	id.full_proc = false;
 	id.id = doregister(&id);
+	kernelTLS.this_proc_id = &id;
 
 	// Block sigalrms to control when they arrive
@@ -462,5 +463,5 @@
 			// __cfaabi_dbg_print_safe( "Kernel : Preemption thread tick\n" );
 			lock( event_kernel->lock __cfaabi_dbg_ctx2 );
-			tick_preemption( &id );
+			tick_preemption();
 			unlock( event_kernel->lock );
 			break;
Index: libcfa/src/concurrency/snzi.hfa
===================================================================
--- libcfa/src/concurrency/snzi.hfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ libcfa/src/concurrency/snzi.hfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -36,5 +36,5 @@
 static inline void depart( __snzi_node_t & );
 
-#define __snzi_half -1
+static const int __snzi_half = -1;
 
 //--------------------------------------------------
Index: libcfa/src/concurrency/thread.cfa
===================================================================
--- libcfa/src/concurrency/thread.cfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ libcfa/src/concurrency/thread.cfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -19,4 +19,5 @@
 
 #include "kernel_private.hfa"
+#include "exception.hfa"
 
 #define __CFA_INVOKE_PRIVATE__
@@ -28,5 +29,5 @@
 	context{ 0p, 0p };
 	self_cor{ name, storage, storageSize };
-	ticket = 1;
+	ticket = TICKET_RUNNING;
 	state = Start;
 	preempted = __NO_PREEMPTION;
@@ -58,4 +59,60 @@
 }
 
+FORALL_DATA_INSTANCE(ThreadCancelled, (dtype thread_t), (thread_t))
+
+forall(dtype T)
+void copy(ThreadCancelled(T) * dst, ThreadCancelled(T) * src) {
+	dst->virtual_table = src->virtual_table;
+	dst->the_thread = src->the_thread;
+	dst->the_exception = src->the_exception;
+}
+
+forall(dtype T)
+const char * msg(ThreadCancelled(T) *) {
+	return "ThreadCancelled";
+}
+
+forall(dtype T)
+static void default_thread_cancel_handler(ThreadCancelled(T) & ) {
+	abort( "Unhandled thread cancellation.\n" );
+}
+
+forall(dtype T | is_thread(T) | IS_EXCEPTION(ThreadCancelled, (T)))
+void ?{}( thread_dtor_guard_t & this,
+		T & thrd, void(*defaultResumptionHandler)(ThreadCancelled(T) &)) {
+	$monitor * m = get_monitor(thrd);
+	void (*dtor)(T& mutex this) = ^?{};
+	bool join = defaultResumptionHandler != (void(*)(ThreadCancelled(T)&))0;
+	(this.mg){&m, (void(*)())dtor, join};
+
+	// After the guard set-up and any wait, check for cancellation.
+	$thread * desc = get_thread(thrd);
+	struct _Unwind_Exception * cancellation = desc->self_cor.cancellation;
+	if ( likely( 0p == cancellation ) ) {
+		return;
+	} else if ( Cancelled == desc->state ) {
+		return;
+	}
+	desc->state = Cancelled;
+	if (!join) {
+		defaultResumptionHandler = default_thread_cancel_handler;
+	}
+
+	ThreadCancelled(T) except;
+	// TODO: Remove explicit vtable set once trac#186 is fixed.
+	except.virtual_table = &get_exception_vtable(&except);
+	except.the_thread = &thrd;
+	except.the_exception = __cfaehm_cancellation_exception( cancellation );
+	throwResume except;
+
+	except.the_exception->virtual_table->free( except.the_exception );
+	free( cancellation );
+	desc->self_cor.cancellation = 0p;
+}
+
+void ^?{}( thread_dtor_guard_t & this ) {
+	^(this.mg){};
+}
+
 //-----------------------------------------------------------------------------
 // Starting and stopping threads
@@ -70,5 +127,5 @@
 	verify( this_thrd->context.SP );
 
-	__schedule_thread( (__processor_id_t *)kernelTLS.this_processor, this_thrd);
+	__schedule_thread( this_thrd );
 	enable_interrupts( __cfaabi_dbg_ctx );
 }
@@ -93,4 +150,11 @@
 }
 
+//-----------------------------------------------------------------------------
+forall(dtype T | is_thread(T) | IS_RESUMPTION_EXCEPTION(ThreadCancelled, (T)))
+T & join( T & this ) {
+	thread_dtor_guard_t guard = { this, defaultResumptionHandler };
+	return this;
+}
+
 // Local Variables: //
 // mode: c //
Index: libcfa/src/concurrency/thread.hfa
===================================================================
--- libcfa/src/concurrency/thread.hfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ libcfa/src/concurrency/thread.hfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -22,12 +22,24 @@
 #include "kernel.hfa"
 #include "monitor.hfa"
+#include "exception.hfa"
 
 //-----------------------------------------------------------------------------
 // thread trait
 trait is_thread(dtype T) {
-      void ^?{}(T& mutex this);
-      void main(T& this);
-      $thread* get_thread(T& this);
+	void ^?{}(T& mutex this);
+	void main(T& this);
+	$thread* get_thread(T& this);
 };
+
+FORALL_DATA_EXCEPTION(ThreadCancelled, (dtype thread_t), (thread_t)) (
+	thread_t * the_thread;
+	exception_t * the_exception;
+);
+
+forall(dtype T)
+void copy(ThreadCancelled(T) * dst, ThreadCancelled(T) * src);
+
+forall(dtype T)
+const char * msg(ThreadCancelled(T) *);
 
 // define that satisfies the trait without using the thread keyword
@@ -65,4 +77,12 @@
 static inline void ?{}($thread & this, const char * const name, struct cluster & cl )                   { this{ name, cl, 0p, 65000 }; }
 static inline void ?{}($thread & this, const char * const name, struct cluster & cl, size_t stackSize ) { this{ name, cl, 0p, stackSize }; }
+
+struct thread_dtor_guard_t {
+	monitor_dtor_guard_t mg;
+};
+
+forall( dtype T | is_thread(T) | IS_EXCEPTION(ThreadCancelled, (T)) )
+void ?{}( thread_dtor_guard_t & this, T & thrd, void(*)(ThreadCancelled(T) &) );
+void ^?{}( thread_dtor_guard_t & this );
 
 //-----------------------------------------------------------------------------
@@ -108,5 +128,5 @@
 //----------
 // join
-forall( dtype T | is_thread(T) )
+forall( dtype T | is_thread(T) | IS_RESUMPTION_EXCEPTION(ThreadCancelled, (T)) )
 T & join( T & this );
 
Index: libcfa/src/exception.c
===================================================================
--- libcfa/src/exception.c	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ libcfa/src/exception.c	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -9,7 +9,7 @@
 // Author           : Andrew Beach
 // Created On       : Mon Jun 26 15:13:00 2017
-// Last Modified By : Peter A. Buhr
-// Last Modified On : Sat Aug 29 15:52:22 2020
-// Update Count     : 34
+// Last Modified By : Andrew Beach
+// Last Modified On : Tue Oct 27 16:27:00 2020
+// Update Count     : 35
 //
 
@@ -17,9 +17,10 @@
 #include <stddef.h> // for size_t
 
+#include <unwind.h> // for struct _Unwind_Exception {...};
+
 #include "exception.h"
 
 #include <stdlib.h>
 #include <stdio.h>
-#include <unwind.h>
 #include <bits/debug.hfa>
 #include "concurrency/invoke.h"
@@ -113,10 +114,4 @@
 
 // MEMORY MANAGEMENT =========================================================
-
-struct __cfaehm_node {
-	struct _Unwind_Exception unwind_exception;
-	struct __cfaehm_node * next;
-	int handler_index;
-};
 
 #define NODE_TO_EXCEPT(node) ((exception_t *)(1 + (node)))
Index: libcfa/src/exception.h
===================================================================
--- libcfa/src/exception.h	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ libcfa/src/exception.h	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -5,19 +5,25 @@
 // file "LICENCE" distributed with Cforall.
 //
-// exception.h -- Builtins for exception handling.
+// exception.h -- Internal exception handling definitions.
 //
 // Author           : Andrew Beach
 // Created On       : Mon Jun 26 15:11:00 2017
 // Last Modified By : Andrew Beach
-// Last Modified On : Tue May 19 14:17:00 2020
-// Update Count     : 10
+// Last Modified On : Tue Oct 27 14:45:00 2020
+// Update Count     : 11
 //
 
 #pragma once
 
+// This could be considered several headers. All are internal to the exception
+// system, but which parts are needed depends on whether the including code is
+// C or Cforall and whether or not it is part of the builtins.
 
 #ifdef __cforall
 extern "C" {
 #endif
+
+// Included in C code or the built-ins.
+#if !defined(__cforall) || defined(__cforall_builtins__)
 
 struct __cfaehm_base_exception_t;
@@ -47,7 +53,7 @@
 // Function catches termination exceptions.
 void __cfaehm_try_terminate(
-    void (*try_block)(),
-    void (*catch_block)(int index, exception_t * except),
-    int (*match_block)(exception_t * except));
+	void (*try_block)(),
+	void (*catch_block)(int index, exception_t * except),
+	int (*match_block)(exception_t * except));
 
 // Clean-up the exception in catch blocks.
@@ -56,20 +62,39 @@
 // Data structure creates a list of resume handlers.
 struct __cfaehm_try_resume_node {
-    struct __cfaehm_try_resume_node * next;
-    _Bool (*handler)(exception_t * except);
+	struct __cfaehm_try_resume_node * next;
+	_Bool (*handler)(exception_t * except);
 };
 
 // These act as constructor and destructor for the resume node.
 void __cfaehm_try_resume_setup(
-    struct __cfaehm_try_resume_node * node,
-    _Bool (*handler)(exception_t * except));
+	struct __cfaehm_try_resume_node * node,
+	_Bool (*handler)(exception_t * except));
 void __cfaehm_try_resume_cleanup(
-    struct __cfaehm_try_resume_node * node);
+	struct __cfaehm_try_resume_node * node);
 
 // Check for a standard way to call fake deconstructors.
 struct __cfaehm_cleanup_hook {};
 
+#endif
+
+// Included in C code and the library.
+#if !defined(__cforall) || !defined(__cforall_builtins__)
+struct __cfaehm_node {
+	struct _Unwind_Exception unwind_exception;
+	struct __cfaehm_node * next;
+	int handler_index;
+};
+
+static inline exception_t * __cfaehm_cancellation_exception(
+		struct _Unwind_Exception * unwind_exception ) {
+	return (exception_t *)(1 + (struct __cfaehm_node *)unwind_exception);
+}
+#endif
+
 #ifdef __cforall
 }
+
+// Built-ins not visible in C.
+#if defined(__cforall_builtins__)
 
 // Not all the built-ins can be expressed in C. These can't be
@@ -124,2 +149,4 @@
 
 #endif
+
+#endif
Index: libcfa/src/stdhdr/unwind.h
===================================================================
--- libcfa/src/stdhdr/unwind.h	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
+++ libcfa/src/stdhdr/unwind.h	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -0,0 +1,31 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// unwind.h -- Safely include unwind.h from Cforall.
+//
+// Author           : Andrew Beach
+// Created On       : Wed Oct 28 11:25:00 2020
+// Last Modified By : Andrew Beach
+// Last Modified On : Wed Oct 28 14:11:00 2020
+// Update Count     : 0
+//
+
+#pragma once
+
+extern "C" {
+// This prevents some GCC pragmas that CFA can't handle from being generated.
+#define HIDE_EXPORTS
+
+// Always include the header and use its header guard.
+#include_next <unwind.h>
+
+#undef HIDE_EXPORTS
+}
+
+// Local Variables: //
+// mode: c //
+// tab-width: 4 //
+// End: //
Index: libcfa/src/stdlib.cfa
===================================================================
--- libcfa/src/stdlib.cfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ libcfa/src/stdlib.cfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -58,6 +58,7 @@
 
 forall( dtype T | sized(T) | { void ^?{}( T & ); } )
-void adelete( size_t dim, T arr[] ) {
+void adelete( T arr[] ) {
 	if ( arr ) {										// ignore null
+		size_t dim = malloc_size( arr ) / sizeof( T );
 		for ( int i = dim - 1; i >= 0; i -= 1 ) {		// reverse allocation order, must be unsigned
 			^(arr[i]){};								// run destructor
@@ -68,6 +69,7 @@
 
 forall( dtype T | sized(T) | { void ^?{}( T & ); }, ttype Params | { void adelete( Params ); } )
-void adelete( size_t dim, T arr[], Params rest ) {
+void adelete( T arr[], Params rest ) {
 	if ( arr ) {										// ignore null
+		size_t dim = malloc_size( arr ) / sizeof( T );
 		for ( int i = dim - 1; i >= 0; i -= 1 ) {		// reverse allocation order, must be unsigned
 			^(arr[i]){};								// run destructor
Index: libcfa/src/stdlib.hfa
===================================================================
--- libcfa/src/stdlib.hfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ libcfa/src/stdlib.hfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -263,6 +263,6 @@
 // Cforall allocation/deallocation and constructor/destructor, array types
 forall( dtype T | sized(T), ttype Params | { void ?{}( T &, Params ); } ) T * anew( size_t dim, Params p );
-forall( dtype T | sized(T) | { void ^?{}( T & ); } ) void adelete( size_t dim, T arr[] );
-forall( dtype T | sized(T) | { void ^?{}( T & ); }, ttype Params | { void adelete( Params ); } ) void adelete( size_t dim, T arr[], Params rest );
+forall( dtype T | sized(T) | { void ^?{}( T & ); } ) void adelete( T arr[] );
+forall( dtype T | sized(T) | { void ^?{}( T & ); }, ttype Params | { void adelete( Params ); } ) void adelete( T arr[], Params rest );
 
 //---------------------------------------
Index: src/AST/Convert.cpp
===================================================================
--- src/AST/Convert.cpp	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/AST/Convert.cpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -25,4 +25,5 @@
 #include "AST/Init.hpp"
 #include "AST/Stmt.hpp"
+#include "AST/TranslationUnit.hpp"
 #include "AST/TypeSubstitution.hpp"
 
@@ -47,8 +48,11 @@
 
 //================================================================================================
-namespace {
+namespace ast {
 
 // This is to preserve the FindSpecialDecls hack. It does not (and perhaps should not)
 // allow us to use the same stratagy in the new ast.
+// xxx - since convert back pass works, this concern seems to be unnecessary.
+
+// these need to be accessed in new FixInit now
 ast::Type * sizeType = nullptr;
 ast::FunctionDecl * dereferenceOperator = nullptr;
@@ -63,4 +67,9 @@
 	using Cache = std::unordered_map< const ast::Node *, BaseSyntaxNode * >;
 	Cache cache;
+
+	// Statements can no longer be shared.
+	// However, since StmtExprResult is now implemented, we still need to
+	// maintain read-only references to them.
+	Cache readonlyCache;
 
 	template<typename T>
@@ -154,12 +163,12 @@
 	}
 
-	const ast::DeclWithType * visit( const ast::ObjectDecl * node ) override final {
-		auto&& bfwd = get<Expression>().accept1( node->bitfieldWidth );
-		auto&& type = get<Type>().accept1( node->type );
-		auto&& init = get<Initializer>().accept1( node->init );
-		auto&& attr = get<Attribute>().acceptL( node->attributes );
+	const ast::DeclWithType * visit( const ast::ObjectDecl * node ) override final {	
 		if ( inCache( node ) ) {
 			return nullptr;
 		}
+		auto bfwd = get<Expression>().accept1( node->bitfieldWidth );
+		auto type = get<Type>().accept1( node->type );
+		auto attr = get<Attribute>().acceptL( node->attributes );
+
 		auto decl = new ObjectDecl(
 			node->name,
@@ -168,9 +177,17 @@
 			bfwd,
 			type->clone(),
-			init,
+			nullptr, // prevent infinite loop
 			attr,
 			Type::FuncSpecifiers( node->funcSpec.val )
 		);
-		return declWithTypePostamble( decl, node );
+
+		// handles the case where node->init references itself
+		// xxx - does it really happen?
+		declWithTypePostamble(decl, node);
+		auto init = get<Initializer>().accept1( node->init );
+		decl->init = init;
+		
+		this->node = decl;
+		return nullptr;
 	}
 
@@ -205,8 +222,8 @@
 		decl->statements = get<CompoundStmt>().accept1( node->stmts );
 		decl->withExprs = get<Expression>().acceptL( node->withExprs );
-		if ( dereferenceOperator == node ) {
+		if ( ast::dereferenceOperator == node ) {
 			Validate::dereferenceOperator = decl;
 		}
-		if ( dtorStructDestroy == node ) {
+		if ( ast::dtorStructDestroy == node ) {
 			Validate::dtorStructDestroy = decl;
 		}
@@ -267,5 +284,5 @@
 		);
 
-		if ( dtorStruct == node ) {
+		if ( ast::dtorStruct == node ) {
 			Validate::dtorStruct = decl;
 		}
@@ -320,5 +337,7 @@
 
 	const ast::Stmt * stmtPostamble( Statement * stmt, const ast::Stmt * node ) {
-		cache.emplace( node, stmt );
+		// force statements in old tree to be unique.
+		// cache.emplace( node, stmt );
+		readonlyCache.emplace( node, stmt );
 		stmt->location = node->location;
 		stmt->labels = makeLabelL( stmt, node->labels );
@@ -337,5 +356,4 @@
 		if ( inCache( node ) ) return nullptr;
 		auto stmt = new ExprStmt( nullptr );
-		cache.emplace( node, stmt );
 		stmt->expr = get<Expression>().accept1( node->expr );
 		return stmtPostamble( stmt, node );
@@ -1011,7 +1029,6 @@
 		auto stmts = node->stmts;
 		// disable sharing between multiple StmtExprs explicitly.
-		if (inCache(stmts)) {
-			stmts = ast::deepCopy(stmts.get());
-		}
+		// this should no longer be true.
+
 		auto rslt = new StmtExpr(
 			get<CompoundStmt>().accept1(stmts)
@@ -1020,4 +1037,8 @@
 		rslt->returnDecls = get<ObjectDecl>().acceptL(node->returnDecls);
 		rslt->dtors       = get<Expression>().acceptL(node->dtors);
+		if (node->resultExpr) {
+			// this MUST be found by children visit
+			rslt->resultExpr  = strict_dynamic_cast<ExprStmt *>(readonlyCache.at(node->resultExpr));
+		}
 
 		auto expr = visitBaseExpr( node, rslt );
@@ -1036,5 +1057,5 @@
 
 		auto expr = visitBaseExpr( node, rslt );
-		this->node = expr;
+		this->node = expr->clone();
 		return nullptr;
 	}
@@ -1126,5 +1147,5 @@
 		auto type = new BasicType{ cv( node ), (BasicType::Kind)(unsigned)node->kind };
 		// I believe this should always be a BasicType.
-		if ( sizeType == node ) {
+		if ( ast::sizeType == node ) {
 			Validate::SizeType = type;
 		}
@@ -1384,8 +1405,8 @@
 };
 
-std::list< Declaration * > convert( const std::list< ast::ptr< ast::Decl > > && translationUnit ) {
+std::list< Declaration * > convert( const ast::TranslationUnit && translationUnit ) {
 	ConverterNewToOld c;
 	std::list< Declaration * > decls;
-	for(auto d : translationUnit) {
+	for(auto d : translationUnit.decls) {
 		decls.emplace_back( c.decl( d ) );
 	}
@@ -1529,4 +1550,6 @@
 
 		// function type is now derived from parameter decls instead of storing them
+
+		/*
 		auto ftype = new ast::FunctionType((ast::ArgumentFlag)old->type->isVarArgs, cv(old->type));
 		ftype->params.reserve(paramVars.size());
@@ -1540,5 +1563,8 @@
 		}
 		ftype->forall = std::move(forall);
-		visitType(old->type, ftype);
+		*/
+
+		// can function type have attributes? seems not to be the case.
+		// visitType(old->type, ftype);
 
 		auto decl = new ast::FunctionDecl{
@@ -1546,4 +1572,5 @@
 			old->name,
 			// GET_ACCEPT_1(type, FunctionType),
+			std::move(forall),
 			std::move(paramVars),
 			std::move(returnVars),
@@ -1552,8 +1579,9 @@
 			{ old->linkage.val },
 			GET_ACCEPT_V(attributes, Attribute),
-			{ old->get_funcSpec().val }
+			{ old->get_funcSpec().val },
+			old->type->isVarArgs
 		};
 
-		decl->type = ftype;
+		// decl->type = ftype;
 		cache.emplace( old, decl );
 
@@ -1570,9 +1598,9 @@
 
 		if ( Validate::dereferenceOperator == old ) {
-			dereferenceOperator = decl;
+			ast::dereferenceOperator = decl;
 		}
 
 		if ( Validate::dtorStructDestroy == old ) {
-			dtorStructDestroy = decl;
+			ast::dtorStructDestroy = decl;
 		}
 	}
@@ -1599,5 +1627,5 @@
 
 		if ( Validate::dtorStruct == old ) {
-			dtorStruct = decl;
+			ast::dtorStruct = decl;
 		}
 	}
@@ -2531,5 +2559,5 @@
 		// I believe this should always be a BasicType.
 		if ( Validate::SizeType == old ) {
-			sizeType = type;
+			ast::sizeType = type;
 		}
 		visitType( old, type );
@@ -2776,12 +2804,12 @@
 #undef GET_ACCEPT_1
 
-std::list< ast::ptr< ast::Decl > > convert( const std::list< Declaration * > && translationUnit ) {
+ast::TranslationUnit convert( const std::list< Declaration * > && translationUnit ) {
 	ConverterOldToNew c;
-	std::list< ast::ptr< ast::Decl > > decls;
+	ast::TranslationUnit unit;
 	for(auto d : translationUnit) {
 		d->accept( c );
-		decls.emplace_back( c.decl() );
+		unit.decls.emplace_back( c.decl() );
 	}
 	deleteAll(translationUnit);
-	return decls;
+	return unit;
 }
Index: src/AST/Convert.hpp
===================================================================
--- src/AST/Convert.hpp	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/AST/Convert.hpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -18,11 +18,9 @@
 #include <list>
 
-#include "AST/Node.hpp"
-
 class Declaration;
 namespace ast {
-	class Decl;
+	class TranslationUnit;
 };
 
-std::list< Declaration * > convert( const std::list< ast::ptr< ast::Decl > > && translationUnit );
-std::list< ast::ptr< ast::Decl > > convert( const std::list< Declaration * > && translationUnit );
+std::list< Declaration * > convert( const ast::TranslationUnit && translationUnit );
+ast::TranslationUnit convert( const std::list< Declaration * > && translationUnit );
Index: src/AST/Decl.cpp
===================================================================
--- src/AST/Decl.cpp	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/AST/Decl.cpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -48,4 +48,23 @@
 
 // --- FunctionDecl
+
+FunctionDecl::FunctionDecl( const CodeLocation & loc, const std::string & name,
+		std::vector<ptr<TypeDecl>>&& forall,
+		std::vector<ptr<DeclWithType>>&& params, std::vector<ptr<DeclWithType>>&& returns,
+		CompoundStmt * stmts, Storage::Classes storage, Linkage::Spec linkage,
+		std::vector<ptr<Attribute>>&& attrs, Function::Specs fs, bool isVarArgs)
+: DeclWithType( loc, name, storage, linkage, std::move(attrs), fs ),
+  params( std::move(params) ), returns( std::move(returns) ), stmts( stmts ) {
+	FunctionType * funcType = new FunctionType( static_cast<ArgumentFlag>(isVarArgs) ); // type is derived from the decls below
+	for ( auto & paramDecl : this->params ) {
+		funcType->params.emplace_back( paramDecl->get_type() );
+	}
+	for ( auto & returnDecl : this->returns ) {
+		funcType->returns.emplace_back( returnDecl->get_type() );
+	}
+	funcType->forall = std::move( forall );
+	this->type = funcType;
+}
+
 
 const Type * FunctionDecl::get_type() const { return type.get(); }
Index: src/AST/Decl.hpp
===================================================================
--- src/AST/Decl.hpp	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/AST/Decl.hpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -131,10 +131,10 @@
 	std::vector< ptr<Expr> > withExprs;
 
-	FunctionDecl( const CodeLocation & loc, const std::string & name, 
+	FunctionDecl( const CodeLocation & loc, const std::string & name, std::vector<ptr<TypeDecl>>&& forall,
 		std::vector<ptr<DeclWithType>>&& params, std::vector<ptr<DeclWithType>>&& returns,
 		CompoundStmt * stmts, Storage::Classes storage = {}, Linkage::Spec linkage = Linkage::C,
-		std::vector<ptr<Attribute>>&& attrs = {}, Function::Specs fs = {})
-	: DeclWithType( loc, name, storage, linkage, std::move(attrs), fs ), params(std::move(params)), returns(std::move(returns)),
-	  stmts( stmts ) {}
+		std::vector<ptr<Attribute>>&& attrs = {}, Function::Specs fs = {}, bool isVarArgs = false);
+	// : DeclWithType( loc, name, storage, linkage, std::move(attrs), fs ), params(std::move(params)), returns(std::move(returns)),
+	//  stmts( stmts ) {}
 
 	const Type * get_type() const override;
Index: src/AST/DeclReplacer.cpp
===================================================================
--- src/AST/DeclReplacer.cpp	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/AST/DeclReplacer.cpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -38,4 +38,14 @@
 			const ast::TypeInstType * previsit( const ast::TypeInstType * );
 		};
+
+		/// Core for ast::Pass that swaps each VariableExpr whose variable is a key of exprMap for the mapped expression.
+		struct VarExprReplacer {
+		private:
+			const ExprMap & exprMap;
+		public:
+			VarExprReplacer(const ExprMap & exprMap): exprMap (exprMap) {}
+
+			const Expr * postvisit (const VariableExpr *);
+		};
 	}
 
@@ -54,4 +64,9 @@
 		DeclMap declMap;
 		return replace( node, declMap, typeMap, debug );
+	}
+
+	const ast::Node * replace( const ast::Node * node, const ExprMap & exprMap) {
+		Pass<VarExprReplacer> replacer = {exprMap}; // exprMap: variable declaration -> replacement expression
+		return node->accept( replacer );
 	}
 
@@ -88,4 +103,11 @@
 			return ninst;
 		}
+
+		const Expr * VarExprReplacer::postvisit( const VariableExpr * expr ) {
+			// One lookup instead of count() + at(); a variable without a replacement is returned unchanged.
+			auto it = exprMap.find( expr->var );
+			return it == exprMap.end() ? expr : it->second;
+		}
+
 	}
 }
Index: src/AST/DeclReplacer.hpp
===================================================================
--- src/AST/DeclReplacer.hpp	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/AST/DeclReplacer.hpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -23,12 +23,15 @@
 	class DeclWithType;
 	class TypeDecl;
+	class Expr;
 
 	namespace DeclReplacer {
 		using DeclMap = std::unordered_map< const DeclWithType *, const DeclWithType * >;
 		using TypeMap = std::unordered_map< const TypeDecl *, const TypeDecl * >;
+		using ExprMap = std::unordered_map< const DeclWithType *, const Expr * >;
 
 		const Node * replace( const Node * node, const DeclMap & declMap, bool debug = false );
 		const Node * replace( const Node * node, const TypeMap & typeMap, bool debug = false );
 		const Node * replace( const Node * node, const DeclMap & declMap, const TypeMap & typeMap, bool debug = false );
+		const Node * replace( const Node * node, const ExprMap & exprMap);
 	}
 }
Index: src/AST/Expr.cpp
===================================================================
--- src/AST/Expr.cpp	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/AST/Expr.cpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -67,5 +67,5 @@
 // --- UntypedExpr
 
-UntypedExpr * UntypedExpr::createDeref( const CodeLocation & loc, Expr * arg ) {
+UntypedExpr * UntypedExpr::createDeref( const CodeLocation & loc, const Expr * arg ) {
 	assert( arg );
 
@@ -92,5 +92,5 @@
 }
 
-UntypedExpr * UntypedExpr::createAssign( const CodeLocation & loc, Expr * lhs, Expr * rhs ) {
+UntypedExpr * UntypedExpr::createAssign( const CodeLocation & loc, const Expr * lhs, const Expr * rhs ) {
 	assert( lhs && rhs );
 
@@ -102,4 +102,29 @@
 	}
 	return ret;
+}
+
+// --- VariableExpr
+
+VariableExpr::VariableExpr( const CodeLocation & loc )
+: Expr( loc ), var( nullptr ) {}
+
+VariableExpr::VariableExpr( const CodeLocation & loc, const DeclWithType * v )
+: Expr( loc ), var( v ) {
+	assert( var );
+	assert( var->get_type() );
+	result = shallowCopy( var->get_type() );
+}
+
+bool VariableExpr::get_lvalue() const {
+	// It isn't always an lvalue, but it is never an rvalue.
+	return true;
+}
+
+VariableExpr * VariableExpr::functionPointer(
+		const CodeLocation & loc, const FunctionDecl * decl ) {
+	// wrap usually-determined result type in a pointer
+	VariableExpr * funcExpr = new VariableExpr{ loc, decl };
+	funcExpr->result = new PointerType{ funcExpr->result };
+	return funcExpr;
 }
 
@@ -238,29 +263,4 @@
 }
 
-// --- VariableExpr
-
-VariableExpr::VariableExpr( const CodeLocation & loc )
-: Expr( loc ), var( nullptr ) {}
-
-VariableExpr::VariableExpr( const CodeLocation & loc, const DeclWithType * v )
-: Expr( loc ), var( v ) {
-	assert( var );
-	assert( var->get_type() );
-	result = shallowCopy( var->get_type() );
-}
-
-bool VariableExpr::get_lvalue() const {
-	// It isn't always an lvalue, but it is never an rvalue.
-	return true;
-}
-
-VariableExpr * VariableExpr::functionPointer(
-		const CodeLocation & loc, const FunctionDecl * decl ) {
-	// wrap usually-determined result type in a pointer
-	VariableExpr * funcExpr = new VariableExpr{ loc, decl };
-	funcExpr->result = new PointerType{ funcExpr->result };
-	return funcExpr;
-}
-
 // --- ConstantExpr
 
Index: src/AST/Expr.hpp
===================================================================
--- src/AST/Expr.hpp	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/AST/Expr.hpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -226,7 +226,7 @@
 
 	/// Creates a new dereference expression
-	static UntypedExpr * createDeref( const CodeLocation & loc, Expr * arg );
+	static UntypedExpr * createDeref( const CodeLocation & loc, const Expr * arg );
 	/// Creates a new assignment expression
-	static UntypedExpr * createAssign( const CodeLocation & loc, Expr * lhs, Expr * rhs );
+	static UntypedExpr * createAssign( const CodeLocation & loc, const Expr * lhs, const Expr * rhs );
 
 	const Expr * accept( Visitor & v ) const override { return v.visit( this ); }
@@ -247,4 +247,23 @@
 private:
 	NameExpr * clone() const override { return new NameExpr{ *this }; }
+	MUTATE_FRIEND
+};
+
+/// A reference to a named variable.
+class VariableExpr final : public Expr {
+public:
+	readonly<DeclWithType> var;
+
+	VariableExpr( const CodeLocation & loc );
+	VariableExpr( const CodeLocation & loc, const DeclWithType * v );
+
+	bool get_lvalue() const final;
+
+	/// generates a function pointer for a given function
+	static VariableExpr * functionPointer( const CodeLocation & loc, const FunctionDecl * decl );
+
+	const Expr * accept( Visitor & v ) const override { return v.visit( this ); }
+private:
+	VariableExpr * clone() const override { return new VariableExpr{ *this }; }
 	MUTATE_FRIEND
 };
@@ -392,23 +411,4 @@
 };
 
-/// A reference to a named variable.
-class VariableExpr final : public Expr {
-public:
-	readonly<DeclWithType> var;
-
-	VariableExpr( const CodeLocation & loc );
-	VariableExpr( const CodeLocation & loc, const DeclWithType * v );
-
-	bool get_lvalue() const final;
-
-	/// generates a function pointer for a given function
-	static VariableExpr * functionPointer( const CodeLocation & loc, const FunctionDecl * decl );
-
-	const Expr * accept( Visitor & v ) const override { return v.visit( this ); }
-private:
-	VariableExpr * clone() const override { return new VariableExpr{ *this }; }
-	MUTATE_FRIEND
-};
-
 /// A compile-time constant.
 /// Mostly carries C-source text from parse to code-gen, without interpretation.  E.g. strings keep their outer quotes and never have backslashes interpreted.
@@ -422,5 +422,5 @@
 		const CodeLocation & loc, const Type * ty, const std::string & r,
 			std::optional<unsigned long long> i )
-	: Expr( loc, ty ), rep( r ), ival( i ) {}
+	: Expr( loc, ty ), rep( r ), ival( i ), underlyer(ty) {}
 
 	/// Gets the integer value of this constant, if one is appropriate to its type.
@@ -617,5 +617,5 @@
 
 	ImplicitCopyCtorExpr( const CodeLocation& loc, const ApplicationExpr * call )
-	: Expr( loc, call->result ) { assert( call ); }
+	: Expr( loc, call->result ), callExpr(call) { assert( call ); assert(call->result); }
 
 	const Expr * accept( Visitor & v ) const override { return v.visit( this ); }
@@ -742,4 +742,6 @@
 	std::vector<ptr<Expr>> dtors;              ///< destructor(s) for return variable(s)
 
+	readonly<ExprStmt> resultExpr;
+
 	StmtExpr( const CodeLocation & loc, const CompoundStmt * ss );
 
Index: src/AST/Fwd.hpp
===================================================================
--- src/AST/Fwd.hpp	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/AST/Fwd.hpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -137,3 +137,10 @@
 typedef unsigned int UniqueId;
 
+class TranslationUnit;
+// TODO: Get from the TranslationUnit:
+extern Type * sizeType;
+extern FunctionDecl * dereferenceOperator;
+extern StructDecl   * dtorStruct;
+extern FunctionDecl * dtorStructDestroy;
+
 }
Index: src/AST/Node.hpp
===================================================================
--- src/AST/Node.hpp	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/AST/Node.hpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -49,4 +49,5 @@
 
 	bool unique() const { return strong_count == 1; }
+	bool isManaged() const {return strong_count > 0; }
 
 private:
Index: src/AST/Pass.hpp
===================================================================
--- src/AST/Pass.hpp	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/AST/Pass.hpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -103,5 +103,5 @@
 	/// Construct and run a pass on a translation unit.
 	template< typename... Args >
-	static void run( std::list< ptr<Decl> > & decls, Args &&... args ) {
+	static void run( TranslationUnit & decls, Args &&... args ) {
 		Pass<core_t> visitor( std::forward<Args>( args )... );
 		accept_all( decls, visitor );
@@ -119,5 +119,5 @@
 	// Versions of the above for older compilers.
 	template< typename... Args >
-	static void run( std::list< ptr<Decl> > & decls ) {
+	static void run( TranslationUnit & decls ) {
 		Pass<core_t> visitor;
 		accept_all( decls, visitor );
@@ -228,4 +228,9 @@
 	template<typename core_type>
 	friend void accept_all( std::list< ptr<Decl> > & decls, Pass<core_type>& visitor );
+
+	bool isInFunction() const {
+		return inFunction;
+	}
+
 private:
 
@@ -235,4 +240,8 @@
 	const ast::Stmt * call_accept( const ast::Stmt * );
 	const ast::Expr * call_accept( const ast::Expr * );
+
+	// Requests that WithStmtsToAdd statements be added directly to this statement, as if it were a compound.
+
+	const ast::Stmt * call_accept_as_compound(const ast::Stmt *);
 
 	template< typename node_t >
@@ -257,4 +266,7 @@
 	template<typename node_t, typename parent_t, typename child_t>
 	void maybe_accept(const node_t * &, child_t parent_t::* child);
+
+	template<typename node_t, typename parent_t, typename child_t>
+	void maybe_accept_as_compound(const node_t * &, child_t parent_t::* child);
 
 private:
@@ -284,4 +296,5 @@
 private:
 	bool inFunction = false;
+	bool atFunctionTop = false;
 };
 
@@ -289,4 +302,7 @@
 template<typename core_t>
 void accept_all( std::list< ast::ptr<ast::Decl> > &, ast::Pass<core_t> & visitor );
+
+template<typename core_t>
+void accept_all( ast::TranslationUnit &, ast::Pass<core_t> & visitor );
 
 //-------------------------------------------------------------------------------------------------
@@ -371,4 +387,8 @@
 struct WithVisitorRef {
 	Pass<core_t> * const visitor = nullptr;
+
+	bool isInFunction() const {
+		return visitor->isInFunction();
+	}
 };
 
Index: src/AST/Pass.impl.hpp
===================================================================
--- src/AST/Pass.impl.hpp	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/AST/Pass.impl.hpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -20,4 +20,5 @@
 #include <unordered_map>
 
+#include "AST/TranslationUnit.hpp"
 #include "AST/TypeSubstitution.hpp"
 
@@ -167,4 +168,12 @@
 		__pedantic_pass_assert( stmt );
 
+		return stmt->accept( *this );
+	}
+
+	template< typename core_t >
+	const ast::Stmt * ast::Pass< core_t >::call_accept_as_compound( const ast::Stmt * stmt ) {
+		__pedantic_pass_assert( __visit_children() );
+		__pedantic_pass_assert( stmt );
+
 		// add a few useful symbols to the scope
 		using __pass::empty;
@@ -334,4 +343,28 @@
 	}
 
+	template< typename core_t >
+	template<typename node_t, typename parent_t, typename child_t>
+	void ast::Pass< core_t >::maybe_accept_as_compound(
+		const node_t * & parent,
+		child_t parent_t::*child
+	) {
+		static_assert( std::is_base_of<parent_t, node_t>::value, "Error deducing member object" );
+		// Visits parent->*child through call_accept_as_compound; nothing is done when __pass::skip says so.
+		if(__pass::skip(parent->*child)) return;
+		const auto & old_val = __pass::get(parent->*child, 0);
+
+		static_assert( !std::is_same<const ast::Node * &, decltype(old_val)>::value, "ERROR");
+
+		auto new_val = call_accept_as_compound( old_val );
+
+		static_assert( !std::is_same<const ast::Node *, decltype(new_val)>::value || std::is_same<int, decltype(old_val)>::value, "ERROR");
+		// The child changed: store it in the node returned by __pass::mutate and hand that back through parent.
+		if( __pass::differs(old_val, new_val) ) {
+			auto new_parent = __pass::mutate<core_t>(parent);
+			new_parent->*child = new_val;
+			parent = new_parent;
+		}
+	}
+
 
 	template< typename core_t >
@@ -398,4 +431,9 @@
 	pass_visitor_stats.depth--;
 	if ( !errors.isEmpty() ) { throw errors; }
+}
+
+template< typename core_t >
+inline void ast::accept_all( ast::TranslationUnit & unit, ast::Pass< core_t > & visitor ) {
+	return ast::accept_all( unit.decls, visitor );
 }
 
@@ -470,8 +508,11 @@
 				// foralls are still in function type
 				maybe_accept( node, &FunctionDecl::type );
-				// function body needs to have the same scope as parameters - CompoundStmt will not enter
-				// a new scope if inFunction is true
+				// First remember that we are now within a function.
 				ValueGuard< bool > oldInFunction( inFunction );
 				inFunction = true;
+				// The function body needs to have the same scope as parameters.
+				// A CompoundStmt will not enter a new scope if atFunctionTop is true.
+				ValueGuard< bool > oldAtFunctionTop( atFunctionTop );
+				atFunctionTop = true;
 				maybe_accept( node, &FunctionDecl::stmts );
 				maybe_accept( node, &FunctionDecl::attributes );
@@ -639,16 +680,16 @@
 const ast::CompoundStmt * ast::Pass< core_t >::visit( const ast::CompoundStmt * node ) {
 	VISIT_START( node );
-	VISIT({
-		// do not enter a new scope if inFunction is true - needs to check old state before the assignment
-		auto guard1 = makeFuncGuard( [this, inFunctionCpy = this->inFunction]() {
-			if ( ! inFunctionCpy ) __pass::symtab::enter(core, 0);
-		}, [this, inFunctionCpy = this->inFunction]() {
-			if ( ! inFunctionCpy ) __pass::symtab::leave(core, 0);
+	VISIT(
+		// Do not enter (or leave) a new scope if atFunctionTop. Remember to save the result.
+		auto guard1 = makeFuncGuard( [this, enterScope = !this->atFunctionTop]() {
+			if ( enterScope ) __pass::symtab::enter(core, 0);
+		}, [this, leaveScope = !this->atFunctionTop]() {
+			if ( leaveScope ) __pass::symtab::leave(core, 0);
 		});
-		ValueGuard< bool > guard2( inFunction );
+		ValueGuard< bool > guard2( atFunctionTop );
+		atFunctionTop = false;
 		guard_scope guard3 { *this };
-		inFunction = false;
 		maybe_accept( node, &CompoundStmt::kids );
-	})
+	)
 	VISIT_END( CompoundStmt, node );
 }
@@ -703,6 +744,6 @@
 		maybe_accept( node, &IfStmt::inits    );
 		maybe_accept( node, &IfStmt::cond     );
-		maybe_accept( node, &IfStmt::thenPart );
-		maybe_accept( node, &IfStmt::elsePart );
+		maybe_accept_as_compound( node, &IfStmt::thenPart );
+		maybe_accept_as_compound( node, &IfStmt::elsePart );
 	})
 
@@ -721,5 +762,5 @@
 		maybe_accept( node, &WhileStmt::inits );
 		maybe_accept( node, &WhileStmt::cond  );
-		maybe_accept( node, &WhileStmt::body  );
+		maybe_accept_as_compound( node, &WhileStmt::body  );
 	})
 
@@ -736,8 +777,9 @@
 		// for statements introduce a level of scope (for the initialization)
 		guard_symtab guard { *this };
+		// xxx - old ast does not create WithStmtsToAdd scope for loop inits. should revisit this later.
 		maybe_accept( node, &ForStmt::inits );
 		maybe_accept( node, &ForStmt::cond  );
 		maybe_accept( node, &ForStmt::inc   );
-		maybe_accept( node, &ForStmt::body  );
+		maybe_accept_as_compound( node, &ForStmt::body  );
 	})
 
@@ -834,5 +876,5 @@
 		maybe_accept( node, &CatchStmt::decl );
 		maybe_accept( node, &CatchStmt::cond );
-		maybe_accept( node, &CatchStmt::body );
+		maybe_accept_as_compound( node, &CatchStmt::body );
 	})
 
Index: src/AST/Pass.proto.hpp
===================================================================
--- src/AST/Pass.proto.hpp	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/AST/Pass.proto.hpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -22,4 +22,6 @@
 template<typename core_t>
 class Pass;
+
+class TranslationUnit;
 
 struct PureVisitor;
Index: src/AST/SymbolTable.cpp
===================================================================
--- src/AST/SymbolTable.cpp	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/AST/SymbolTable.cpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -335,11 +335,11 @@
 }
 
-/*
-void SymbolTable::addFunctionType( const FunctionType * ftype ) {
-	addTypes( ftype->forall );
-	addIds( ftype->returns );
-	addIds( ftype->params );
-}
-*/
+/// Passes func's forall type variables to addTypes, and its return and parameter declarations to addIds.
+void SymbolTable::addFunction( const FunctionDecl * func ) {
+	addTypes( func->type->forall );
+	addIds( func->returns );
+	addIds( func->params );
+}
+
 
 void SymbolTable::lazyInitScope() {
Index: src/AST/SymbolTable.hpp
===================================================================
--- src/AST/SymbolTable.hpp	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/AST/SymbolTable.hpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -145,5 +145,5 @@
 
 	/// convenience function for adding all of the declarations in a function type to the indexer
-	// void addFunctionType( const FunctionType * ftype );
+	void addFunction( const FunctionDecl * );
 
 private:
Index: src/AST/TranslationUnit.hpp
===================================================================
--- src/AST/TranslationUnit.hpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
+++ src/AST/TranslationUnit.hpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -0,0 +1,44 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2015 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// TranslationUnit.hpp --
+//
+// Author           : Andrew Beach
+// Created On       : Tue Jun 11 15:30:00 2019
+// Last Modified By : Andrew Beach
+// Last Modified On : Tue Jun 11 15:42:00 2019
+// Update Count     : 0
+//
+
+#pragma once
+
+#include <list>
+#include <map>
+#include <vector>
+
+#include "Fwd.hpp"
+
+namespace ast {
+/// The declarations of one translation unit, plus a few program-wide nodes kept alongside them.
+struct TranslationUnit {
+	std::list< ptr< Decl > > decls;
+
+	struct Globals {
+		std::map< UniqueId, Decl * > idMap;
+		Type * sizeType = nullptr;  // null until assigned, so a default-constructed unit holds no indeterminate pointers
+		FunctionDecl * dereference = nullptr;
+		StructDecl * dtorStruct = nullptr;
+		FunctionDecl * dtorDestroy = nullptr;
+	} global;
+};
+
+}
+
+// Local Variables: //
+// tab-width: 4 //
+// mode: c++ //
+// compile-command: "make install" //
+// End: //
Index: src/AST/Type.cpp
===================================================================
--- src/AST/Type.cpp	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/AST/Type.cpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -157,4 +157,10 @@
 
 template<typename decl_t>
+SueInstType<decl_t>::SueInstType(
+	const base_type * b, std::vector<ptr<Expr>> && params,
+	CV::Qualifiers q, std::vector<ptr<Attribute>> && as )
+: BaseInstType( b->name, std::move(params), q, std::move(as) ), base( b ) {}
+
+template<typename decl_t>
 bool SueInstType<decl_t>::isComplete() const {
 	return base ? base->body : false;
Index: src/AST/Type.hpp
===================================================================
--- src/AST/Type.hpp	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/AST/Type.hpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -302,7 +302,4 @@
 class FunctionType final : public ParameterizedType {
 public:
-//	std::vector<ptr<DeclWithType>> returns;
-//	std::vector<ptr<DeclWithType>> params;
-
 	std::vector<ptr<Type>> returns;
 	std::vector<ptr<Type>> params;
@@ -345,4 +342,9 @@
 	: ParameterizedType(q, std::move(as)), params(), name(n) {}
 
+	BaseInstType(
+		const std::string& n, std::vector<ptr<Expr>> && params,
+		CV::Qualifiers q = {}, std::vector<ptr<Attribute>> && as = {} )
+	: ParameterizedType(q, std::move(as)), params(std::move(params)), name(n) {}
+
 	BaseInstType( const BaseInstType & o );
 
@@ -369,5 +371,9 @@
 
 	SueInstType(
-		const decl_t * b, CV::Qualifiers q = {}, std::vector<ptr<Attribute>> && as = {} );
+		const base_type * b, CV::Qualifiers q = {}, std::vector<ptr<Attribute>> && as = {} );
+
+	SueInstType(
+		const base_type * b, std::vector<ptr<Expr>> && params,
+		CV::Qualifiers q = {}, std::vector<ptr<Attribute>> && as = {} );
 
 	bool isComplete() const override;
Index: src/AST/porting.md
===================================================================
--- src/AST/porting.md	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/AST/porting.md	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -30,4 +30,8 @@
   * Base nodes now override `const Node * accept( Visitor & v ) const = 0` with, e.g. `const Stmt * accept( Visitor & v ) const override = 0`
 * `PassVisitor` is replaced with `ast::Pass`
+  * Most one-shot uses can go through `ast::Pass::run` and `ast::Pass::read`.
+
+`WithConstTypeSubstitution`
+* `env` => `typeSubs`
 
 ## Structural Changes ##
@@ -146,4 +150,9 @@
   * allows `newObject` as just default settings
 
+`FunctionDecl`
+* `params` and `returns` added.
+  * Contain the declarations of the parameters and return variables.
+  * Types should match (even be shared with) the fields of `type`.
+
 `NamedTypeDecl`
 * `parameters` => `params`
@@ -154,4 +163,7 @@
 `AggregateDecl`
 * `parameters` => `params`
+
+`StructDecl`
+* `makeInst` is replaced by a better constructor on `StructInstType`.
 
 `Expr`
@@ -245,5 +257,5 @@
 * **TODO** move `kind`, `typeNames` into code generator
 
-`ReferenceToType`
+`ReferenceToType` => `BaseInstType`
 * deleted `get_baseParameters()` from children
   * replace with `aggr() ? aggr()->params : nullptr`
@@ -261,5 +273,10 @@
 * `returnVals` => `returns`
 * `parameters` => `params`
+  * Both now just point at types.
 * `bool isVarArgs;` => `enum ArgumentFlag { FixedArgs, VariableArgs }; ArgumentFlag isVarArgs;`
+
+`SueInstType`
+* Template class whose specializations, together with `using` aliases, implement some other types:
+  * `StructInstType`, `UnionInstType` & `EnumInstType`
 
 `TypeInstType`
Index: src/Common/PassVisitor.h
===================================================================
--- src/Common/PassVisitor.h	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/Common/PassVisitor.h	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -354,6 +354,11 @@
 	virtual TypeSubstitution * mutate( TypeSubstitution * sub ) final;
 
+	bool isInFunction() const {
+		return inFunction;
+	}
+
 private:
 	bool inFunction = false;
+	bool atFunctionTop = false;
 
 	template<typename pass_t> friend void acceptAll( std::list< Declaration* > &decls, PassVisitor< pass_t >& visitor );
@@ -526,4 +531,8 @@
 public:
 	PassVisitor<pass_type> * const visitor = nullptr;
+
+	bool isInFunction() const {
+		return visitor->isInFunction();
+	}
 };
 
Index: src/Common/PassVisitor.impl.h
===================================================================
--- src/Common/PassVisitor.impl.h	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/Common/PassVisitor.impl.h	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -532,8 +532,11 @@
 			indexerAddId( &func );
 			maybeAccept_impl( node->type, *this );
-			// function body needs to have the same scope as parameters - CompoundStmt will not enter
-			// a new scope if inFunction is true
+			// First remember that we are now within a function.
 			ValueGuard< bool > oldInFunction( inFunction );
 			inFunction = true;
+			// The function body needs to have the same scope as parameters.
+			// A CompoundStmt will not enter a new scope if atFunctionTop is true.
+			ValueGuard< bool > oldAtFunctionTop( atFunctionTop );
+			atFunctionTop = true;
 			maybeAccept_impl( node->statements, *this );
 			maybeAccept_impl( node->attributes, *this );
@@ -567,8 +570,11 @@
 			indexerAddId( &func );
 			maybeAccept_impl( node->type, *this );
-			// function body needs to have the same scope as parameters - CompoundStmt will not enter
-			// a new scope if inFunction is true
+			// First remember that we are now within a function.
 			ValueGuard< bool > oldInFunction( inFunction );
 			inFunction = true;
+			// The function body needs to have the same scope as parameters.
+			// A CompoundStmt will not enter a new scope if atFunctionTop is true.
+			ValueGuard< bool > oldAtFunctionTop( atFunctionTop );
+			atFunctionTop = true;
 			maybeAccept_impl( node->statements, *this );
 			maybeAccept_impl( node->attributes, *this );
@@ -601,8 +607,11 @@
 			indexerAddId( &func );
 			maybeMutate_impl( node->type, *this );
-			// function body needs to have the same scope as parameters - CompoundStmt will not enter
-			// a new scope if inFunction is true
+			// First remember that we are now within a function.
 			ValueGuard< bool > oldInFunction( inFunction );
 			inFunction = true;
+			// The function body needs to have the same scope as parameters.
+			// A CompoundStmt will not enter a new scope if atFunctionTop is true.
+			ValueGuard< bool > oldAtFunctionTop( atFunctionTop );
+			atFunctionTop = true;
 			maybeMutate_impl( node->statements, *this );
 			maybeMutate_impl( node->attributes, *this );
@@ -1007,9 +1016,9 @@
 	VISIT_START( node );
 	{
-		// do not enter a new scope if inFunction is true - needs to check old state before the assignment
-		ValueGuard< bool > oldInFunction( inFunction );
-		auto guard1 = makeFuncGuard( [this, &oldInFunction]() { if ( ! oldInFunction.old ) indexerScopeEnter(); }, [this, &oldInFunction]() { if ( ! oldInFunction.old ) indexerScopeLeave(); } );
+		// Do not enter a new scope if atFunctionTop is true, don't leave one either.
+		ValueGuard< bool > oldAtFunctionTop( atFunctionTop );
+		auto guard1 = makeFuncGuard( [this, go = !atFunctionTop]() { if ( go ) indexerScopeEnter(); }, [this, go = !atFunctionTop]() { if ( go ) indexerScopeLeave(); } );
 		auto guard2 = makeFuncGuard( [this]() { call_beginScope();   }, [this]() { call_endScope();     } );
-		inFunction = false;
+		atFunctionTop = false;
 		visitStatementList( node->kids );
 	}
@@ -1021,9 +1030,9 @@
 	VISIT_START( node );
 	{
-		// do not enter a new scope if inFunction is true - needs to check old state before the assignment
-		ValueGuard< bool > oldInFunction( inFunction );
-		auto guard1 = makeFuncGuard( [this, &oldInFunction]() { if ( ! oldInFunction.old ) indexerScopeEnter(); }, [this, &oldInFunction]() { if ( ! oldInFunction.old ) indexerScopeLeave(); } );
+		// Do not enter a new scope if atFunctionTop is true, don't leave one either.
+		ValueGuard< bool > oldAtFunctionTop( atFunctionTop );
+		auto guard1 = makeFuncGuard( [this, go = !atFunctionTop]() { if ( go ) indexerScopeEnter(); }, [this, go = !atFunctionTop]() { if ( go ) indexerScopeLeave(); } );
 		auto guard2 = makeFuncGuard( [this]() { call_beginScope();   }, [this]() { call_endScope();     } );
-		inFunction = false;
+		atFunctionTop = false;
 		visitStatementList( node->kids );
 	}
@@ -1035,9 +1044,9 @@
 	MUTATE_START( node );
 	{
-		// do not enter a new scope if inFunction is true - needs to check old state before the assignment
-		ValueGuard< bool > oldInFunction( inFunction );
-		auto guard1 = makeFuncGuard( [this, &oldInFunction]() { if ( ! oldInFunction.old ) indexerScopeEnter(); }, [this, &oldInFunction]() { if ( ! oldInFunction.old ) indexerScopeLeave(); } );
+		// Do not enter a new scope if atFunctionTop is true, don't leave one either.
+		ValueGuard< bool > oldAtFunctionTop( atFunctionTop );
+		auto guard1 = makeFuncGuard( [this, go = !atFunctionTop]() { if ( go ) indexerScopeEnter(); }, [this, go = !atFunctionTop]() { if ( go ) indexerScopeLeave(); } );
 		auto guard2 = makeFuncGuard( [this]() { call_beginScope();   }, [this]() { call_endScope();     } );
-		inFunction = false;
+		atFunctionTop = false;
 		mutateStatementList( node->kids );
 	}
Index: src/Common/utility.h
===================================================================
--- src/Common/utility.h	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/Common/utility.h	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -360,7 +360,8 @@
 	reverse_iterate_t( T & ref ) : ref(ref) {}
 
-	typedef typename T::reverse_iterator iterator;
-	iterator begin() { return ref.rbegin(); }
-	iterator end() { return ref.rend(); }
+	// this does NOT work on const T!!!
+	// typedef typename T::reverse_iterator iterator;
+	auto begin() { return ref.rbegin(); }
+	auto end() { return ref.rend(); }
 };
 
Index: src/Concurrency/Keywords.cc
===================================================================
--- src/Concurrency/Keywords.cc	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/Concurrency/Keywords.cc	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -46,4 +46,12 @@
 	}
 
+	// Only detects threads constructed with the keyword thread.
+	inline static bool isThread( DeclarationWithType * decl ) {
+		Type * baseType = decl->get_type()->stripDeclarator();
+		StructInstType * instType = dynamic_cast<StructInstType *>( baseType );
+		if ( nullptr == instType ) { return false; }
+		return instType->baseStruct->is_thread();
+	}
+
 	//=============================================================================================
 	// Pass declarations
@@ -119,5 +127,5 @@
 			"get_thread",
 			"thread keyword requires threads to be in scope, add #include <thread.hfa>\n",
-			"",
+			"ThreadCancelled",
 			true,
 			AggregateDecl::Thread
@@ -290,6 +298,7 @@
 		std::list<DeclarationWithType*> findMutexArgs( FunctionDecl*, bool & first );
 		void validate( DeclarationWithType * );
-		void addDtorStatments( FunctionDecl* func, CompoundStmt *, const std::list<DeclarationWithType * > &);
-		void addStatments( FunctionDecl* func, CompoundStmt *, const std::list<DeclarationWithType * > &);
+		void addDtorStatements( FunctionDecl* func, CompoundStmt *, const std::list<DeclarationWithType * > &);
+		void addStatements( FunctionDecl* func, CompoundStmt *, const std::list<DeclarationWithType * > &);
+		void addThreadDtorStatements( FunctionDecl* func, CompoundStmt * body, const std::list<DeclarationWithType * > & args );
 
 		static void implement( std::list< Declaration * > & translationUnit ) {
@@ -302,4 +311,5 @@
 		StructDecl* guard_decl = nullptr;
 		StructDecl* dtor_guard_decl = nullptr;
+		StructDecl* thread_guard_decl = nullptr;
 
 		static std::unique_ptr< Type > generic_func;
@@ -801,5 +811,5 @@
 		bool first = false;
 		std::list<DeclarationWithType*> mutexArgs = findMutexArgs( decl, first );
-		bool isDtor = CodeGen::isDestructor( decl->name );
+		bool const isDtor = CodeGen::isDestructor( decl->name );
 
 		// Is this function relevant to monitors
@@ -849,9 +859,15 @@
 
 		// Instrument the body
-		if( isDtor ) {
-			addDtorStatments( decl, body, mutexArgs );
+		if ( isDtor && isThread( mutexArgs.front() ) ) {
+			if( !thread_guard_decl ) {
+				SemanticError( decl, "thread destructor requires threads to be in scope, add #include <thread.hfa>\n" );
+			}
+			addThreadDtorStatements( decl, body, mutexArgs );
+		}
+		else if ( isDtor ) {
+			addDtorStatements( decl, body, mutexArgs );
 		}
 		else {
-			addStatments( decl, body, mutexArgs );
+			addStatements( decl, body, mutexArgs );
 		}
 	}
@@ -870,4 +886,8 @@
 			assert( !dtor_guard_decl );
 			dtor_guard_decl = decl;
+		}
+		else if( decl->name == "thread_dtor_guard_t" && decl->body ) {
+			assert( !thread_guard_decl );
+			thread_guard_decl = decl;
 		}
 	}
@@ -908,5 +928,5 @@
 	}
 
-	void MutexKeyword::addDtorStatments( FunctionDecl* func, CompoundStmt * body, const std::list<DeclarationWithType * > & args ) {
+	void MutexKeyword::addDtorStatements( FunctionDecl* func, CompoundStmt * body, const std::list<DeclarationWithType * > & args ) {
 		Type * arg_type = args.front()->get_type()->clone();
 		arg_type->set_mutex( false );
@@ -957,8 +977,44 @@
 
 		//$monitor * __monitors[] = { get_monitor(a), get_monitor(b) };
-		body->push_front( new DeclStmt( monitors) );
-	}
-
-	void MutexKeyword::addStatments( FunctionDecl* func, CompoundStmt * body, const std::list<DeclarationWithType * > & args ) {
+		body->push_front( new DeclStmt( monitors ) );
+	}
+
+	void MutexKeyword::addThreadDtorStatements(
+			FunctionDecl*, CompoundStmt * body,
+			const std::list<DeclarationWithType * > & args ) {
+		assert( args.size() == 1 );
+		DeclarationWithType * arg = args.front();
+		Type * arg_type = arg->get_type()->clone();
+		assert( arg_type->get_mutex() );
+		arg_type->set_mutex( false );
+
+		// thread_dtor_guard_t __guard = { this, intptr( 0 ) };
+		body->push_front(
+			new DeclStmt( new ObjectDecl(
+				"__guard",
+				noStorageClasses,
+				LinkageSpec::Cforall,
+				nullptr,
+				new StructInstType(
+					noQualifiers,
+					thread_guard_decl
+				),
+				new ListInit(
+					{
+						new SingleInit( new CastExpr( new VariableExpr( arg ), arg_type ) ),
+						new SingleInit( new UntypedExpr(
+							new NameExpr( "intptr" ), {
+								new ConstantExpr( Constant::from_int( 0 ) ),
+							}
+						) ),
+					},
+					noDesignators,
+					true
+				)
+			))
+		);
+	}
+
+	void MutexKeyword::addStatements( FunctionDecl* func, CompoundStmt * body, const std::list<DeclarationWithType * > & args ) {
 		ObjectDecl * monitors = new ObjectDecl(
 			"__monitors",
Index: src/GenPoly/GenPoly.cc
===================================================================
--- src/GenPoly/GenPoly.cc	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/GenPoly/GenPoly.cc	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -46,4 +46,12 @@
 		}
 
+		bool hasPolyParams( const std::vector<ast::ptr<ast::Expr>> & params, const ast::TypeSubstitution * env) {
+			for (auto &param : params) {
+				auto paramType = param.strict_as<ast::TypeExpr>();
+				if (isPolyType(paramType->type, env)) return true;
+			}
+			return false;
+		}
+
 		/// Checks a parameter list for polymorphic parameters from tyVars; will substitute according to env if present
 		bool hasPolyParams( std::list< Expression* >& params, const TyVarMap &tyVars, const TypeSubstitution *env ) {
@@ -56,4 +64,12 @@
 		}
 
+		bool hasPolyParams( const std::vector<ast::ptr<ast::Expr>> & params, const TyVarMap & tyVars, const ast::TypeSubstitution * env) {
+			for (auto &param : params) {
+				auto paramType = param.strict_as<ast::TypeExpr>();
+				if (isPolyType(paramType->type, tyVars, env)) return true;
+			}
+			return false;
+		}
+
 		/// Checks a parameter list for dynamic-layout parameters from tyVars; will substitute according to env if present
 		bool hasDynParams( std::list< Expression* >& params, const TyVarMap &tyVars, const TypeSubstitution *env ) {
@@ -92,4 +108,13 @@
 			Type *newType = env->lookup( typeInst->get_name() );
 			if ( newType ) return newType;
+		}
+		return type;
+	}
+
+	const ast::Type * replaceTypeInst(const ast::Type * type, const ast::TypeSubstitution * env) {
+		if (!env) return type;
+		if (auto typeInst = dynamic_cast<const ast::TypeInstType*> (type)) {
+			auto newType = env->lookup(typeInst->name);
+			if (newType) return newType;
 		}
 		return type;
@@ -111,4 +136,19 @@
 	}
 
+	const ast::Type * isPolyType(const ast::Type * type, const ast::TypeSubstitution * env) {
+		type = replaceTypeInst( type, env );
+
+		if ( dynamic_cast< const ast::TypeInstType * >( type ) ) {
+			return type;
+		} else if ( auto arrayType = dynamic_cast< const ast::ArrayType * >( type ) ) {
+			return isPolyType( arrayType->base, env );
+		} else if ( auto structType = dynamic_cast< const ast::StructInstType* >( type ) ) {
+			if ( hasPolyParams( structType->params, env ) ) return type;
+		} else if ( auto unionType = dynamic_cast< const ast::UnionInstType* >( type ) ) {
+			if ( hasPolyParams( unionType->params, env ) ) return type;
+		}
+		return 0;
+	}
+
 	Type *isPolyType( Type *type, const TyVarMap &tyVars, const TypeSubstitution *env ) {
 		type = replaceTypeInst( type, env );
@@ -126,4 +166,19 @@
 		}
 		return 0;
+	}
+
+	const ast::Type * isPolyType(const ast::Type * type, const TyVarMap & tyVars, const ast::TypeSubstitution * env) {
+		type = replaceTypeInst( type, env ); // look through env first (no-op when env is null)
+		// Only type variables bound in tyVars count, so tyVars must be passed to every nested check below.
+		if ( auto typeInst = dynamic_cast< const ast::TypeInstType * >( type ) ) {
+			return tyVars.find(typeInst->name) != tyVars.end() ? type : nullptr;
+		} else if ( auto arrayType = dynamic_cast< const ast::ArrayType * >( type ) ) {
+			return isPolyType( arrayType->base, tyVars, env );
+		} else if ( auto structType = dynamic_cast< const ast::StructInstType* >( type ) ) {
+			if ( hasPolyParams( structType->params, tyVars, env ) ) return type;
+		} else if ( auto unionType = dynamic_cast< const ast::UnionInstType* >( type ) ) {
+			if ( hasPolyParams( unionType->params, tyVars, env ) ) return type;
+		}
+		return nullptr;
 	}
 
@@ -449,4 +504,12 @@
 	}
 
+	namespace {
+		// temporary hack to avoid re-implementing anything related to TyVarMap
+		// does this work? these two structs have identical definitions.
+		inline TypeDecl::Data convData(const ast::TypeDecl::Data & data) {
+			return *reinterpret_cast<const TypeDecl::Data *>(&data);
+		}
+	}
+
 	bool needsBoxing( Type * param, Type * arg, const TyVarMap &exprTyVars, const TypeSubstitution * env ) {
 		// is parameter is not polymorphic, don't need to box
@@ -459,4 +522,13 @@
 	}
 
+	bool needsBoxing( const ast::Type * param, const ast::Type * arg, const TyVarMap &exprTyVars, const ast::TypeSubstitution * env) {
+		// if parameter is not polymorphic, don't need to box
+		if ( ! isPolyType( param, exprTyVars ) ) return false;
+		ast::ptr<ast::Type> newType = arg;
+		if ( env ) env->apply( newType );
+		// if the argument's type is polymorphic, we don't need to box again!
+		return ! isPolyType( newType );
+	}
+
 	bool needsBoxing( Type * param, Type * arg, ApplicationExpr * appExpr, const TypeSubstitution * env ) {
 		FunctionType * function = getFunctionType( appExpr->function->result );
@@ -467,6 +539,19 @@
 	}
 
+	bool needsBoxing( const ast::Type * param, const ast::Type * arg, const ast::ApplicationExpr * appExpr, const ast::TypeSubstitution * env) {
+		const ast::FunctionType * function = getFunctionType(appExpr->func->result);
+		assertf( function, "ApplicationExpr has non-function type: %s", toString( appExpr->func->result ).c_str() );
+		TyVarMap exprTyVars(TypeDecl::Data{});
+		makeTyVarMap(function, exprTyVars);
+		return needsBoxing(param, arg, exprTyVars, env);
+
+	}
+
 	void addToTyVarMap( TypeDecl * tyVar, TyVarMap &tyVarMap ) {
 		tyVarMap.insert( tyVar->name, TypeDecl::Data{ tyVar } );
+	}
+
+	void addToTyVarMap( const ast::TypeDecl * tyVar, TyVarMap & tyVarMap) {
+		tyVarMap.insert(tyVar->name, convData(ast::TypeDecl::Data{tyVar}));
 	}
 
@@ -478,4 +563,16 @@
 		if ( PointerType *pointer = dynamic_cast< PointerType* >( type ) ) {
 			makeTyVarMap( pointer->get_base(), tyVarMap );
+		}
+	}
+
+	void makeTyVarMap(const ast::Type * type, TyVarMap & tyVarMap) {
+		if (auto ptype = dynamic_cast<const ast::ParameterizedType *>(type)) {
+ 			for (auto & tyVar : ptype->forall) {
+				assert (tyVar);
+				addToTyVarMap(tyVar, tyVarMap);
+			}
+		}
+		if (auto pointer = dynamic_cast<const ast::PointerType *>(type)) {
+			makeTyVarMap(pointer->base, tyVarMap);
 		}
 	}
Index: src/GenPoly/GenPoly.h
===================================================================
--- src/GenPoly/GenPoly.h	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/GenPoly/GenPoly.h	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -26,6 +26,6 @@
 
 namespace GenPoly {
+
 	typedef ErasableScopedMap< std::string, TypeDecl::Data > TyVarMap;
-
 	/// Replaces a TypeInstType by its referrent in the environment, if applicable
 	Type* replaceTypeInst( Type* type, const TypeSubstitution* env );
@@ -33,7 +33,9 @@
 	/// returns polymorphic type if is polymorphic type, NULL otherwise; will look up substitution in env if provided
 	Type *isPolyType( Type *type, const TypeSubstitution *env = 0 );
+	const ast::Type * isPolyType(const ast::Type * type, const ast::TypeSubstitution * env = nullptr);
 
 	/// returns polymorphic type if is polymorphic type in tyVars, NULL otherwise; will look up substitution in env if provided
 	Type *isPolyType( Type *type, const TyVarMap &tyVars, const TypeSubstitution *env = 0 );
+	const ast::Type * isPolyType(const ast::Type * type, const TyVarMap & tyVars, const ast::TypeSubstitution * env = nullptr);
 
 	/// returns dynamic-layout type if is dynamic-layout type in tyVars, NULL otherwise; will look up substitution in env if provided
@@ -84,7 +86,9 @@
 	/// true if arg requires boxing given exprTyVars
 	bool needsBoxing( Type * param, Type * arg, const TyVarMap &exprTyVars, const TypeSubstitution * env );
+	bool needsBoxing( const ast::Type * param, const ast::Type * arg, const TyVarMap &exprTyVars, const ast::TypeSubstitution * env);
 
 	/// true if arg requires boxing in the call to appExpr
 	bool needsBoxing( Type * param, Type * arg, ApplicationExpr * appExpr, const TypeSubstitution * env );
+	bool needsBoxing( const ast::Type * param, const ast::Type * arg, const ast::ApplicationExpr * appExpr, const ast::TypeSubstitution * env);
 
 	/// Adds the type variable `tyVar` to `tyVarMap`
@@ -93,4 +97,5 @@
 	/// Adds the declarations in the forall list of type (and its pointed-to type if it's a pointer type) to `tyVarMap`
 	void makeTyVarMap( Type *type, TyVarMap &tyVarMap );
+	void makeTyVarMap(const ast::Type * type, TyVarMap & tyVarMap);
 
 	/// Prints type variable map
Index: src/GenPoly/Specialize.cc
===================================================================
--- src/GenPoly/Specialize.cc	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/GenPoly/Specialize.cc	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -218,4 +218,8 @@
 		thunkFunc->get_attributes().push_back( new Attribute( "unused" ) );
 
+		// Thunks at the global level must be static to avoid collisions between files.
+		// (Conversely, thunks inside a function must be unique and not static.)
+		thunkFunc->storageClasses.is_static = !isInFunction();
+
 		// thread thunk parameters into call to actual function, naming thunk parameters as we go
 		UniqueName paramNamer( paramPrefix );
Index: src/InitTweak/FixGlobalInit.cc
===================================================================
--- src/InitTweak/FixGlobalInit.cc	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/InitTweak/FixGlobalInit.cc	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -34,4 +34,8 @@
 #include "SynTree/Visitor.h"       // for acceptAll, Visitor
 
+#include "AST/Expr.hpp"
+#include "AST/Node.hpp"
+#include "AST/Pass.hpp"
+
 namespace InitTweak {
 	class GlobalFixer : public WithShortCircuiting {
@@ -50,4 +54,18 @@
 		FunctionDecl * initFunction;
 		FunctionDecl * destroyFunction;
+	};
+
+	class GlobalFixer_new : public ast::WithShortCircuiting {
+	public:
+		void previsit (const ast::ObjectDecl *);
+		void previsit (const ast::FunctionDecl *) { visit_children = false; }
+		void previsit (const ast::StructDecl *) { visit_children = false; }
+		void previsit (const ast::UnionDecl *) { visit_children = false; }
+		void previsit (const ast::EnumDecl *) { visit_children = false; }
+		void previsit (const ast::TraitDecl *) { visit_children = false; }
+		void previsit (const ast::TypeDecl *) { visit_children = false; }
+
+		std::list< ast::ptr<ast::Stmt> > initStmts;
+		std::list< ast::ptr<ast::Stmt> > destroyStmts;
 	};
 
@@ -91,4 +109,27 @@
 	}
 
+	void fixGlobalInit(ast::TranslationUnit & translationUnit, bool inLibrary) {
+		ast::Pass<GlobalFixer_new> fixer;
+		accept_all(translationUnit, fixer);
+
+		if ( !fixer.core.initStmts.empty() ) {
+			std::vector<ast::ptr<ast::Expr>> ctorParams;
+			if (inLibrary) ctorParams.emplace_back(ast::ConstantExpr::from_int({}, 200));
+			auto initFunction = new ast::FunctionDecl({}, "__global_init__", {}, {}, {}, new ast::CompoundStmt({}, std::move(fixer.core.initStmts)), 
+				ast::Storage::Static, ast::Linkage::C, {new ast::Attribute("constructor", std::move(ctorParams))});
+
+			translationUnit.decls.emplace_back( initFunction );
+		} // if
+
+		if ( !fixer.core.destroyStmts.empty() ) {
+			std::vector<ast::ptr<ast::Expr>> dtorParams;
+			if (inLibrary) dtorParams.emplace_back(ast::ConstantExpr::from_int({}, 200));
+			auto destroyFunction = new ast::FunctionDecl({}, "__global_destroy__", {}, {}, {}, new ast::CompoundStmt({}, std::move(fixer.core.destroyStmts)), 
+				ast::Storage::Static, ast::Linkage::C, {new ast::Attribute("destructor", std::move(dtorParams))});
+
+			translationUnit.decls.emplace_back(destroyFunction);
+		} // if
+	}
+
 	void GlobalFixer::previsit( ObjectDecl *objDecl ) {
 		std::list< Statement * > & initStatements = initFunction->get_statements()->get_kids();
@@ -112,20 +153,5 @@
 			} // if
 			if ( Statement * ctor = ctorInit->ctor ) {
-				// Translation 1: Add this attribute on the global declaration:
-				//    __attribute__((section (".data#")))
-				// which makes gcc put the global in the data section,
-				// so that the global is writeable (via a const cast) in the init function.
-				// The trailing # is an injected assembly comment, to suppress the "a" in
-				//    .section .data,"a"
-				//    .section .data#,"a"
-				// to avoid assembler warning "ignoring changed section attributes for .data"
-				Type *strLitT = new PointerType( Type::Qualifiers( ),
-					new BasicType( Type::Qualifiers( ), BasicType::Char ) );
-				std::list< Expression * > attr_params;
-				attr_params.push_back( 
-					new ConstantExpr( Constant( strLitT, "\".data#\"", std::nullopt ) ) );
-				objDecl->attributes.push_back(new Attribute("section", attr_params));
-				// Translation 2: Move the initizliation off the global declaration,
-				// into the startup function.
+				addDataSectonAttribute( objDecl );
 				initStatements.push_back( ctor );
 				objDecl->init = nullptr;
@@ -142,4 +168,33 @@
 	}
 
+	void GlobalFixer_new::previsit(const ast::ObjectDecl * objDecl) {
+		auto mutDecl = mutate(objDecl);
+		assertf(mutDecl == objDecl, "Global object decl must be unique");
+		if ( auto ctorInit = objDecl->init.as<ast::ConstructorInit>() ) {
+			// a decision should have been made by the resolver, so ctor and init are not both non-NULL
+			assert( ! ctorInit->ctor || ! ctorInit->init );
+
+			const ast::Stmt * dtor = ctorInit->dtor;
+			if ( dtor && ! isIntrinsicSingleArgCallStmt( dtor ) ) {
+				// don't need to call intrinsic dtor, because it does nothing, but
+				// non-intrinsic dtors must be called
+				destroyStmts.push_front( dtor );
+				// ctorInit->dtor = nullptr;
+			} // if
+			if ( const ast::Stmt * ctor = ctorInit->ctor ) {
+				initStmts.push_back( ctor );
+				mutDecl->init = nullptr;
+				// ctorInit->ctor = nullptr;
+			} else if ( const ast::Init * init = ctorInit->init ) {
+				mutDecl->init = init;
+				// ctorInit->init = nullptr;
+			} else {
+				// no constructor and no initializer, which is okay
+				mutDecl->init = nullptr;
+			} // if
+			// delete ctorInit;
+		} // if
+	}
+
 	// only modify global variables
 	void GlobalFixer::previsit( FunctionDecl * ) { visit_children = false; }
Index: src/InitTweak/FixGlobalInit.h
===================================================================
--- src/InitTweak/FixGlobalInit.h	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/InitTweak/FixGlobalInit.h	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -19,4 +19,7 @@
 #include <string>  // for string
 
+#include <AST/Fwd.hpp>
+
+
 class Declaration;
 
@@ -26,4 +29,5 @@
 	/// function is for library code.
 	void fixGlobalInit( std::list< Declaration * > & translationUnit, bool inLibrary );
+	void fixGlobalInit( ast::TranslationUnit & translationUnit, bool inLibrary );
 } // namespace
 
Index: src/InitTweak/FixInit.cc
===================================================================
--- src/InitTweak/FixInit.cc	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/InitTweak/FixInit.cc	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -219,5 +219,5 @@
 		};
 
-		struct SplitExpressions : public WithShortCircuiting, public WithTypeSubstitution, public WithStmtsToAdd {
+		struct SplitExpressions : public WithShortCircuiting, /*public WithTypeSubstitution, */public WithStmtsToAdd {
 			/// add CompoundStmts around top-level expressions so that temporaries are destroyed in the correct places.
 			static void split( std::list< Declaration * > &translationUnit );
@@ -802,4 +802,10 @@
 				if ( Statement * ctor = ctorInit->get_ctor() ) {
 					if ( objDecl->get_storageClasses().is_static ) {
+
+						// The object needs to go in the data section, regardless of dtor complexity below.
+						// The attribute works, and is meant to apply, both for leaving the static local alone,
+						// and for hoisting it out as a static global.
+						addDataSectonAttribute( objDecl );
+
 						// originally wanted to take advantage of gcc nested functions, but
 						// we get memory errors with this approach. To remedy this, the static
Index: src/InitTweak/FixInit.h
===================================================================
--- src/InitTweak/FixInit.h	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/InitTweak/FixInit.h	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -20,8 +20,13 @@
 
 class Declaration;
+namespace ast {
+	class TranslationUnit;
+}
 
 namespace InitTweak {
 	/// replace constructor initializers with expression statements and unwrap basic C-style initializers
 	void fix( std::list< Declaration * > & translationUnit, bool inLibrary );
+
+	void fix( ast::TranslationUnit & translationUnit, bool inLibrary);
 } // namespace
 
Index: src/InitTweak/FixInitNew.cpp
===================================================================
--- src/InitTweak/FixInitNew.cpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
+++ src/InitTweak/FixInitNew.cpp	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -0,0 +1,1384 @@
+#include "FixInit.h"
+
+#include <stddef.h>                    // for NULL
+#include <algorithm>                   // for set_difference, copy_if
+#include <cassert>                     // for assert, strict_dynamic_cast
+#include <iostream>                    // for operator<<, ostream, basic_ost...
+#include <iterator>                    // for insert_iterator, back_inserter
+#include <list>                        // for _List_iterator, list, list<>::...
+#include <map>                         // for _Rb_tree_iterator, _Rb_tree_co...
+#include <memory>                      // for allocator_traits<>::value_type
+#include <set>                         // for set, set<>::value_type
+#include <unordered_map>               // for unordered_map, unordered_map<>...
+#include <unordered_set>               // for unordered_set
+#include <utility>                     // for pair
+
+#include "CodeGen/GenType.h"           // for genPrettyType
+#include "CodeGen/OperatorTable.h"
+#include "Common/PassVisitor.h"        // for PassVisitor, WithStmtsToAdd
+#include "Common/SemanticError.h"      // for SemanticError
+#include "Common/UniqueName.h"         // for UniqueName
+#include "Common/utility.h"            // for CodeLocation, ValueGuard, toSt...
+#include "FixGlobalInit.h"             // for fixGlobalInit
+#include "GenInit.h"                   // for genCtorDtor
+#include "GenPoly/GenPoly.h"           // for getFunctionType
+#include "InitTweak.h"                 // for getFunctionName, getCallArg
+#include "ResolvExpr/Resolver.h"       // for findVoidExpression
+#include "ResolvExpr/typeops.h"        // for typesCompatible
+#include "SymTab/Autogen.h"            // for genImplicitCall
+#include "SymTab/Indexer.h"            // for Indexer
+#include "SymTab/Mangler.h"            // for Mangler
+#include "SynTree/LinkageSpec.h"       // for C, Spec, Cforall, isBuiltin
+#include "SynTree/Attribute.h"         // for Attribute
+#include "SynTree/Constant.h"          // for Constant
+#include "SynTree/Declaration.h"       // for ObjectDecl, FunctionDecl, Decl...
+#include "SynTree/Expression.h"        // for UniqueExpr, VariableExpr, Unty...
+#include "SynTree/Initializer.h"       // for ConstructorInit, SingleInit
+#include "SynTree/Label.h"             // for Label, operator<
+#include "SynTree/Mutator.h"           // for mutateAll, Mutator, maybeMutate
+#include "SynTree/Statement.h"         // for ExprStmt, CompoundStmt, Branch...
+#include "SynTree/Type.h"              // for Type, Type::StorageClasses
+#include "SynTree/TypeSubstitution.h"  // for TypeSubstitution, operator<<
+#include "SynTree/DeclReplacer.h"      // for DeclReplacer
+#include "SynTree/Visitor.h"           // for acceptAll, maybeAccept
+#include "Validate/FindSpecialDecls.h" // for dtorStmt, dtorStructDestroy
+
+#include "AST/Expr.hpp"
+#include "AST/Node.hpp"
+#include "AST/Pass.hpp"
+#include "AST/Print.hpp"
+#include "AST/SymbolTable.hpp"
+#include "AST/Type.hpp"
+#include "AST/DeclReplacer.hpp"
+
+extern bool ctordtorp; // print all debug
+extern bool ctorp; // print ctor debug
+extern bool cpctorp; // print copy ctor debug
+extern bool dtorp; // print dtor debug
+#define PRINT( text ) if ( ctordtorp ) { text }
+#define CP_CTOR_PRINT( text ) if ( ctordtorp || cpctorp ) { text }
+#define DTOR_PRINT( text ) if ( ctordtorp || dtorp ) { text }
+
+namespace InitTweak {
+namespace {
+	struct SelfAssignChecker {
+		void previsit( const ast::ApplicationExpr * appExpr );
+	};
+
+	struct StmtExprResult {
+		const ast::StmtExpr * previsit( const ast::StmtExpr * stmtExpr );
+	};
+
+	/// wrap function application expressions as ImplicitCopyCtorExpr nodes so that it is easy to identify which
+	/// function calls need their parameters to be copy constructed
+	struct InsertImplicitCalls : public ast::WithConstTypeSubstitution, public ast::WithShortCircuiting {
+		const ast::Expr * postvisit( const ast::ApplicationExpr * appExpr );
+
+		// only handles each UniqueExpr once
+		// if order of visit does not change, this should be safe
+		void previsit (const ast::UniqueExpr *);
+
+		std::unordered_set<decltype(ast::UniqueExpr::id)> visitedIds;
+	};
+
+	/// generate temporary ObjectDecls for each argument and return value of each ImplicitCopyCtorExpr,
+	/// generate/resolve copy construction expressions for each, and generate/resolve destructors for both
+	/// arguments and return value temporaries
+	struct ResolveCopyCtors final : public ast::WithGuards, public ast::WithStmtsToAdd<>, public ast::WithSymbolTable, public ast::WithShortCircuiting, public ast::WithVisitorRef<ResolveCopyCtors> {
+		const ast::Expr * postvisit( const ast::ImplicitCopyCtorExpr * impCpCtorExpr );
+		const ast::StmtExpr * previsit( const ast::StmtExpr * stmtExpr );
+		const ast::UniqueExpr * previsit( const ast::UniqueExpr * unqExpr );
+
+		/// handles distant mutations of environment manually.
+		/// WithConstTypeSubstitution cannot remember where the environment is from
+
+		/// MUST be called at start of overload previsit
+		void previsit( const ast::Expr * expr);
+		/// MUST be called at return of overload postvisit
+		const ast::Expr * postvisit(const ast::Expr * expr);
+
+		/// create and resolve ctor/dtor expression: fname(var, [cpArg])
+		const ast::Expr * makeCtorDtor( const std::string & fname, const ast::ObjectDecl * var, const ast::Expr * cpArg = nullptr );
+		/// true if type does not need to be copy constructed to ensure correctness
+		bool skipCopyConstruct( const ast::Type * type );
+		ast::ptr< ast::Expr > copyConstructArg( const ast::Expr * arg, const ast::ImplicitCopyCtorExpr * impCpCtorExpr, const ast::Type * formal );
+		ast::Expr * destructRet( const ast::ObjectDecl * ret, const ast::Expr * arg );
+	private:
+		/// hack to implement WithTypeSubstitution while conforming to mutation safety.
+		ast::TypeSubstitution * env;
+		bool                    envModified;
+	};
+
+	/// collects constructed object decls - used as a base class
+	struct ObjDeclCollector : public ast::WithGuards, public ast::WithShortCircuiting {
+		// use ordered data structure to maintain ordering for set_difference and for consistent error messages
+		typedef std::list< const ast::ObjectDecl * > ObjectSet;
+		void previsit( const ast::CompoundStmt *compoundStmt );
+		void previsit( const ast::DeclStmt *stmt );
+
+		// don't go into other functions
+		void previsit( const ast::FunctionDecl * ) { visit_children = false; }
+
+	  protected:
+		ObjectSet curVars;
+	};
+
+	// debug
+	template<typename ObjectSet>
+	struct PrintSet {
+		PrintSet( const ObjectSet & objs ) : objs( objs ) {}
+		const ObjectSet & objs;
+	};
+	template<typename ObjectSet>
+	PrintSet<ObjectSet> printSet( const ObjectSet & objs ) { return PrintSet<ObjectSet>( objs ); }
+	template<typename ObjectSet>
+	std::ostream & operator<<( std::ostream & out, const PrintSet<ObjectSet> & set) {
+		out << "{ ";
+		for ( auto & obj : set.objs ) {
+			out << obj->name << ", " ;
+		} // for
+		out << " }";
+		return out;
+	}
+
+	struct LabelFinder final : public ObjDeclCollector {
+		typedef std::map< std::string, ObjectSet > LabelMap;
+		// map of Label -> live variables at that label
+		LabelMap vars;
+
+		typedef ObjDeclCollector Parent;
+		using Parent::previsit;
+		void previsit( const ast::Stmt * stmt );
+
+		void previsit( const ast::CompoundStmt *compoundStmt );
+		void previsit( const ast::DeclStmt *stmt );
+	};
+
+	/// insert destructor calls at the appropriate places.  must happen before CtorInit nodes are removed
+	/// (currently by FixInit)
+	struct InsertDtors final : public ObjDeclCollector, public ast::WithStmtsToAdd<> {
+		typedef std::list< ObjectDecl * > OrderedDecls;
+		typedef std::list< OrderedDecls > OrderedDeclsStack;
+
+		InsertDtors( ast::Pass<LabelFinder> & finder ) : finder( finder ), labelVars( finder.core.vars ) {}
+
+		typedef ObjDeclCollector Parent;
+		using Parent::previsit;
+
+		void previsit( const ast::FunctionDecl * funcDecl );
+
+		void previsit( const ast::BranchStmt * stmt );
+	private:
+		void handleGoto( const ast::BranchStmt * stmt );
+
+		ast::Pass<LabelFinder> & finder;
+		LabelFinder::LabelMap & labelVars;
+		OrderedDeclsStack reverseDeclOrder;
+	};
+
+	/// expand each object declaration to use its constructor after it is declared.
+	struct FixInit : public ast::WithStmtsToAdd<> {
+		static void fixInitializers( ast::TranslationUnit &translationUnit ); // driver: applies this pass to each top-level declaration
+
+		const ast::DeclWithType * postvisit( const ast::ObjectDecl *objDecl );
+
+		std::list< ast::ptr< ast::Decl > > staticDtorDecls; // declarations hoisted to global scope; fixInitializers splices them into the unit
+	};
+
+	/// generate default/copy ctor and dtor calls for user-defined struct ctor/dtors
+	/// for any member that is missing a corresponding ctor/dtor call.
+	/// reports an error if a member is used before it is constructed
+	struct GenStructMemberCalls final : public ast::WithGuards, public ast::WithShortCircuiting, public ast::WithSymbolTable, public ast::WithVisitorRef<GenStructMemberCalls> {
+		void previsit( const ast::FunctionDecl * funcDecl );
+		const ast::DeclWithType * postvisit( const ast::FunctionDecl * funcDecl );
+
+		void previsit( const ast::MemberExpr * memberExpr );
+		void previsit( const ast::ApplicationExpr * appExpr );
+
+		/// Note: this postvisit used to live in a separate visitor. If this pass breaks, one place to examine is whether it is
+		/// okay for this part of the recursion to occur alongside the rest.
+		const ast::Expr * postvisit( const ast::UntypedExpr * expr );
+
+		SemanticErrorException errors; // NOTE(review): presumably collects the errors reported through emit() - confirm
+	  private:
+		template< typename... Params >
+		void emit( CodeLocation, const Params &... params );
+
+		ast::FunctionDecl * function = nullptr;
+		std::set< const ast::DeclWithType * > unhandled; // NOTE(review): presumably members still lacking a ctor/dtor call - confirm
+		std::map< const ast::DeclWithType *, CodeLocation > usedUninit; // NOTE(review): presumably members used before construction, with the use site - confirm
+		const ast::ObjectDecl * thisParam = nullptr;
+		bool isCtor = false; // true if current function is a constructor
+		const ast::StructDecl * structDecl = nullptr;
+	};
+
+	/// expands each ConstructorExpr node into a comma expression, using a temporary for the first argument
+	struct FixCtorExprs final : public ast::WithDeclsToAdd<>, public ast::WithSymbolTable, public ast::WithShortCircuiting {
+		const ast::Expr * postvisit( const ast::ConstructorExpr * ctorExpr );
+	};
+
+	/// add CompoundStmts around top-level expressions so that temporaries are destroyed in the correct places.
+	struct SplitExpressions : public ast::WithShortCircuiting {
+		ast::Stmt * postvisit( const ast::ExprStmt * stmt ); // returns a CompoundStmt whose only statement is `stmt`
+		void previsit( const ast::TupleAssignExpr * expr ); // stops the traversal from entering tuple assignments
+	};
+} // namespace
+
+void fix( ast::TranslationUnit & translationUnit, bool inLibrary ) { // entry point: runs the passes below in this fixed order
+	ast::Pass<SelfAssignChecker>::run( translationUnit ); // emits the self-assignment warning
+
+	// fixes StmtExpr to properly link to their resulting expression
+	ast::Pass<StmtExprResult>::run( translationUnit );
+
+	// fixes ConstructorInit for global variables. should happen before fixInitializers.
+	InitTweak::fixGlobalInit( translationUnit, inLibrary );
+
+	// must happen before ResolveCopyCtors because temporaries have to be inserted into the correct scope
+	ast::Pass<SplitExpressions>::run( translationUnit );
+
+	ast::Pass<InsertImplicitCalls>::run( translationUnit ); // wraps calls in ImplicitCopyCtorExpr nodes for ResolveCopyCtors
+
+	// Needs to happen before ResolveCopyCtors, because argument/return temporaries should not be considered in
+	// error checking branch statements
+	{
+		ast::Pass<LabelFinder> finder; // scoped to this block; InsertDtors keeps references into it
+		ast::Pass<InsertDtors>::run( translationUnit, finder );
+	}
+
+	ast::Pass<ResolveCopyCtors>::run( translationUnit );
+	FixInit::fixInitializers( translationUnit );
+	ast::Pass<GenStructMemberCalls>::run( translationUnit );
+
+	// Needs to happen after GenStructMemberCalls, since otherwise member constructors exprs
+	// don't have the correct form, and a member can be constructed more than once.
+	ast::Pass<FixCtorExprs>::run( translationUnit );
+}
+
+namespace {
+	/// find and return the destructor used in `input`. If `input` is not a simple destructor call, generate a thunk
+	/// that wraps the destructor, insert it into `stmtsToAdd` and return the new function declaration
+	const ast::DeclWithType * getDtorFunc( const ast::ObjectDecl * objDecl, const ast::Stmt * input, std::list< ast::ptr<ast::Stmt> > & stmtsToAdd ) {
+		// validate `input` before its first dereference: the location is read right below,
+		// so asserting any later could not catch a null statement
+		assert( input );
+		const CodeLocation loc = input->location;
+		// collect the constructor/destructor calls contained in the statement
+		auto matches = collectCtorDtorCalls( input );
+
+		if ( dynamic_cast< const ast::ExprStmt * >( input ) ) {
+			// only one destructor call in the expression
+			if ( matches.size() == 1 ) {
+				auto func = getFunction( matches.front() );
+				assertf( func, "getFunction failed to find function in %s", toString( matches.front() ).c_str() );
+
+				// cleanup argument must be a function, not an object (including function pointer)
+				if ( auto dtorFunc = dynamic_cast< const ast::FunctionDecl * > ( func ) ) {
+					if ( dtorFunc->type->forall.empty() ) {
+						// simple case where the destructor is a monomorphic function call - can simply
+						// use that function as the cleanup function.
+						return func;
+					}
+				}
+			}
+		}
+
+		// otherwise the cleanup is more complicated - need to build a single argument cleanup function that
+		// wraps the more complicated code.
+		static UniqueName dtorNamer( "__cleanup_dtor" );
+		std::string name = dtorNamer.newName();
+		ast::FunctionDecl * dtorFunc = SymTab::genDefaultFunc( loc, name, objDecl->type->stripReferences(), false );
+		stmtsToAdd.push_back( new ast::DeclStmt(loc, dtorFunc ) );
+
+		// the original code contains uses of objDecl - replace them with the newly generated 'this' parameter.
+		const ast::ObjectDecl * thisParam = getParamThis( dtorFunc );
+		const ast::Expr * replacement = new ast::VariableExpr( loc, thisParam );
+
+		auto base = replacement->result->stripReferences();
+		if ( dynamic_cast< const ast::ArrayType * >( base ) || dynamic_cast< const ast::TupleType * > ( base ) ) {
+			// need to cast away reference for array types, since the destructor is generated without the reference type,
+			// and for tuple types since tuple indexing does not work directly on a reference
+			replacement = new ast::CastExpr( replacement, base );
+		}
+		auto dtor = ast::DeclReplacer::replace( input, ast::DeclReplacer::ExprMap{ std::make_pair( objDecl, replacement ) } );
+		auto mutStmts = dtorFunc->stmts.get_and_mutate();
+		mutStmts->push_back(strict_dynamic_cast<const ast::Stmt *>( dtor ));
+		dtorFunc->stmts = mutStmts;
+
+		return dtorFunc;
+	}
+
+	void FixInit::fixInitializers( ast::TranslationUnit & translationUnit ) {
+		ast::Pass<FixInit> fixer;
+
+		// can't use mutateAll, because need to insert declarations at top-level
+		// can't use DeclMutator, because sometimes need to insert IfStmt, etc.
+		SemanticErrorException errors;
+		for ( auto i = translationUnit.decls.begin(); i != translationUnit.decls.end(); ++i ) {
+			try {
+				// the translation unit should never contain null declarations, so accept is called unconditionally
+				*i = (*i)->accept(fixer);
+				translationUnit.decls.splice( i, fixer.core.staticDtorDecls ); // hoisted declarations go in front of the one just visited
+			} catch( SemanticErrorException &e ) {
+				errors.append( e ); // keep going so that errors from every declaration are gathered
+			} // try
+		} // for
+		if ( ! errors.isEmpty() ) {
+			throw errors;
+		} // if
+	}
+
+	const ast::StmtExpr * StmtExprResult::previsit( const ast::StmtExpr * stmtExpr ) {
+		// the result expression might get lost later on, so keep a pointer that traces back to it
+		assert( stmtExpr->result );
+		// a void statement expression has no result expression to record
+		if ( stmtExpr->result->isVoid() ) return stmtExpr;
+
+		auto mutExpr = mutate(stmtExpr);
+		const ast::CompoundStmt * body = mutExpr->stmts;
+		assert( ! body->kids.empty() );
+		// the final statement of the body is recorded as the result expression
+		mutExpr->resultExpr = body->kids.back().strict_as<ast::ExprStmt>();
+		return mutExpr;
+	}
+
+	ast::Stmt * SplitExpressions::postvisit( const ast::ExprStmt * stmt ) {
+		// Give each top-level ExprStmt a block of its own, so that argument and return temporaries
+		// are destroyed in the correct places.
+		const CodeLocation & where = stmt->location;
+		return new ast::CompoundStmt( where, { stmt } );
+	}
+
+	void SplitExpressions::previsit( const ast::TupleAssignExpr * ) {
+		// do not descend into a TupleAssignExpr: it is already broken up into multiple expressions
+		visit_children = false;
+	}
+
+	// Relatively simple structural comparison for expressions, needed to determine
+	// if two expressions are "the same" (used to determine if self assignment occurs)
+	struct StructuralChecker {
+		// Strip all casts and then dynamic_cast.
+		template<typename T>
+		static const T * cast( const ast::Expr * expr ) {
+			// this might be too permissive. It's possible that only particular casts are relevant.
+			while ( auto cast = dynamic_cast< const ast::CastExpr * >( expr ) ) {
+				expr = cast->arg;
+			}
+			return dynamic_cast< const T * >( expr ); // null when the cast-free expression is not a T
+		}
+
+		void previsit( const ast::Expr * ) {
+			// anything else does not qualify
+			result = false;
+		}
+
+		// ignore casts
+		void previsit( const ast::CastExpr * ) {}
+
+		void previsit( const ast::MemberExpr * memExpr ) {
+			if ( auto otherMember = cast< ast::MemberExpr >( other ) ) {
+				if ( otherMember->member == memExpr->member ) {
+					other = otherMember->aggregate; // same member: carry on comparing the aggregates
+					return;
+				}
+			}
+			result = false;
+		}
+
+		void previsit( const ast::VariableExpr * varExpr ) {
+			if ( auto otherVar = cast< ast::VariableExpr >( other ) ) {
+				if ( otherVar->var == varExpr->var ) { // both sides refer to the same declaration
+					return;
+				}
+			}
+			result = false;
+		}
+
+		void previsit( const ast::AddressExpr * ) {
+			if ( auto addrExpr = cast< ast::AddressExpr >( other ) ) {
+				other = addrExpr->arg; // both sides take an address: carry on comparing the operands
+				return;
+			}
+			result = false;
+		}
+
+		const ast::Expr * other; // the expression compared against; advanced by the previsit overloads above
+		bool result = true; // set to false by the first mismatch and never set back
+		StructuralChecker( const ast::Expr * other ) : other(other) {}
+	};
+
+	bool structurallySimilar( const ast::Expr * e1, const ast::Expr * e2 ) {
+		return ast::Pass<StructuralChecker>::read( e1, e2 ); // NOTE(review): presumably visits e1 with a checker built from e2 and yields its `result` - confirm
+	}
+
+	void SelfAssignChecker::previsit( const ast::ApplicationExpr * appExpr ) {
+		auto function = getFunction( appExpr ); // NOTE(review): dereferenced below without a null check - confirm getFunction cannot return null here
+		if ( function->name == "?=?" ) { // doesn't use isAssignment, because ?+=?, etc. should not count as self-assignment
+			if ( appExpr->args.size() == 2 ) {
+				// check for structural similarity (same variable use, ignore casts, etc. - but does not look too deeply, anything looking like a function is off limits)
+				if ( structurallySimilar( appExpr->args.front(), appExpr->args.back() ) ) {
+					SemanticWarning( appExpr->location, Warning::SelfAssignment, toCString( appExpr->args.front() ) );
+				}
+			}
+		}
+	}
+
+	const ast::Expr * InsertImplicitCalls::postvisit( const ast::ApplicationExpr * appExpr ) {
+		if ( auto function = appExpr->func.as<ast::VariableExpr>() ) {
+			if ( function->var->linkage.is_builtin ) {
+				// optimization: don't need to copy construct in order to call intrinsic functions
+				return appExpr;
+			} else if ( auto funcDecl = function->var.as<ast::DeclWithType>() ) {
+				auto ftype = dynamic_cast< const ast::FunctionType * >( GenPoly::getFunctionType( funcDecl->get_type() ) );
+				assertf( ftype, "Function call without function type: %s", toString( funcDecl ).c_str() );
+				if ( CodeGen::isConstructor( funcDecl->name ) && ftype->params.size() == 2 ) {
+					auto t1 = getPointerBase( ftype->params.front() );
+					auto t2 = ftype->params.back();
+					assert( t1 );
+
+					if ( ResolvExpr::typesCompatible( t1, t2 ) ) {
+						// optimization: don't need to copy construct in order to call a copy constructor
+						return appExpr;
+					} // if
+				} else if ( CodeGen::isDestructor( funcDecl->name ) ) {
+					// correctness: never copy construct arguments to a destructor
+					return appExpr;
+				} // if
+			} // if
+		} // if
+		CP_CTOR_PRINT( std::cerr << "InsertImplicitCalls: adding a wrapper " << appExpr << std::endl; )
+
+		// wrap each function call so that it is easy to identify nodes that have to be copy constructed
+		ast::ptr<ast::TypeSubstitution> tmp = appExpr->env;
+		auto mutExpr = mutate(appExpr);
+		mutExpr->env = nullptr;
+
+		auto expr = new ast::ImplicitCopyCtorExpr( appExpr->location, mutExpr );
+		// Move the type substitution to the new top-level, if it is attached to the appExpr.
+		// `tmp` holds the substitution while it is detached from the call (whose env was cleared above).
+		// The substitution is needed to obtain the type of temporary variables so that copy constructor
+		// calls can be resolved.
+		assert( typeSubs );
+		// NOTE(review): typeSubs is not declared in this function; presumably a member provided to the pass - confirm
+		expr->env = tmp;
+		// result: the wrapped call keeps a null env and only the wrapper node carries the substitution
+		// (both assignments are done explicitly rather than by swapping the two env fields)
+		return expr;
+	}
+
+	void ResolveCopyCtors::previsit(const ast::Expr * expr) { // for an expression carrying an env: start a working copy of it
+		if (expr->env) {
+			GuardValue(env); // NOTE(review): presumably restores the previous value once this node has been visited - confirm
+			GuardValue(envModified);
+			env = expr->env->clone();
+			envModified = false;
+		}
+	}
+
+	const ast::Expr * ResolveCopyCtors::postvisit(const ast::Expr * expr) {
+		// Settles what happens to the working copy of the environment held in `env`.
+		// Expressions without an environment are returned untouched.
+		if ( ! expr->env ) return expr;
+
+		if ( ! envModified ) {
+			// env was not mutated, skip and delete the shallow copy
+			delete env;
+			return expr;
+		}
+
+		// the working copy was modified, so it replaces the environment of the expression
+		auto mutExpr = mutate(expr);
+		mutExpr->env = env;
+		return mutExpr;
+	}
+
+
+	bool ResolveCopyCtors::skipCopyConstruct( const ast::Type * type ) { return ! isConstructable( type ); } // true for types that are not constructable
+
+	const ast::Expr * ResolveCopyCtors::makeCtorDtor( const std::string & fname, const ast::ObjectDecl * var, const ast::Expr * cpArg ) { // builds and resolves a call to `fname` on `var`; cpArg may be null
+		assert( var );
+		assert (var->isManaged());
+		assert (!cpArg || cpArg->isManaged());
+		// arrays are not copy constructed, so this should always be an ExprStmt
+		ast::ptr< ast::Stmt > stmt = genCtorDtor(var->location, fname, var, cpArg );
+		assertf( stmt, "ResolveCopyCtors: genCtorDtor returned nullptr: %s / %s / %s", fname.c_str(), toString( var ).c_str(), toString( cpArg ).c_str() );
+		auto exprStmt = stmt.strict_as<ast::ImplicitCtorDtorStmt>()->callStmt.strict_as<ast::ExprStmt>();
+		ast::ptr<ast::Expr> untyped = exprStmt->expr; // take ownership of expr
+		// the generated statement itself is left untouched; only its expression is used from here on
+
+		// resolve copy constructor
+		// should only be one alternative for copy ctor and dtor expressions, since all arguments are fixed
+		// (VariableExpr and already resolved expression)
+		CP_CTOR_PRINT( std::cerr << "ResolvingCtorDtor " << untyped << std::endl; )
+		ast::ptr<ast::Expr> resolved = ResolvExpr::findVoidExpression(untyped, symtab);
+		assert( resolved );
+		if ( resolved->env ) {
+			// Extract useful information and discard new environments. Keeping them causes problems in PolyMutator passes.
+			env->add( *resolved->env );
+			envModified = true;
+			// the resolved node must not be shared: mutate() has to hand back the very same node
+			auto mut = mutate(resolved.get());
+			assertf(mut == resolved.get(), "newly resolved expression must be unique");
+			mut->env = nullptr;
+		} // if
+		// `stmt` is a smart pointer, so no explicit cleanup is written here
+		if ( auto assign = resolved.as<ast::TupleAssignExpr>() ) {
+			// fix newly generated StmtExpr
+			previsit( assign->stmtExpr );
+		}
+		return resolved.release();
+	}
+
+	ast::ptr<ast::Expr> ResolveCopyCtors::copyConstructArg(
+		const ast::Expr * arg, const ast::ImplicitCopyCtorExpr * impCpCtorExpr, const ast::Type * formal )
+	{
+		static UniqueName tempNamer("_tmp_cp");
+		assert( env );
+		const CodeLocation loc = impCpCtorExpr->location;
+		// returns either the argument itself or an expression that copy constructs it into a temporary
+		assert( arg->result );
+		ast::ptr<ast::Type> result = arg->result;
+		if ( skipCopyConstruct( result ) ) return arg; // skip certain non-copyable types
+
+		// type may involve type variables, so apply type substitution to get temporary variable's actual type,
+		// since result type may not be substituted (e.g., if the type does not appear in the parameter list)
+		// Use applyFree so that types bound in function pointers are not substituted, e.g. in forall(dtype T) void (*)(T).
+
+		// xxx - this originally mutates arg->result in place. is it correct?
+		result = env->applyFree( result.get() ).node;
+		auto mutResult = result.get_and_mutate();
+		mutResult->set_const(false);
+
+		auto mutArg = mutate(arg);
+		mutArg->result = mutResult;
+
+		ast::ptr<ast::Expr> guard = mutArg; // holds a reference to the mutated argument; returned when no copy is needed
+
+		ast::ptr<ast::ObjectDecl> tmp = new ast::ObjectDecl({}, "__tmp", mutResult, nullptr ); // placeholder name, replaced below if the temporary is kept
+
+		// create and resolve copy constructor
+		CP_CTOR_PRINT( std::cerr << "makeCtorDtor for an argument" << std::endl; )
+		auto cpCtor = makeCtorDtor( "?{}", tmp, mutArg );
+
+		if ( auto appExpr = dynamic_cast< const ast::ApplicationExpr * >( cpCtor ) ) {
+			// if the chosen constructor is intrinsic, the copy is unnecessary, so
+			// don't create the temporary and don't call the copy constructor
+			auto function = appExpr->func.strict_as<ast::VariableExpr>();
+			if ( function->var->linkage == ast::Linkage::Intrinsic ) {
+				// arguments that need to be boxed need a temporary regardless of whether the copy constructor is intrinsic,
+				// so that the object isn't changed inside of the polymorphic function
+				if ( ! GenPoly::needsBoxing( formal, result, impCpCtorExpr->callExpr, env ) ) {
+					// xxx - should arg->result be mutated? see comment above.
+					return guard;
+				}
+			}
+		}
+
+		// set a unique name for the temporary once it's certain the call is necessary
+		auto mut = tmp.get_and_mutate();
+		assertf (mut == tmp, "newly created ObjectDecl must be unique");
+		mut->name = tempNamer.newName();
+
+		// replace argument to function call with temporary
+		stmtsToAddBefore.push_back( new ast::DeclStmt(loc, tmp ) );
+		arg = cpCtor;
+		return destructRet( tmp, arg );
+
+		// the destructor for the temporary is set up by destructRet, which is called in the return statement above
+	}
+
+	ast::Expr * ResolveCopyCtors::destructRet( const ast::ObjectDecl * ret, const ast::Expr * arg ) { // returns a comma expression ending in `ret`, or nullptr when arg is null and a handler was generated
+		// TODO: refactor code for generating cleanup attribute, since it's common and reused in ~3-4 places
+		// check for existing cleanup attribute before adding another(?)
+		// need to add __Destructor for _tmp_cp variables as well
+
+		assertf( ast::dtorStruct && ast::dtorStruct->members.size() == 2, "Destructor generation requires __Destructor definition." );
+		assertf( ast::dtorStructDestroy, "Destructor generation requires __destroy_Destructor." );
+
+		const CodeLocation loc = ret->location;
+
+		// generate a __Destructor for ret that calls the destructor
+		auto res = makeCtorDtor( "^?{}", ret );
+		auto dtor = mutate(res);
+
+		// if the chosen destructor is intrinsic, elide the generated dtor handler
+		if ( arg && isIntrinsicCallExpr( dtor ) ) {
+			return new ast::CommaExpr(loc, arg, new ast::VariableExpr(loc, ret ) );
+			// i.e. evaluate arg, then yield the variable; no handler object is declared
+		}
+
+		if ( ! dtor->env ) dtor->env = maybeClone( env );
+		auto dtorFunc = getDtorFunc( ret, new ast::ExprStmt(loc, dtor ), stmtsToAddBefore );
+
+		auto dtorStructType = new ast::StructInstType(ast::dtorStruct);
+
+		// what does this do???
+		dtorStructType->params.push_back( new ast::TypeExpr(loc, new ast::VoidType() ) );
+
+		// cast destructor pointer to void (*)(void *), to silence GCC incompatible pointer warnings
+		auto dtorFtype = new ast::FunctionType();
+		dtorFtype->params.push_back( new ast::PointerType(new ast::VoidType( ) ) );
+		auto dtorType = new ast::PointerType( dtorFtype );
+
+		static UniqueName namer( "_ret_dtor" );
+		auto retDtor = new ast::ObjectDecl(loc, namer.newName(), dtorStructType, new ast::ListInit(loc, { new ast::SingleInit(loc, ast::ConstantExpr::null(loc) ), new ast::SingleInit(loc, new ast::CastExpr( new ast::VariableExpr(loc, dtorFunc ), dtorType ) ) } ) );
+		retDtor->attributes.push_back( new ast::Attribute( "cleanup", { new ast::VariableExpr(loc, ast::dtorStructDestroy ) } ) );
+		stmtsToAddBefore.push_back( new ast::DeclStmt(loc, retDtor ) );
+
+		if ( arg ) {
+			auto member = new ast::MemberExpr(loc, ast::dtorStruct->members.front().strict_as<ast::DeclWithType>(), new ast::VariableExpr(loc, retDtor ) );
+			auto object = new ast::CastExpr( new ast::AddressExpr( new ast::VariableExpr(loc, ret ) ), new ast::PointerType(new ast::VoidType() ) );
+			ast::Expr * assign = createBitwiseAssignment( member, object ); // store the address of `ret` in the handler's first member
+			return new ast::CommaExpr(loc, new ast::CommaExpr(loc, arg, assign ), new ast::VariableExpr(loc, ret ) );
+		}
+		return nullptr;
+		// without `arg` only the handler declaration is emitted and there is no expression to return
+	}
+
+	const ast::Expr * ResolveCopyCtors::postvisit( const ast::ImplicitCopyCtorExpr *impCpCtorExpr ) {
+		CP_CTOR_PRINT( std::cerr << "ResolveCopyCtors: " << impCpCtorExpr << std::endl; )
+
+		ast::ApplicationExpr * appExpr = mutate(impCpCtorExpr->callExpr.get());
+		const ast::ObjectDecl * returnDecl = nullptr;
+		const CodeLocation loc = appExpr->location;
+
+		// take each argument and attempt to copy construct it.
+		auto ftype = GenPoly::getFunctionType( appExpr->func->result );
+		assert( ftype );
+		auto & params = ftype->params;
+		auto iter = params.begin();
+		for ( auto & arg : appExpr->args ) {
+			const ast::Type * formal = nullptr;
+			if ( iter != params.end() ) { // does not copy construct C-style variadic arguments
+				// the parameter list holds types directly, so the formal is the list element itself
+				formal = *iter++;
+			}
+
+			arg = copyConstructArg( arg, impCpCtorExpr, formal );
+		} // for
+
+		// each return value from the call needs to be connected with an ObjectDecl at the call site, which is
+		// initialized with the return value and is destructed later
+		// xxx - handle named return values?
+		const ast::Type * result = appExpr->result;
+		if ( ! result->isVoid() ) {
+			static UniqueName retNamer("_tmp_cp_ret");
+			// the temporary's type is the call's result type with the environment applied
+			auto subResult = env->apply( result ).node;
+			auto ret = new ast::ObjectDecl(loc, retNamer.newName(), subResult, nullptr );
+			auto mutType = mutate(ret->type.get());
+			mutType->set_const( false );
+			ret->type = mutType;
+			returnDecl = ret;
+			stmtsToAddBefore.push_back( new ast::DeclStmt(loc, ret ) );
+			CP_CTOR_PRINT( std::cerr << "makeCtorDtor for a return" << std::endl; )
+		} // if
+		CP_CTOR_PRINT( std::cerr << "after Resolving: " << impCpCtorExpr << std::endl; )
+		// ------------------------------------------------------
+
+		CP_CTOR_PRINT( std::cerr << "Coming out the back..." << impCpCtorExpr << std::endl; )
+
+		// detach fields from wrapper node so that it can be deleted without deleting too much
+
+		// xxx - actual env might be somewhere else, need to keep invariant
+
+		// deletion of wrapper should be handled by pass template now
+
+		// the call is expected to carry no env of its own at this point (asserted on the next line)
+		assert (appExpr->env == nullptr);
+		appExpr->env = impCpCtorExpr->env;
+		// the wrapper's env is copied onto the call here; the wrapper itself is neither
+		// modified nor deleted in this function, and it is not part of the value returned
+		// below (the result is built from appExpr)
+
+		if ( returnDecl ) {
+			ast::Expr * assign = createBitwiseAssignment( new ast::VariableExpr(loc, returnDecl ), appExpr );
+			if ( ! dynamic_cast< const ast::ReferenceType * >( result ) ) {
+				// destructing reference returns is bad because it can cause multiple destructor calls to the same object - the returned object is not a temporary
+				assign = destructRet( returnDecl, assign );
+				assert(assign);
+			} else {
+				assign = new ast::CommaExpr(loc, assign, new ast::VariableExpr(loc, returnDecl ) );
+			}
+			// move env from appExpr to retExpr
+			// (done by assignment below; appExpr->env is not reset in this function)
+			assign->env = appExpr->env;
+			// actual env is handled by common routine that replaces WithTypeSubstitution
+			return postvisit((const ast::Expr *)assign);
+		} else {
+			return postvisit((const ast::Expr *)appExpr);
+		} // if
+	}
+
+	const ast::StmtExpr * ResolveCopyCtors::previsit( const ast::StmtExpr * _stmtExpr ) {
+		// function call temporaries should be placed at statement-level, rather than nested inside of a new statement expression,
+		// since temporaries can be shared across sub-expressions, e.g.
+		//   [A, A] f();       // decl
+		//   g([A] x, [A] y);  // decl
+		//   g(f());           // call
+		// f is executed once, so the return temporary is shared across the tuple constructors for x and y.
+		// Explicitly mutating children instead of mutating the inner compound statement forces the temporaries to be added
+		// to the outer context, rather than inside of the statement expression.
+
+		// call the common routine that replaces WithTypeSubstitution
+		previsit((const ast::Expr *) _stmtExpr);
+
+		visit_children = false; // the statements are visited by hand below
+		const CodeLocation loc = _stmtExpr->location;
+
+		assert( env );
+
+		symtab.enterScope();
+		// visit all statements
+		auto stmtExpr = mutate(_stmtExpr);
+		auto mutStmts = mutate(stmtExpr->stmts.get());
+
+		auto & stmts = mutStmts->kids;
+		for ( auto & stmt : stmts ) {
+			stmt = stmt->accept( *visitor );
+		} // for
+		stmtExpr->stmts = mutStmts;
+		symtab.leaveScope();
+
+		assert( stmtExpr->result );
+		// a non-void statement expression gets a variable that holds its value
+		if ( ! stmtExpr->result->isVoid() ) {
+			static UniqueName retNamer("_tmp_stmtexpr_ret");
+
+			// the variable's type is the result type with the environment applied
+			auto result = env->apply( stmtExpr->result.get() ).node;
+			if ( ! InitTweak::isConstructable( result ) ) {
+				// results that are not constructable need no variable, constructor or destructor
+				return stmtExpr;
+			}
+			auto mutResult = result.get_and_mutate();
+			mutResult->set_const(false);
+
+			// create variable that will hold the result of the stmt expr
+			auto ret = new ast::ObjectDecl(loc, retNamer.newName(), mutResult, nullptr );
+			stmtsToAddBefore.push_back( new ast::DeclStmt(loc, ret ) );
+
+			assertf(
+				stmtExpr->resultExpr,
+				"Statement-Expression should have a resulting expression at %s:%d",
+				stmtExpr->location.filename.c_str(),
+				stmtExpr->location.first_line
+			);
+
+			const ast::ExprStmt * last = stmtExpr->resultExpr;
+			// xxx - if this is non-unique, need to copy while making resultExpr ref
+			assertf(last->unique(), "attempt to modify weakly shared statement");
+			auto mutLast = mutate(last);
+			// above assertion means in-place mutation is OK
+			try {
+				mutLast->expr = makeCtorDtor( "?{}", ret, mutLast->expr ); // construct `ret` from the result expression
+			} catch(...) {
+				std::cerr << "*CFA internal error: ";
+				std::cerr << "can't resolve implicit constructor";
+				std::cerr << " at " << stmtExpr->location.filename;
+				std::cerr << ":" << stmtExpr->location.first_line << std::endl;
+
+				abort();
+			}
+
+			// add destructors after current statement
+			stmtsToAddAfter.push_back( new ast::ExprStmt(loc, makeCtorDtor( "^?{}", ret ) ) );
+
+			// must have a non-empty body, otherwise it wouldn't have a result
+			assert( ! stmts.empty() );
+
+			// if there is a return decl, add a use as the last statement; will not have return decl on non-constructable returns
+			stmts.push_back( new ast::ExprStmt(loc, new ast::VariableExpr(loc, ret ) ) );
+		} // if
+
+		assert( stmtExpr->returnDecls.empty() );
+		assert( stmtExpr->dtors.empty() );
+
+		return stmtExpr;
+	}
+
+	// to prevent warnings ('_unq0' may be used uninitialized in this function),
+	// insert an appropriate zero initializer for UniqueExpr temporaries.
+	ast::Init * makeInit( const ast::Type * t ) {
+		// the initializer for a struct or union without members must be empty; anything else gets `{ 0 }`
+		bool isEmptyAggregate = false;
+		if ( auto structInst = dynamic_cast< const ast::StructInstType * >( t ) ) {
+			isEmptyAggregate = structInst->base->members.empty();
+		} else if ( auto unionInst = dynamic_cast< const ast::UnionInstType * >( t ) ) {
+			isEmptyAggregate = unionInst->base->members.empty();
+		}
+		if ( isEmptyAggregate ) return new ast::ListInit({}, {});
+		return new ast::ListInit( {}, { new ast::SingleInit( {}, ast::ConstantExpr::from_int({}, 0) ) } );
+	}
+
+	const ast::UniqueExpr * ResolveCopyCtors::previsit( const ast::UniqueExpr * unqExpr ) {
+		visit_children = false;
+		// xxx - hack to prevent double-handling of unique exprs, otherwise too many temporary variables and destructors are generated
+		static std::unordered_map< int, const ast::UniqueExpr * > unqMap;
+		auto mutExpr = mutate(unqExpr);
+		if ( ! unqMap.count( unqExpr->id ) ) {
+			// first expression seen with this id: visit its operand and record the variable that holds its value
+
+			auto impCpCtorExpr = mutExpr->expr.as<ast::ImplicitCopyCtorExpr>();
+			// from here on impCpCtorExpr is only tested, never dereferenced: the visit below replaces the operand
+
+			mutExpr->expr = mutExpr->expr->accept( *visitor );
+			// it should never be necessary to wrap a void-returning expression in a UniqueExpr - if this assumption changes, this needs to be rethought
+			assert( unqExpr->result );
+			if ( impCpCtorExpr ) {
+				auto comma = mutExpr->expr.strict_as<ast::CommaExpr>(); // read the visited operand from mutExpr: unqExpr still holds the old operand if mutate() copied the node
+				auto var = comma->arg2.strict_as<ast::VariableExpr>();
+				// note the variable used as the result from the call
+				mutExpr->var = var;
+			} else {
+				// expr isn't a call expr, so create a new temporary variable to use to hold the value of the unique expression
+				mutExpr->object = new ast::ObjectDecl( mutExpr->location, toString("_unq", mutExpr->id), mutExpr->result, makeInit( mutExpr->result ) );
+				mutExpr->var = new ast::VariableExpr( mutExpr->location, mutExpr->object );
+			}
+
+			// the operand was visited with *visitor, i.e. by this same pass, so there are no
+			// statement lists from a separate pass to merge here
+			unqMap[mutExpr->id] = mutExpr;
+		} else {
+			// take data from other UniqueExpr to ensure consistency
+			// (the operand recorded for the first expression with this id is shared)
+			mutExpr->expr = unqMap[mutExpr->id]->expr;
+			// the result type is taken from the shared operand as well
+			mutExpr->result = mutExpr->expr->result;
+		}
+		return mutExpr;
+	}
+
+	const ast::DeclWithType * FixInit::postvisit( const ast::ObjectDecl *_objDecl ) {
+		const CodeLocation loc = _objDecl->location;
+
+		// since this removes the init field from objDecl, it must occur after children are mutated (i.e. postvisit)
+		if ( ast::ptr<ast::ConstructorInit> ctorInit = _objDecl->init.as<ast::ConstructorInit>() ) {
+			auto objDecl = mutate(_objDecl);
+
+			// could this be non-unique?
+			if (objDecl != _objDecl) {
+				std::cerr << "FixInit: non-unique object decl " << objDecl->location << objDecl->name << std::endl;
+			}
+			// a decision should have been made by the resolver, so ctor and init are not both non-NULL
+			assert( ! ctorInit->ctor || ! ctorInit->init );
+			if ( const ast::Stmt * ctor = ctorInit->ctor ) {
+				if ( objDecl->storage.is_static ) {
+					// originally wanted to take advantage of gcc nested functions, but
+					// we get memory errors with this approach. To remedy this, the static
+					// variable is hoisted when the destructor needs to be called.
+					//
+					// generate:
+					// static T __objName_static_varN;
+					// void __objName_dtor_atexitN() {
+					//   __dtor__...;
+					// }
+					// int f(...) {
+					//   ...
+					//   static bool __objName_uninitialized = true;
+					//   if (__objName_uninitialized) {
+					//     __ctor(__objName);
+					//     __objName_uninitialized = false;
+					//     atexit(__objName_dtor_atexitN);
+					//   }
+					//   ...
+					// }
+
+					static UniqueName dtorCallerNamer( "_dtor_atexit" );
+
+					// static bool __objName_uninitialized = true
+					auto boolType = new ast::BasicType( ast::BasicType::Kind::Bool );
+					auto boolInitExpr = new ast::SingleInit(loc, ast::ConstantExpr::from_int(loc, 1 ) );
+					auto isUninitializedVar = new ast::ObjectDecl(loc, objDecl->mangleName + "_uninitialized", boolType, boolInitExpr, ast::Storage::Static, ast::Linkage::Cforall);
+					isUninitializedVar->fixUniqueId();
+
+					// __objName_uninitialized = false;
+					auto setTrue = new ast::UntypedExpr(loc, new ast::NameExpr(loc, "?=?" ) );
+					setTrue->args.push_back( new ast::VariableExpr(loc, isUninitializedVar ) );
+					setTrue->args.push_back( ast::ConstantExpr::from_int(loc, 0 ) );
+
+					// generate body of if
+					auto initStmts = new ast::CompoundStmt(loc);
+					auto & body = initStmts->kids;
+					body.push_back( ctor );
+					body.push_back( new ast::ExprStmt(loc, setTrue ) );
+
+					// put it all together
+					auto ifStmt = new ast::IfStmt(loc, new ast::VariableExpr(loc, isUninitializedVar ), initStmts, 0 );
+					stmtsToAddAfter.push_back( new ast::DeclStmt(loc, isUninitializedVar ) );
+					stmtsToAddAfter.push_back( ifStmt );
+
+					const ast::Stmt * dtor = ctorInit->dtor;
+
+					// these should be automatically managed once reassigned
+					// objDecl->set_init( nullptr );
+					// ctorInit->set_ctor( nullptr );
+					// ctorInit->set_dtor( nullptr );
+					if ( dtor ) {
+						// if the object has a non-trivial destructor, have to
+						// hoist it and the object into the global space and
+						// call the destructor function with atexit.
+
+						// Statement * dtorStmt = dtor->clone();
+
+						// void __objName_dtor_atexitN(...) {...}
+						ast::FunctionDecl * dtorCaller = new ast::FunctionDecl(loc, objDecl->mangleName + dtorCallerNamer.newName(), {}, {}, {}, new ast::CompoundStmt(loc, {dtor}), ast::Storage::Static, ast::Linkage::C );
+						dtorCaller->fixUniqueId();
+						// dtorCaller->stmts->push_back( dtor );
+
+						// atexit(dtor_atexit);
+						auto callAtexit = new ast::UntypedExpr(loc, new ast::NameExpr(loc, "atexit" ) );
+						callAtexit->args.push_back( new ast::VariableExpr(loc, dtorCaller ) );
+
+						body.push_back( new ast::ExprStmt(loc, callAtexit ) );
+
+						// hoist variable and dtor caller decls to list of decls that will be added into global scope
+						staticDtorDecls.push_back( objDecl );
+						staticDtorDecls.push_back( dtorCaller );
+
+						// need to rename object uniquely since it now appears
+						// at global scope and there could be multiple function-scoped
+						// static variables with the same name in different functions.
+						// Note: it isn't sufficient to modify only the mangleName, because
+						// then subsequent Indexer passes can choke on seeing the object's name
+						// if another object has the same name and type. An unfortunate side-effect
+						// of renaming the object is that subsequent NameExprs may fail to resolve,
+						// but there shouldn't be any remaining past this point.
+						static UniqueName staticNamer( "_static_var" );
+						objDecl->name = objDecl->name + staticNamer.newName();
+						objDecl->mangleName = Mangle::mangle( objDecl );
+
+						// xxx - temporary hack: need to return a declaration, but want to hoist the current object out of this scope
+						// create a new object which is never used
+						static UniqueName dummyNamer( "_dummy" );
+						auto dummy = new ast::ObjectDecl(loc, dummyNamer.newName(), new ast::PointerType(new ast::VoidType()), nullptr, ast::Storage::Static, ast::Linkage::Cforall, 0, { new ast::Attribute("unused") } );
+						// delete ctorInit;
+						return dummy;
+					} else {
+						objDecl->init = nullptr;
+						return objDecl;
+					}
+				} else {
+					auto implicit = strict_dynamic_cast< const ast::ImplicitCtorDtorStmt * > ( ctor );
+					auto ctorStmt = implicit->callStmt.as<ast::ExprStmt>();
+					const ast::ApplicationExpr * ctorCall = nullptr;
+					if ( ctorStmt && (ctorCall = isIntrinsicCallExpr( ctorStmt->expr )) && ctorCall->args.size() == 2 ) {
+						// clean up intrinsic copy constructor calls by making them into SingleInits
+						const ast::Expr * ctorArg = ctorCall->args.back();
+						// ctorCall should be gone afterwards
+						auto mutArg = mutate(ctorArg);
+						mutArg->env = ctorCall->env;
+						// std::swap( ctorArg->env, ctorCall->env );
+						objDecl->init = new ast::SingleInit(loc, mutArg );
+
+						// ctorCall->args.pop_back();
+					} else {
+						stmtsToAddAfter.push_back( ctor );
+						objDecl->init = nullptr;
+						// ctorInit->ctor = nullptr;
+					}
+
+					const ast::Stmt * dtor = ctorInit->dtor;
+					if ( dtor ) {
+						auto implicit = strict_dynamic_cast< const ast::ImplicitCtorDtorStmt * >( dtor );
+						const ast::Stmt * dtorStmt = implicit->callStmt;
+
+						// don't need to call intrinsic dtor, because it does nothing, but
+						// non-intrinsic dtors must be called
+						if ( ! isIntrinsicSingleArgCallStmt( dtorStmt ) ) {
+							// set dtor location to the object's location for error messages
+							auto dtorFunc = getDtorFunc( objDecl, dtorStmt, stmtsToAddBefore );
+							objDecl->attributes.push_back( new ast::Attribute( "cleanup", { new ast::VariableExpr(loc, dtorFunc ) } ) );
+							// ctorInit->dtor = nullptr;
+						} // if
+					}
+				} // if
+			} else if ( const ast::Init * init = ctorInit->init ) {
+				objDecl->init = init;
+				// ctorInit->init = nullptr;
+			} else {
+				// no constructor and no initializer, which is okay
+				objDecl->init = nullptr;
+			} // if
+			// delete ctorInit;
+			return objDecl;
+		} // if
+		return _objDecl;
+	}
+
+	void ObjDeclCollector::previsit( const ast::CompoundStmt * ) {
+		GuardValue( curVars ); // NOTE(review): presumably snapshots curVars so declarations collected inside this compound statement are dropped when its visit ends -- confirm against GuardValue
+	}
+
+	void ObjDeclCollector::previsit( const ast::DeclStmt * stmt ) {
+		// only object declarations are tracked; any other kind of declaration is ignored
+		auto declared = stmt->decl.as<ast::ObjectDecl>();
+		if ( ! declared ) return;
+		curVars.push_back( declared );
+	}
+
+	void LabelFinder::previsit( const ast::Stmt * stmt ) {
+		// snapshot the variables currently in scope under each label attached to this statement
+		for ( const auto & label : stmt->labels ) {
+			vars[label] = curVars;
+		}
+	}
+
+	void LabelFinder::previsit( const ast::CompoundStmt * stmt ) {
+		previsit( static_cast< const ast::Stmt * >( stmt ) ); // label bookkeeping shared by every statement kind
+		Parent::previsit( stmt ); // then the parent's own handling of compound statements
+	}
+
+	void LabelFinder::previsit( const ast::DeclStmt * stmt ) {
+		previsit( static_cast< const ast::Stmt * >( stmt ) ); // label bookkeeping shared by every statement kind
+		Parent::previsit( stmt ); // then the parent's own handling of declaration statements
+	}
+
+
+	void InsertDtors::previsit( const ast::FunctionDecl * funcDecl ) {
+		// each function needs to have its own set of labels: guard the enclosing map, then start from an empty one
+		GuardValue( labelVars );
+		labelVars.clear();
+		// LabelFinder does not recurse into FunctionDecl, so need to visit
+		// its children (the type and the body, each only when present) manually.
+		if (funcDecl->type) funcDecl->type->accept(finder);
+		// maybeAccept( funcDecl->type, finder );
+		if (funcDecl->stmts) funcDecl->stmts->accept(finder) ;
+
+		// all labels for this function have been collected, insert destructors as appropriate via implicit recursion.
+	}
+
+	// Handle break/continue/goto in the same manner as C++.  Basic idea: any objects that are in scope at the
+	// BranchStmt but not at the labelled (target) statement must be destructed.  If there are any objects in scope
+	// at the target location but not at the BranchStmt then those objects would be uninitialized so notify the user
+	// of the error.  See C++ Reference 6.6 Jump Statements for details.  Only the error check is done in this function.
+	void InsertDtors::handleGoto( const ast::BranchStmt * stmt ) {
+		// can't do anything for computed goto
+		if ( stmt->computedTarget ) return;
+
+		assertf( stmt->target.name != "", "BranchStmt missing a label: %s", toString( stmt ).c_str() );
+		// S_L = lvars = set of objects in scope at label definition
+		// S_G = curVars = set of objects in scope at goto statement
+		ObjectSet & lvars = labelVars[ stmt->target ]; // NOTE(review): if labelVars is a std::map, an unrecorded label yields a fresh empty set here -- confirm
+
+		DTOR_PRINT(
+			std::cerr << "at goto label: " << stmt->target.name << std::endl;
+			std::cerr << "S_G = " << printSet( curVars ) << std::endl;
+			std::cerr << "S_L = " << printSet( lvars ) << std::endl;
+		)
+
+
+		// std::set_difference requires that the inputs be sorted.
+		lvars.sort(); // lvars is a reference, so the set stored in labelVars is reordered in place
+		curVars.sort(); // likewise reorders the collector's own curVars
+
+		ObjectSet diff;
+		// S_L-S_G results in set of objects whose construction is skipped - it's an error if this set is non-empty
+		std::set_difference( lvars.begin(), lvars.end(), curVars.begin(), curVars.end(), std::inserter( diff, diff.begin() ) );
+		DTOR_PRINT(
+			std::cerr << "S_L-S_G = " << printSet( diff ) << std::endl;
+		)
+		if ( ! diff.empty() ) {
+			SemanticError( stmt, std::string("jump to label '") + stmt->target.name + "' crosses initialization of " + (*diff.begin())->name + " " ); // only the first skipped object is named in the message
+		} // if
+	}
+
+	void InsertDtors::previsit( const ast::BranchStmt * stmt ) {
+		switch( stmt->kind ) {
+		  case ast::BranchStmt::Continue:
+		  case ast::BranchStmt::Break:
+			// could optimize the break/continue case, because the S_L-S_G check is unnecessary (this set should
+			// always be empty), but it serves as a small sanity check.
+		  case ast::BranchStmt::Goto: // Continue and Break deliberately fall through to share the goto handling
+			handleGoto( stmt );
+			break;
+		  default:
+			assert( false ); // any other branch kind is not expected to reach this pass
+		} // switch
+	}
+
+	bool checkWarnings( const ast::FunctionDecl * funcDecl ) {
+		// member-handling warnings apply only to constructor/destructor definitions
+		// (a body must be present) whose linkage is not overrideable
+		const bool hasBody = funcDecl && funcDecl->stmts;
+		if ( ! hasBody ) return false;
+		return CodeGen::isCtorDtor( funcDecl->name ) && ! funcDecl->linkage.is_overrideable;
+	}
+
+	void GenStructMemberCalls::previsit( const ast::FunctionDecl * funcDecl ) {
+		GuardValue( function ); // all per-function pass state is guarded before being overwritten below
+		GuardValue( unhandled );
+		GuardValue( usedUninit );
+		GuardValue( thisParam );
+		GuardValue( isCtor );
+		GuardValue( structDecl );
+		errors = SemanticErrorException();  // clear previous errors
+
+		// need to start with fresh sets
+		unhandled.clear();
+		usedUninit.clear();
+
+		function = mutate(funcDecl); // mutable handle that postvisit edits and returns
+		// could this be non-unique? (mutate handing back a different node is only reported, not treated as an error)
+		if (function != funcDecl) {
+			std::cerr << "GenStructMemberCalls: non-unique FunctionDecl " << funcDecl->location << funcDecl->name << std::endl;
+		}
+
+		isCtor = CodeGen::isConstructor( function->name );
+		if ( checkWarnings( function ) ) {
+			// const ast::FunctionType * type = function->type;
+			// assert( ! type->params.empty() );
+			thisParam = function->params.front().strict_as<ast::ObjectDecl>(); // NOTE(review): assumes at least one parameter; the assert above is commented out
+			auto thisType = getPointerBase( thisParam->get_type() );
+			auto structType = dynamic_cast< const ast::StructInstType * >( thisType );
+			if ( structType ) { // functions whose 'this' is not a struct leave unhandled empty
+				structDecl = structType->base;
+				for ( auto & member : structDecl->members ) {
+					if ( auto field = member.as<ast::ObjectDecl>() ) {
+						// record all of the struct type's members that need to be constructed or
+						// destructed by the end of the function
+						unhandled.insert( field );
+					}
+				}
+			}
+		}
+	}
+
+	const ast::DeclWithType * GenStructMemberCalls::postvisit( const ast::FunctionDecl * funcDecl ) {
+		// remove the unhandled objects from usedUninit, because a call is inserted
+		// to handle them - only objects that are later constructed are used uninitialized.
+		std::map< const ast::DeclWithType *, CodeLocation > diff;
+		// need the comparator since usedUninit and unhandled have different types
+		struct comp_t {
+			typedef decltype(usedUninit)::value_type usedUninit_t;
+			typedef decltype(unhandled)::value_type unhandled_t;
+			bool operator()(usedUninit_t x, unhandled_t y) { return x.first < y; }
+			bool operator()(unhandled_t x, usedUninit_t y) { return x < y.first; }
+		} comp;
+		std::set_difference( usedUninit.begin(), usedUninit.end(), unhandled.begin(), unhandled.end(), std::inserter( diff, diff.begin() ), comp );
+		for ( auto p : diff ) {
+			auto member = p.first;
+			auto loc = p.second;
+			// xxx - make error message better by also tracking the location that the object is constructed at?
+			emit( loc, "in ", function->name, ", field ", member->name, " used before being constructed" );
+		}
+
+		const CodeLocation loc = funcDecl->location; // location given to every node generated below
+
+		if ( ! unhandled.empty() ) {
+			auto mutStmts = function->stmts.get_and_mutate();
+			// need to explicitly re-add function parameters to the indexer in order to resolve copy constructors
+			auto guard = makeFuncGuard( [this]() { symtab.enterScope(); }, [this]() { symtab.leaveScope(); } );
+			symtab.addFunction( function );
+
+			// need to iterate through members in reverse in order for
+			// ctor/dtor statements to come out in the right order (each one is pushed onto the front of the body)
+			for ( auto & member : reverseIterate( structDecl->members ) ) {
+				auto field = member.as<ast::ObjectDecl>();
+				// skip members that are not object declarations
+				if ( ! field ) continue;
+				// skip non-constructable members
+				if ( ! tryConstruct( field ) ) continue;
+				// skip handled members
+				if ( ! unhandled.count( field ) ) continue;
+
+				// insert and resolve default/copy constructor call for each field that's unhandled
+				// std::list< const ast::Stmt * > stmt;
+				ast::Expr * arg2 = nullptr; // stays null unless this function is a copy constructor
+				if ( function->name == "?{}" && isCopyFunction( function ) ) {
+					// if copy ctor, need to pass second-param-of-this-function.field
+					// std::list< DeclarationWithType * > & params = function->get_functionType()->get_parameters();
+					assert( function->params.size() == 2 );
+					arg2 = new ast::MemberExpr(funcDecl->location, field, new ast::VariableExpr(funcDecl->location, function->params.back() ) );
+				}
+				InitExpander_new srcParam( arg2 );
+				// cast away reference type and construct field.
+				ast::Expr * thisExpr = new ast::CastExpr(funcDecl->location, new ast::VariableExpr(funcDecl->location, thisParam ), thisParam->get_type()->stripReferences());
+				ast::Expr * memberDest = new ast::MemberExpr(funcDecl->location, field, thisExpr );
+				ast::ptr<ast::Stmt> callStmt = SymTab::genImplicitCall( srcParam, memberDest, loc, function->name, field, static_cast<SymTab::LoopDirection>(isCtor) );
+
+				if ( callStmt ) {
+					// auto & callStmt = stmt.front();
+
+					try {
+						callStmt = callStmt->accept( *visitor ); // run this pass over the generated call; a SemanticErrorException here is reported by the catch below
+						if ( isCtor ) {
+							mutStmts->push_front( callStmt );
+						} else { // TODO: don't generate destructor function/object for intrinsic calls
+							// destructor calls must take effect at the end of the function; this is done by declaring
+							// a __Destructor object carrying a cleanup attribute at the front of the body (see below)
+
+							// Optimization: do not need to call intrinsic destructors on members
+							if ( isIntrinsicSingleArgCallStmt( callStmt ) ) continue;
+
+							// __Destructor _dtor0 = { (void *)&b.a1, (void (*)(void *))_destroy_A };
+							std::list< ast::ptr<ast::Stmt> > stmtsToAdd;
+
+							static UniqueName memberDtorNamer = { "__memberDtor" };
+							assertf( Validate::dtorStruct, "builtin __Destructor not found." ); // NOTE(review): checks Validate::dtorStruct but the code below uses ast::dtorStruct -- confirm they are kept in sync
+							assertf( Validate::dtorStructDestroy, "builtin __destroy_Destructor not found." ); // NOTE(review): same question for ast::dtorStructDestroy
+
+							ast::Expr * thisExpr = new ast::CastExpr( new ast::AddressExpr( new ast::VariableExpr(loc, thisParam ) ), new ast::PointerType( new ast::VoidType(), ast::CV::Qualifiers() ) ); // shadows the outer thisExpr
+							ast::Expr * dtorExpr = new ast::VariableExpr(loc, getDtorFunc( thisParam, callStmt, stmtsToAdd ) );
+
+							// cast destructor pointer to void (*)(void *), to silence GCC incompatible pointer warnings
+							auto dtorFtype = new ast::FunctionType();
+							dtorFtype->params.emplace_back( new ast::PointerType( new ast::VoidType() ) );
+							auto dtorType = new ast::PointerType( dtorFtype );
+
+							auto destructor = new ast::ObjectDecl(loc, memberDtorNamer.newName(), new ast::StructInstType( ast::dtorStruct ), new ast::ListInit(loc, { new ast::SingleInit(loc, thisExpr ), new ast::SingleInit(loc, new ast::CastExpr( dtorExpr, dtorType ) ) } ) );
+							destructor->attributes.push_back( new ast::Attribute( "cleanup", { new ast::VariableExpr({}, ast::dtorStructDestroy ) } ) );
+							mutStmts->push_front( new ast::DeclStmt(loc, destructor ) );
+							mutStmts->kids.splice( mutStmts->kids.begin(), stmtsToAdd ); // statements produced by getDtorFunc end up ahead of the declaration that refers to them
+						}
+					} catch ( SemanticErrorException & error ) {
+						emit( funcDecl->location, "in ", function->name , ", field ", field->name, " not explicitly ", isCtor ? "constructed" : "destructed",  " and no ", isCtor ? "default constructor" : "destructor", " found" );
+					}
+				}
+			}
+			function->stmts = mutStmts;
+		}
+		if (! errors.isEmpty()) {
+			throw errors; // everything collected through emit() for this function is raised together
+		}
+		// return funcDecl;
+		return function;
+	}
+
+	/// True when expr, after peeling any enclosing casts, is a direct use of thisParam.
+	bool isThisExpression( const ast::Expr * expr, const ast::DeclWithType * thisParam ) {
+		// TODO: there are more complicated ways to pass 'this' to a constructor, e.g. &*, *&, etc.
+		// strip cast wrappers in a loop rather than by recursion
+		while ( auto castExpr = dynamic_cast< const ast::CastExpr * >( expr ) ) {
+			expr = castExpr->arg;
+		}
+		auto varExpr = dynamic_cast< const ast::VariableExpr * >( expr );
+		return varExpr && varExpr->var == thisParam;
+	}
+
+	/// Yields the MemberExpr when expr, ignoring enclosing casts, is a member access on the 'this' parameter; nullptr otherwise.
+	const ast::MemberExpr * isThisMemberExpr( const ast::Expr * expr, const ast::DeclWithType * thisParam ) {
+		// peel cast wrappers until some other kind of expression is reached
+		while ( auto castExpr = dynamic_cast< const ast::CastExpr * >( expr ) ) {
+			expr = castExpr->arg;
+		}
+		// what remains must be a member access whose aggregate is 'this'
+		auto memberExpr = dynamic_cast< const ast::MemberExpr * >( expr );
+		if ( memberExpr && isThisExpression( memberExpr->aggregate, thisParam ) ) return memberExpr;
+		return nullptr;
+	}
+
+	void GenStructMemberCalls::previsit( const ast::ApplicationExpr * appExpr ) {
+		if ( ! checkWarnings( function ) ) { // nothing to track outside a checked ctor/dtor body
+			visit_children = false;
+			return;
+		}
+
+		std::string fname = getFunctionName( appExpr );
+		if ( fname == function->name ) {
+			// call to same kind of function
+			const ast::Expr * firstParam = appExpr->args.front(); // NOTE(review): assumes the call has at least one argument
+
+			if ( isThisExpression( firstParam, thisParam ) ) {
+				// if calling another constructor on thisParam, assume that function handles
+				// all members - if it doesn't a warning will appear in that function.
+				unhandled.clear();
+			} else if ( auto memberExpr = isThisMemberExpr( firstParam, thisParam ) ) {
+				// if first parameter is a member expression on the this parameter,
+				// then remove the member from unhandled set.
+				if ( isThisExpression( memberExpr->aggregate, thisParam ) ) { // same test isThisMemberExpr already made before returning non-null
+					unhandled.erase( memberExpr->member );
+				}
+			}
+		}
+	}
+
+	void GenStructMemberCalls::previsit( const ast::MemberExpr * memberExpr ) {
+		if ( ! checkWarnings( function ) || ! isCtor ) { // use-before-construction is tracked in constructors only
+			visit_children = false;
+			return;
+		}
+
+		if ( isThisExpression( memberExpr->aggregate, thisParam ) ) {
+			if ( unhandled.count( memberExpr->member ) ) {
+				// remember where a not-yet-constructed member was used; postvisit( FunctionDecl ) decides whether to report it
+				usedUninit.insert( { memberExpr->member, memberExpr->location } );
+			}
+		}
+	}
+
+	template< typename Visitor, typename... Params >
+	void error( Visitor & v, CodeLocation loc, const Params &... params ) { // records an error on the visitor instead of throwing right away
+		SemanticErrorException err( loc, toString( params... ) ); // the message is the params run through toString
+		v.errors.append( err );
+	}
+
+	template< typename... Params >
+	void GenStructMemberCalls::emit( CodeLocation loc, const Params &... params ) {
+		// toggle warnings vs. errors here; as written every diagnostic is recorded as an error.
+		// warn( params... );
+		error( *this, loc, params... );
+	}
+
+	const ast::Expr * GenStructMemberCalls::postvisit( const ast::UntypedExpr * untypedExpr ) {
+		// replace each untyped expression with the result of resolving it against the current symbol table
+		// xxx - functions returning ast::ptr seems wrong...
+		auto res = ResolvExpr::findVoidExpression( untypedExpr, symtab );
+		return res.release(); // hand the resolved node back as a raw pointer
+		// return newExpr;
+	}
+
+	void InsertImplicitCalls::previsit(const ast::UniqueExpr * unqExpr) {
+		if (visitedIds.count(unqExpr->id)) visit_children = false; // id seen before: do not descend into this unique expression again
+		else visitedIds.insert(unqExpr->id); // first sighting: remember the id and let the visit continue
+	}
+
+	const ast::Expr * FixCtorExprs::postvisit( const ast::ConstructorExpr * ctorExpr ) {
+		const CodeLocation loc = ctorExpr->location; // every generated node reuses the constructor expression's location
+		static UniqueName tempNamer( "_tmp_ctor_expr" );
+		// xxx - is the size check necessary?
+		assert( ctorExpr->result && ctorExpr->result->size() == 1 );
+
+		// xxx - this can be TupleAssignExpr now. Need to properly handle this case.
+		// keep references to the call expression and the environment of ctorExpr
+		ast::ptr<ast::ApplicationExpr> callExpr = ctorExpr->callExpr.strict_as<ast::ApplicationExpr>();
+		ast::ptr<ast::TypeSubstitution> env = ctorExpr->env;
+		// ctorExpr->set_callExpr( nullptr );
+		// ctorExpr->set_env( nullptr );
+
+		// xxx - ideally we would reuse the temporary generated from the copy constructor passes from within firstArg if it exists and not generate a temporary if it's unnecessary.
+		auto tmp = new ast::ObjectDecl(loc, tempNamer.newName(), callExpr->args.front()->result ); // temporary has the type of the constructor's first argument
+		declsToAddBefore.push_back( tmp );
+		// delete ctorExpr;
+
+		// build assignment and replace constructor's first argument with new temporary
+		auto mutCallExpr = callExpr.get_and_mutate();
+		const ast::Expr * firstArg = callExpr->args.front();
+		ast::Expr * assign = new ast::UntypedExpr(loc, new ast::NameExpr(loc, "?=?" ), { new ast::AddressExpr(loc, new ast::VariableExpr(loc, tmp ) ), new ast::AddressExpr( firstArg ) } );
+		firstArg = new ast::VariableExpr(loc, tmp );
+		mutCallExpr->args.front() = firstArg;
+
+		// resolve assignment and dispose of new env
+		auto resolved = ResolvExpr::findVoidExpression( assign, symtab );
+		auto mut = resolved.get_and_mutate();
+		assertf(resolved.get() == mut, "newly resolved expression must be unique"); // get_and_mutate must not have produced a copy
+		mut->env = nullptr;
+
+		// for constructor expr:
+		//   T x;
+		//   x{};
+		// results in:
+		//   T x;
+		//   T & tmp;
+		//   &tmp = &x, ?{}(tmp), tmp
+		ast::CommaExpr * commaExpr = new ast::CommaExpr(loc, resolved, new ast::CommaExpr(loc, mutCallExpr, new ast::VariableExpr(loc, tmp ) ) );
+		commaExpr->env = env; // the original expression's environment moves onto the replacement
+		return commaExpr;
+	}
+} // namespace
+} // namespace InitTweak
+
+// Local Variables: //
+// tab-width: 4 //
+// mode: c++ //
+// compile-command: "make install" //
+// End: //
Index: src/InitTweak/GenInit.cc
===================================================================
--- src/InitTweak/GenInit.cc	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/InitTweak/GenInit.cc	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -283,4 +283,11 @@
 		assert( stmts.size() <= 1 );
 		return stmts.size() == 1 ? strict_dynamic_cast< ImplicitCtorDtorStmt * >( stmts.front() ) : nullptr;
+
+	}
+
+	ast::ptr<ast::Stmt> genCtorDtor (const CodeLocation & loc, const std::string & fname, const ast::ObjectDecl * objDecl, const ast::Expr * arg) { // new-AST overload: builds the implicit call to fname with objDecl as the target
+		assertf(objDecl, "genCtorDtor passed null objDecl");
+		InitExpander_new srcParam(arg); // arg (possibly null) is wrapped as the source of the call
+		return SymTab::genImplicitCall(srcParam, new ast::VariableExpr(loc, objDecl), loc, fname, objDecl); // the destination is a plain variable reference to objDecl
 	}
 
Index: src/InitTweak/GenInit.h
===================================================================
--- src/InitTweak/GenInit.h	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/InitTweak/GenInit.h	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -33,4 +33,5 @@
 	/// generates a single ctor/dtor statement using objDecl as the 'this' parameter and arg as the optional argument
 	ImplicitCtorDtorStmt * genCtorDtor( const std::string & fname, ObjectDecl * objDecl, Expression * arg = nullptr );
+	ast::ptr<ast::Stmt> genCtorDtor (const CodeLocation & loc, const std::string & fname, const ast::ObjectDecl * objDecl, const ast::Expr * arg = nullptr);
 
 	/// creates an appropriate ConstructorInit node which contains a constructor, destructor, and C-initializer
Index: src/InitTweak/InitTweak.cc
===================================================================
--- src/InitTweak/InitTweak.cc	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/InitTweak/InitTweak.cc	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -498,4 +498,11 @@
 	}
 
+	const ast::ObjectDecl * getParamThis(const ast::FunctionDecl * func) { // returns the first parameter of func as an ObjectDecl
+		assertf( func, "getParamThis: nullptr ftype" ); // message says "ftype" although the argument here is a FunctionDecl
+		auto & params = func->params;
+		assertf( ! params.empty(), "getParamThis: ftype with 0 parameters: %s", toString( func ).c_str());
+		return params.front().strict_as<ast::ObjectDecl>(); // NOTE(review): strict_as presumably fails hard if the first parameter is not an ObjectDecl -- confirm
+	}
+
 	bool tryConstruct( DeclarationWithType * dwt ) {
 		ObjectDecl * objDecl = dynamic_cast< ObjectDecl * >( dwt );
@@ -511,4 +518,18 @@
 	}
 
+	bool tryConstruct( const ast::DeclWithType * dwt ) {
+		// only object declarations are candidates for construction
+		const ast::ObjectDecl * obj = dynamic_cast< const ast::ObjectDecl * >( dwt );
+		if ( obj == nullptr ) return false;
+		// an initializer, when present, must itself be flagged maybeConstructed
+		const bool initAllows = ! obj->init || obj->init->maybeConstructed;
+		return initAllows && ! obj->storage.is_extern && isConstructable( obj->type );
+	}
+
+	bool isConstructable( const ast::Type * type ) {
+		if ( dynamic_cast< const ast::VarArgsType * >( type ) || dynamic_cast< const ast::ReferenceType * >( type ) ) return false;
+		return ! dynamic_cast< const ast::FunctionType * >( type ) && ! Tuples::isTtype( type );
+	}
+
 	struct CallFinder_old {
 		CallFinder_old( const std::list< std::string > & names ) : names( names ) {}
@@ -536,5 +557,5 @@
 
 	struct CallFinder_new final {
-		std::vector< ast::ptr< ast::Expr > > matches;
+		std::vector< const ast::Expr * > matches;
 		const std::vector< std::string > names;
 
@@ -558,5 +579,5 @@
 	}
 
-	std::vector< ast::ptr< ast::Expr > > collectCtorDtorCalls( const ast::Stmt * stmt ) {
+	std::vector< const ast::Expr * > collectCtorDtorCalls( const ast::Stmt * stmt ) {
 		ast::Pass< CallFinder_new > finder{ std::vector< std::string >{ "?{}", "^?{}" } };
 		maybe_accept( stmt, finder );
@@ -696,5 +717,5 @@
 		template <typename Predicate>
 		bool allofCtorDtor( const ast::Stmt * stmt, const Predicate & pred ) {
-			std::vector< ast::ptr< ast::Expr > > callExprs = collectCtorDtorCalls( stmt );
+			std::vector< const ast::Expr * > callExprs = collectCtorDtorCalls( stmt );
 			return std::all_of( callExprs.begin(), callExprs.end(), pred );
 		}
@@ -939,4 +960,31 @@
 	}
 
+	// Builds an application of a fake intrinsic "?=?" to dst and src. Other such codegen appears to use UntypedExpr
+	// without a fake function (revisit later); returning an untyped expression risks later passes resolving it.
+	ast::Expr * createBitwiseAssignment (const ast::Expr * dst, const ast::Expr * src) {
+		static ast::ptr<ast::FunctionDecl> assign = nullptr; // created on first use and shared by every later call
+		if (!assign) {
+			auto td = new ast::TypeDecl({}, "T", {}, nullptr, ast::TypeDecl::Dtype, true);
+			assign = new ast::FunctionDecl({}, "?=?", {}, 
+			{ new ast::ObjectDecl({}, "_dst", new ast::ReferenceType(new ast::TypeInstType("T", td))),
+			  new ast::ObjectDecl({}, "_src", new ast::TypeInstType("T", td))},
+			{ new ast::ObjectDecl({}, "_ret", new ast::TypeInstType("T", td))}, nullptr, {}, ast::Linkage::Intrinsic);
+		}
+		if (dst->result.as<ast::ReferenceType>()) { // reference-typed destination: wrap in one AddressExpr per level of reference depth
+			for (int depth = dst->result->referenceDepth(); depth > 0; depth--) {
+				dst = new ast::AddressExpr(dst);
+			}
+		}
+		else { // otherwise cast the destination to a reference to its own type
+			dst = new ast::CastExpr(dst, new ast::ReferenceType(dst->result, {}));
+		}
+		if (src->result.as<ast::ReferenceType>()) { // the source gets the AddressExpr wrapping only; no cast is added for it
+			for (int depth = src->result->referenceDepth(); depth > 0; depth--) {
+				src = new ast::AddressExpr(src);
+			}
+		}
+		return new ast::ApplicationExpr(dst->location, ast::VariableExpr::functionPointer(dst->location, assign), {dst, src});
+	}
+
 	struct ConstExprChecker : public WithShortCircuiting {
 		// most expressions are not const expr
@@ -1055,3 +1103,13 @@
 		return isCopyFunction( decl, "?{}" );
 	}
+
+	void addDataSectonAttribute( ObjectDecl * objDecl ) { // appends a "section" attribute whose argument is the string literal ".data#"
+		Type *strLitT = new PointerType( Type::Qualifiers( ),
+			new BasicType( Type::Qualifiers( ), BasicType::Char ) ); // pointer-to-char type given to the literal
+		std::list< Expression * > attr_params;
+		attr_params.push_back( 
+			new ConstantExpr( Constant( strLitT, "\".data#\"", std::nullopt ) ) ); // the quotes are part of the constant's text
+		objDecl->attributes.push_back(new Attribute("section", attr_params));
+	}
+
 }
Index: src/InitTweak/InitTweak.h
===================================================================
--- src/InitTweak/InitTweak.h	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/InitTweak/InitTweak.h	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -38,7 +38,10 @@
 	/// returns the first parameter of a constructor/destructor/assignment function
 	ObjectDecl * getParamThis( FunctionType * ftype );
+	const ast::ObjectDecl * getParamThis(const ast::FunctionDecl * func);
 
 	/// generate a bitwise assignment operation.
 	ApplicationExpr * createBitwiseAssignment( Expression * dst, Expression * src );
+
+	ast::Expr * createBitwiseAssignment( const ast::Expr * dst, const ast::Expr * src);
 
 	/// transform Initializer into an argument list that can be passed to a call expression
@@ -48,7 +51,9 @@
 	/// True if the resolver should try to construct dwt
 	bool tryConstruct( DeclarationWithType * dwt );
+	bool tryConstruct( const ast::DeclWithType * dwt );
 
 	/// True if the type can have a user-defined constructor
 	bool isConstructable( Type * t );
+	bool isConstructable( const ast::Type * t );
 
 	/// True if the Initializer contains designations
@@ -79,5 +84,5 @@
 	/// get all Ctor/Dtor call expressions from a Statement
 	void collectCtorDtorCalls( Statement * stmt, std::list< Expression * > & matches );
-	std::vector< ast::ptr< ast::Expr > > collectCtorDtorCalls( const ast::Stmt * stmt );
+	std::vector< const ast::Expr * > collectCtorDtorCalls( const ast::Stmt * stmt );
 
 	/// get the Ctor/Dtor call expression from a Statement that looks like a generated ctor/dtor call
@@ -102,4 +107,15 @@
 	bool isConstExpr( Expression * expr );
 	bool isConstExpr( Initializer * init );
+
+	/// Modifies objDecl to have:
+	///    __attribute__((section (".data#")))
+	/// which makes gcc put the declared variable in the data section,
+	/// which is helpful for global constants on newer gcc versions,
+	/// so that CFA's generated initialization won't segfault when writing it via a const cast.
+	/// The trailing # is an injected assembly comment, to suppress the "a" in
+	///    .section .data,"a"
+	///    .section .data#,"a"
+	/// to avoid assembler warning "ignoring changed section attributes for .data"
+	void addDataSectonAttribute( ObjectDecl * objDecl );
 
 	class InitExpander_old {
Index: src/InitTweak/module.mk
===================================================================
--- src/InitTweak/module.mk	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/InitTweak/module.mk	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -23,5 +23,6 @@
 	InitTweak/GenInit.h \
 	InitTweak/InitTweak.cc \
-	InitTweak/InitTweak.h
+	InitTweak/InitTweak.h \
+	InitTweak/FixInitNew.cpp
 
 SRCDEMANGLE += \
Index: src/Parser/ParseNode.h
===================================================================
--- src/Parser/ParseNode.h	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/Parser/ParseNode.h	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -10,6 +10,6 @@
 // Created On       : Sat May 16 13:28:16 2015
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Mon Jul  6 09:33:32 2020
-// Update Count     : 892
+// Last Modified On : Sat Oct 24 03:53:54 2020
+// Update Count     : 895
 //
 
@@ -205,6 +205,5 @@
 struct TypeData;
 
-class DeclarationNode : public ParseNode {
-  public:
+struct DeclarationNode : public ParseNode {
 	// These enumerations must harmonize with their names in DeclarationNode.cc.
 	enum BasicType { Void, Bool, Char, Int, Int128,
@@ -304,5 +303,5 @@
 	bool get_inLine() const { return inLine; }
 	DeclarationNode * set_inLine( bool inL ) { inLine = inL; return this; }
-  public:
+
 	DeclarationNode * get_last() { return (DeclarationNode *)ParseNode::get_last(); }
 
@@ -360,6 +359,5 @@
 //##############################################################################
 
-class StatementNode final : public ParseNode {
-  public:
+struct StatementNode final : public ParseNode {
 	StatementNode() { stmt = nullptr; }
 	StatementNode( Statement * stmt ) : stmt( stmt ) {}
@@ -382,5 +380,5 @@
 		os << stmt.get() << std::endl;
 	}
-  private:
+
 	std::unique_ptr<Statement> stmt;
 }; // StatementNode
@@ -426,4 +424,5 @@
 Statement * build_finally( StatementNode * stmt );
 Statement * build_compound( StatementNode * first );
+StatementNode * maybe_build_compound( StatementNode * first );
 Statement * build_asm( bool voltile, Expression * instruction, ExpressionNode * output = nullptr, ExpressionNode * input = nullptr, ExpressionNode * clobber = nullptr, LabelNode * gotolabels = nullptr );
 Statement * build_directive( std::string * directive );
Index: src/Parser/StatementNode.cc
===================================================================
--- src/Parser/StatementNode.cc	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/Parser/StatementNode.cc	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -10,6 +10,6 @@
 // Created On       : Sat May 16 14:59:41 2015
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Sat Aug  4 09:39:25 2018
-// Update Count     : 363
+// Last Modified On : Sat Oct 24 04:20:55 2020
+// Update Count     : 383
 //
 
@@ -345,4 +345,19 @@
 } // build_compound
 
+// A single statement in a control structure is always converted to a compound statement so subsequent generated code
+// can be placed within this compound statement. Otherwise, code generation has to constantly check for a single
+// statement and wrap it into a compound statement to insert additional code. Hence, all control structures have a
+// canonical form for code generation.
+StatementNode * maybe_build_compound( StatementNode * first ) {
+	// Optimization: if the control-structure statement is a compound statement, do not wrap it.
+	// e.g., if (...) {...} do not wrap the existing compound statement.
+	if ( ! dynamic_cast<CompoundStmt *>( first->stmt.get() ) ) { // unique_ptr
+		CompoundStmt * cs = new CompoundStmt();
+		buildMoveList( first, cs->get_kids() );
+		return new StatementNode( cs );
+	} // if
+	return first;
+} // maybe_build_compound
+
 Statement * build_asm( bool voltile, Expression * instruction, ExpressionNode * output, ExpressionNode * input, ExpressionNode * clobber, LabelNode * gotolabels ) {
 	std::list< Expression * > out, in;
Index: src/Parser/parser.yy
===================================================================
--- src/Parser/parser.yy	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/Parser/parser.yy	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -10,6 +10,6 @@
 // Created On       : Sat Sep  1 20:22:55 2001
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Fri Oct  9 18:09:09 2020
-// Update Count     : 4614
+// Last Modified On : Sat Oct 24 08:21:14 2020
+// Update Count     : 4624
 //
 
@@ -1080,7 +1080,7 @@
 	IF '(' if_control_expression ')' statement			%prec THEN
 		// explicitly deal with the shift/reduce conflict on if/else
-		{ $$ = new StatementNode( build_if( $3, $5, nullptr ) ); }
+		{ $$ = new StatementNode( build_if( $3, maybe_build_compound( $5 ), nullptr ) ); }
 	| IF '(' if_control_expression ')' statement ELSE statement
-		{ $$ = new StatementNode( build_if( $3, $5, $7 ) ); }
+		{ $$ = new StatementNode( build_if( $3, maybe_build_compound( $5 ), maybe_build_compound( $7 ) ) ); }
 	;
 
@@ -1130,5 +1130,5 @@
 
 case_clause:											// CFA
-	case_label_list statement					{ $$ = $1->append_last_case( new StatementNode( build_compound( $2 ) ) ); }
+	case_label_list statement					{ $$ = $1->append_last_case( maybe_build_compound( $2 ) ); }
 	;
 
@@ -1148,15 +1148,15 @@
 iteration_statement:
 	WHILE '(' push if_control_expression ')' statement pop
-		{ $$ = new StatementNode( build_while( $4, $6 ) ); }
+		{ $$ = new StatementNode( build_while( $4, maybe_build_compound( $6 ) ) ); }
 	| WHILE '(' ')' statement							// CFA => while ( 1 )
-		{ $$ = new StatementNode( build_while( new IfCtrl( nullptr, new ExpressionNode( build_constantInteger( *new string( "1" ) ) ) ), $4 ) ); }
+		{ $$ = new StatementNode( build_while( new IfCtrl( nullptr, new ExpressionNode( build_constantInteger( *new string( "1" ) ) ) ), maybe_build_compound( $4 ) ) ); }
 	| DO statement WHILE '(' comma_expression ')' ';'
-		{ $$ = new StatementNode( build_do_while( $5, $2 ) ); }
+		{ $$ = new StatementNode( build_do_while( $5, maybe_build_compound( $2 ) ) ); }
 	| DO statement WHILE '(' ')' ';'					// CFA => do while( 1 )
-		{ $$ = new StatementNode( build_do_while( new ExpressionNode( build_constantInteger( *new string( "1" ) ) ), $2 ) ); }
+		{ $$ = new StatementNode( build_do_while( new ExpressionNode( build_constantInteger( *new string( "1" ) ) ), maybe_build_compound( $2 ) ) ); }
 	| FOR '(' push for_control_expression_list ')' statement pop
-		{ $$ = new StatementNode( build_for( $4, $6 ) ); }
+		{ $$ = new StatementNode( build_for( $4, maybe_build_compound( $6 ) ) ); }
 	| FOR '(' ')' statement								// CFA => for ( ;; )
-		{ $$ = new StatementNode( build_for( new ForCtrl( (ExpressionNode * )nullptr, (ExpressionNode * )nullptr, (ExpressionNode * )nullptr ), $4 ) ); }
+		{ $$ = new StatementNode( build_for( new ForCtrl( (ExpressionNode * )nullptr, (ExpressionNode * )nullptr, (ExpressionNode * )nullptr ), maybe_build_compound( $4 ) ) ); }
 	;
 
@@ -1341,16 +1341,16 @@
 waitfor_clause:
 	when_clause_opt waitfor statement					%prec THEN
- 		{ $$ = build_waitfor( $2, $3, $1 ); }
+		{ $$ = build_waitfor( $2, maybe_build_compound( $3 ), $1 ); }
 	| when_clause_opt waitfor statement WOR waitfor_clause
- 		{ $$ = build_waitfor( $2, $3, $1, $5 ); }
+		{ $$ = build_waitfor( $2, maybe_build_compound( $3 ), $1, $5 ); }
 	| when_clause_opt timeout statement					%prec THEN
- 		{ $$ = build_waitfor_timeout( $2, $3, $1 ); }
+		{ $$ = build_waitfor_timeout( $2, maybe_build_compound( $3 ), $1 ); }
 	| when_clause_opt ELSE statement
- 		{ $$ = build_waitfor_timeout( nullptr, $3, $1 ); }
+		{ $$ = build_waitfor_timeout( nullptr, maybe_build_compound( $3 ), $1 ); }
 		// "else" must be conditional after timeout or timeout is never triggered (i.e., it is meaningless)
 	| when_clause_opt timeout statement WOR ELSE statement
 		{ SemanticError( yylloc, "else clause must be conditional after timeout or timeout never triggered." ); $$ = nullptr; }
 	| when_clause_opt timeout statement WOR when_clause ELSE statement
- 		{ $$ = build_waitfor_timeout( $2, $3, $1, $7, $5 ); }
+		{ $$ = build_waitfor_timeout( $2, maybe_build_compound( $3 ), $1, maybe_build_compound( $7 ), $5 ); }
 	;
 
Index: src/ResolvExpr/Resolver.cc
===================================================================
--- src/ResolvExpr/Resolver.cc	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/ResolvExpr/Resolver.cc	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -1105,5 +1105,7 @@
 		}
 
-		/// Establish post-resolver invariants for expressions
+		
+	} // anonymous namespace
+/// Establish post-resolver invariants for expressions
 		void finishExpr(
 			ast::ptr< ast::Expr > & expr, const ast::TypeEnvironment & env,
@@ -1118,6 +1120,4 @@
 			StripCasts_new::strip( expr );
 		}
-	} // anonymous namespace
-
 
 	ast::ptr< ast::Expr > resolveInVoidContext(
@@ -1139,6 +1139,5 @@
 	}
 
-	namespace {
-		/// Resolve `untyped` to the expression whose candidate is the best match for a `void`
+	/// Resolve `untyped` to the expression whose candidate is the best match for a `void`
 		/// context.
 		ast::ptr< ast::Expr > findVoidExpression(
@@ -1151,4 +1150,7 @@
 			return newExpr;
 		}
+
+	namespace {
+		
 
 		/// resolve `untyped` to the expression whose candidate satisfies `pred` with the
@@ -1162,5 +1164,5 @@
 			CandidateRef choice =
 				findUnfinishedKindExpression( untyped, symtab, kind, pred, mode );
-			finishExpr( choice->expr, choice->env, untyped->env );
+			ResolvExpr::finishExpr( choice->expr, choice->env, untyped->env );
 			return std::move( choice->expr );
 		}
@@ -1272,5 +1274,5 @@
 	// size_t Resolver_new::traceId = Stats::Heap::new_stacktrace_id("Resolver");
 
-	void resolve( std::list< ast::ptr< ast::Decl > >& translationUnit ) {
+	void resolve( ast::TranslationUnit& translationUnit ) {
 		ast::Pass< Resolver_new >::run( translationUnit );
 	}
Index: src/ResolvExpr/Resolver.h
===================================================================
--- src/ResolvExpr/Resolver.h	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/ResolvExpr/Resolver.h	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -35,4 +35,5 @@
 	class StmtExpr;
 	class SymbolTable;
+	class TranslationUnit;
 	class Type;
 	class TypeEnvironment;
@@ -55,5 +56,5 @@
 
 	/// Checks types and binds syntactic constructs to typed representations
-	void resolve( std::list< ast::ptr<ast::Decl> >& translationUnit );
+	void resolve( ast::TranslationUnit& translationUnit );
 	/// Searches expr and returns the first DeletedExpr found, otherwise nullptr
 	const ast::DeletedExpr * findDeletedExpr( const ast::Expr * expr );
@@ -66,9 +67,11 @@
 	ast::ptr< ast::Expr > findSingleExpression(
 		const ast::Expr * untyped, const ast::Type * type, const ast::SymbolTable & symtab );
+	ast::ptr< ast::Expr > findVoidExpression(
+		const ast::Expr * untyped, const ast::SymbolTable & symtab);
 	/// Resolves a constructor init expression
-	ast::ptr< ast::Init > resolveCtorInit( 
+	ast::ptr< ast::Init > resolveCtorInit(
 		const ast::ConstructorInit * ctorInit, const ast::SymbolTable & symtab );
 	/// Resolves a statement expression
-	ast::ptr< ast::Expr > resolveStmtExpr( 
+	ast::ptr< ast::Expr > resolveStmtExpr(
 		const ast::StmtExpr * stmtExpr, const ast::SymbolTable & symtab );
 } // namespace ResolvExpr
Index: src/SymTab/Autogen.cc
===================================================================
--- src/SymTab/Autogen.cc	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/SymTab/Autogen.cc	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -233,4 +233,15 @@
 	}
 
+	// return the type parameters of the base declaration of a struct or union instance type (shallow copy of the pointer list); empty for any other type
+	std::vector<ast::ptr<ast::TypeDecl>> getGenericParams (const ast::Type * t) {
+		if (auto structInst = dynamic_cast<const ast::StructInstType*>(t)) {
+			return structInst->base->params;
+		}
+		if (auto unionInst = dynamic_cast<const ast::UnionInstType*>(t)) {
+			return unionInst->base->params;
+		}
+		return {};
+	}
+
 	/// given type T, generate type of default ctor/dtor, i.e. function type void (*) (T *)
 	FunctionType * genDefaultType( Type * paramType, bool maybePolymorphic ) {
@@ -244,4 +255,12 @@
 		ftype->parameters.push_back( dstParam );
 		return ftype;
+	}
+
+	/// given type T, build a FunctionDecl named fname from a `_dst` object of type reference-to-T and an empty compound statement; if maybePolymorphic, T's generic parameters are passed along
+	ast::FunctionDecl * genDefaultFunc(const CodeLocation loc, const std::string fname, const ast::Type * paramType, bool maybePolymorphic) {
+		std::vector<ast::ptr<ast::TypeDecl>> typeParams;
+		if (maybePolymorphic) typeParams = getGenericParams(paramType);
+		auto dstParam = new ast::ObjectDecl(loc, "_dst", new ast::ReferenceType(paramType), nullptr, {}, ast::Linkage::Cforall);
+		return new ast::FunctionDecl(loc, fname, std::move(typeParams), {dstParam}, {}, new ast::CompoundStmt(loc));
 	}
 
Index: src/SymTab/Autogen.h
===================================================================
--- src/SymTab/Autogen.h	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/SymTab/Autogen.h	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -55,4 +55,6 @@
 	/// maybePolymorphic is true if the resulting FunctionType is allowed to be polymorphic
 	FunctionType * genDefaultType( Type * paramType, bool maybePolymorphic = true );
+
+	ast::FunctionDecl * genDefaultFunc(const CodeLocation loc, const std::string fname, const ast::Type * paramType, bool maybePolymorphic = true);
 
 	/// generate the type of a copy constructor for paramType.
Index: src/SynTree/Expression.h
===================================================================
--- src/SynTree/Expression.h	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/SynTree/Expression.h	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -163,4 +163,29 @@
 };
 
+/// VariableExpr represents an expression that simply refers to the value of a named variable.
+/// Does not take ownership of var.
+class VariableExpr : public Expression {
+  public:
+	DeclarationWithType * var;
+
+	VariableExpr();
+	VariableExpr( DeclarationWithType * var );
+	VariableExpr( const VariableExpr & other );
+	virtual ~VariableExpr();
+
+	bool get_lvalue() const final;
+
+	DeclarationWithType * get_var() const { return var; }
+	void set_var( DeclarationWithType * newValue ) { var = newValue; }
+
+	static VariableExpr * functionPointer( FunctionDecl * decl );
+
+	virtual VariableExpr * clone() const override { return new VariableExpr( * this ); }
+	virtual void accept( Visitor & v ) override { v.visit( this ); }
+	virtual void accept( Visitor & v ) const override { v.visit( this ); }
+	virtual Expression * acceptMutator( Mutator & m ) override { return m.mutate( this ); }
+	virtual void print( std::ostream & os, Indenter indent = {} ) const override;
+};
+
 // The following classes are used to represent expression types that cannot be converted into
 // function-call format.
@@ -329,29 +354,4 @@
 };
 
-/// VariableExpr represents an expression that simply refers to the value of a named variable.
-/// Does not take ownership of var.
-class VariableExpr : public Expression {
-  public:
-	DeclarationWithType * var;
-
-	VariableExpr();
-	VariableExpr( DeclarationWithType * var );
-	VariableExpr( const VariableExpr & other );
-	virtual ~VariableExpr();
-
-	bool get_lvalue() const final;
-
-	DeclarationWithType * get_var() const { return var; }
-	void set_var( DeclarationWithType * newValue ) { var = newValue; }
-
-	static VariableExpr * functionPointer( FunctionDecl * decl );
-
-	virtual VariableExpr * clone() const override { return new VariableExpr( * this ); }
-	virtual void accept( Visitor & v ) override { v.visit( this ); }
-	virtual void accept( Visitor & v ) const override { v.visit( this ); }
-	virtual Expression * acceptMutator( Mutator & m ) override { return m.mutate( this ); }
-	virtual void print( std::ostream & os, Indenter indent = {} ) const override;
-};
-
 /// ConstantExpr represents an expression that simply refers to the value of a constant
 class ConstantExpr : public Expression {
Index: src/main.cc
===================================================================
--- src/main.cc	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ src/main.cc	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -343,16 +343,23 @@
 			auto transUnit = convert( move( translationUnit ) );
 			PASS( "Resolve", ResolvExpr::resolve( transUnit ) );
+			if ( exprp ) {
+				translationUnit = convert( move( transUnit ) );
+				dump( translationUnit );
+				return EXIT_SUCCESS;
+			} // if
+
+			PASS( "Fix Init", InitTweak::fix(transUnit, buildingLibrary()));
 			translationUnit = convert( move( transUnit ) );
 		} else {
 			PASS( "Resolve", ResolvExpr::resolve( translationUnit ) );
+			if ( exprp ) {
+				dump( translationUnit );
+				return EXIT_SUCCESS;
+			}
+
+			PASS( "Fix Init", InitTweak::fix( translationUnit, buildingLibrary() ) );
 		}
 
-		if ( exprp ) {
-			dump( translationUnit );
-			return EXIT_SUCCESS;
-		} // if
-
 		// fix ObjectDecl - replaces ConstructorInit nodes
-		PASS( "Fix Init", InitTweak::fix( translationUnit, buildingLibrary() ) );
 		if ( ctorinitp ) {
 			dump ( translationUnit );
Index: tests/.expect/castError.oast.txt
===================================================================
--- tests/.expect/castError.oast.txt	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
+++ tests/.expect/castError.oast.txt	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -0,0 +1,70 @@
+castError.cfa:23:1 error: Cannot choose between 3 alternatives for expression
+Explicit Cast of:
+  Name: f
+... to:
+  char Alternatives are:
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: f: function
+        accepting unspecified arguments
+      ... returning nothing
+
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: f: double
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: f: signed int
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+
+castError.cfa:28:1 error: Cannot choose between 2 alternatives for expression
+Generated Cast of:
+  Comma Expression:
+    constant expression (3 3: signed int)
+    Name: v
+... to: nothing Alternatives are:
+Cost ( 0, 0, 2, 0, 0, 0, 0 ): Generated Cast of:
+      Comma Expression:
+        constant expression (3 3: signed int)
+        Variable Expression: v: unsigned char
+    ... to: nothing
+  (types:
+    void 
+  )
+  Environment:
+
+Cost ( 0, 0, 2, 0, 0, 0, 0 ): Generated Cast of:
+      Comma Expression:
+        constant expression (3 3: signed int)
+        Variable Expression: v: signed short int
+    ... to: nothing
+  (types:
+    void 
+  )
+  Environment:
+
+
+castError.cfa:30:1 error: No reasonable alternatives for expression Explicit Cast of:
+  Name: sint
+... to:
+  instance of struct S with body 1
+  ... with parameters
+    char
+
Index: tests/.expect/castError.txt
===================================================================
--- tests/.expect/castError.txt	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ 	(revision )
@@ -1,70 +1,0 @@
-castError.cfa:23:1 error: Cannot choose between 3 alternatives for expression
-Explicit Cast of:
-  Name: f
-... to:
-  char Alternatives are:
-Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
-      Variable Expression: f: function
-        accepting unspecified arguments
-      ... returning nothing
-
-    ... to:
-      char
-  (types:
-    char
-  )
-  Environment:
-
-Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
-      Variable Expression: f: double
-    ... to:
-      char
-  (types:
-    char
-  )
-  Environment:
-
-Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
-      Variable Expression: f: signed int
-    ... to:
-      char
-  (types:
-    char
-  )
-  Environment:
-
-
-castError.cfa:28:1 error: Cannot choose between 2 alternatives for expression
-Generated Cast of:
-  Comma Expression:
-    constant expression (3 3: signed int)
-    Name: v
-... to: nothing Alternatives are:
-Cost ( 0, 0, 2, 0, 0, 0, 0 ): Generated Cast of:
-      Comma Expression:
-        constant expression (3 3: signed int)
-        Variable Expression: v: unsigned char
-    ... to: nothing
-  (types:
-    void 
-  )
-  Environment:
-
-Cost ( 0, 0, 2, 0, 0, 0, 0 ): Generated Cast of:
-      Comma Expression:
-        constant expression (3 3: signed int)
-        Variable Expression: v: signed short int
-    ... to: nothing
-  (types:
-    void 
-  )
-  Environment:
-
-
-castError.cfa:30:1 error: No reasonable alternatives for expression Explicit Cast of:
-  Name: sint
-... to:
-  instance of struct S with body 1
-  ... with parameters
-    char
-
Index: tests/.expect/const-init.txt
===================================================================
--- tests/.expect/const-init.txt	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ tests/.expect/const-init.txt	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -1,1 +1,2 @@
-done
+almost done
+dtor
Index: tests/Makefile.am
===================================================================
--- tests/Makefile.am	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ tests/Makefile.am	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -53,5 +53,5 @@
 
 # adjust CC to current flags
-CC = LC_ALL=C $(if $(DISTCC_CFA_PATH),distcc $(DISTCC_CFA_PATH) ${ARCH_FLAGS},$(TARGET_CFA) ${DEBUG_FLAGS} ${ARCH_FLAGS})
+CC = LC_ALL=C $(if $(DISTCC_CFA_PATH),distcc $(DISTCC_CFA_PATH) ${ARCH_FLAGS} ${AST_FLAGS},$(TARGET_CFA) ${DEBUG_FLAGS} ${ARCH_FLAGS} ${AST_FLAGS})
 CFACC = $(CC)
 
@@ -60,5 +60,5 @@
 
 # adjusted CC but without the actual distcc call
-CFACCLOCAL = $(if $(DISTCC_CFA_PATH),$(DISTCC_CFA_PATH) ${ARCH_FLAGS},$(TARGET_CFA) ${DEBUG_FLAGS} ${ARCH_FLAGS})
+CFACCLOCAL = $(if $(DISTCC_CFA_PATH),$(DISTCC_CFA_PATH) ${ARCH_FLAGS} ${AST_FLAGS},$(TARGET_CFA) ${DEBUG_FLAGS} ${ARCH_FLAGS} ${AST_FLAGS})
 CFACCLINK = $(CFACCLOCAL) -quiet $(if $(test), 2> $(test), ) $($(shell echo "${@}_FLAGSLD" | sed 's/-\|\//_/g'))
 
Index: tests/alloc.cfa
===================================================================
--- tests/alloc.cfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ tests/alloc.cfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -342,5 +342,5 @@
 	for ( i; dim ) { printf( "%d %g, ", stp1[i].x, stp1[i].y ); }
 	printf( "\n" );
-	adelete( dim, stp, dim, stp1 );
+	adelete( stp, stp1 );
 
 	// extras
Index: tests/complex.cfa
===================================================================
--- tests/complex.cfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ tests/complex.cfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -14,5 +14,4 @@
 //
 
-#include <stdio.h>
 #include <complex.h>
 #ifdef __CFA__
Index: tests/config.py.in
===================================================================
--- tests/config.py.in	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ tests/config.py.in	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -9,2 +9,3 @@
 HOSTARCH = "@host_cpu@"
 DISTRIBUTE = @HAS_DISTCC@
+NEWAST = @DEFAULT_NEW_AST@
Index: tests/const-init.cfa
===================================================================
--- tests/const-init.cfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ tests/const-init.cfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -16,11 +16,11 @@
 /*
 
-This test shows non-crashing of generated code for constants with interesting initizers.
+These tests show non-crashing of generated code for constants with interesting initializers.
 The potential for these to crash is compiler dependent.
 
 There are two cases:
-1. static constants in one compilation unit (tested here)
+1. static constants in one compilation unit (tested here, in a few sub-cases)
 2. extern constants across compilation units (tested by libcfa being loadable, specifically
-   the constant declarations in libcfa/src/limits.cfa, which almost every test exercises,
+   the constant definitions in libcfa/src/limits.cfa, which almost every test exercises,
    including "hello;" but notably, the "limits" test does not exercise it because that test
    is compile-only)
@@ -37,12 +37,24 @@
 GCC-10 on Ubuntu 20.04    Has crashed      Has crashed
 
-For this test case to fail, with most other tests passing, would be a situation only ever
+For this test to fail, with most other tests passing, would be a situation only ever
 observed with GCC-8.
 
 */
 
+// initialized by generated function, called before main
 static const char foo = -1;
 
+struct thing{};
+void ^?{}( thing & ) { printf("dtor\n"); }
+
 int main() {
-    printf("done\n");
+    // foo is already initialized
+
+    // no dtor => stays a (static) local, initialized here
+    static const char bar = -1;
+
+    // has dtor => becomes a global, ctor called here, dtor called at exit
+    static const thing it;
+
+    printf("almost done\n");
 }
Index: tests/exceptions/cancel/.expect/thread.txt
===================================================================
--- tests/exceptions/cancel/.expect/thread.txt	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
+++ tests/exceptions/cancel/.expect/thread.txt	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -0,0 +1,2 @@
+0112345
+0112345
Index: tests/exceptions/cancel/coroutine.cfa
===================================================================
--- tests/exceptions/cancel/coroutine.cfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ tests/exceptions/cancel/coroutine.cfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -1,5 +1,4 @@
 // Try cancelling a coroutine.
 
-#include <stdio.h>
 #include <coroutine.hfa>
 #include <exception.hfa>
Index: tests/exceptions/cancel/thread.cfa
===================================================================
--- tests/exceptions/cancel/thread.cfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
+++ tests/exceptions/cancel/thread.cfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -0,0 +1,56 @@
+// Try cancelling a thread.
+
+#include <thread.hfa>
+#include <exception.hfa>
+
+TRIVIAL_EXCEPTION(internal_error);
+
+thread WillCancel {};
+
+const char * msg(ThreadCancelled(WillCancel) * this) {
+	return "ThreadCancelled(WillCancel)";
+}
+
+void main(WillCancel &) {
+	printf("1");
+	cancel_stack((internal_error){});
+	printf("!");
+}
+
+void explicit() {
+	try {
+		printf("0");
+		WillCancel cancel;
+		printf("1");
+		join(cancel);
+		printf("4");
+	} catchResume (ThreadCancelled(WillCancel) * error) {
+		printf("2");
+		if ((virtual internal_error *)error->the_exception) {
+			printf("3");
+		}
+	}
+	printf("5\n");
+}
+
+void implicit() {
+	try {
+		{
+			printf("0");
+			WillCancel cancel;
+			printf("1");
+		}
+		printf("4");
+	} catchResume (ThreadCancelled(WillCancel) * error) {
+		printf("2");
+		if ((virtual internal_error *)error->the_exception) {
+			printf("3");
+		}
+	}
+	printf("5\n");
+}
+
+int main(int argc, char * argv[]) {
+	explicit();
+	implicit();
+}
Index: tests/exceptions/conditional.cfa
===================================================================
--- tests/exceptions/conditional.cfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ tests/exceptions/conditional.cfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -5,5 +5,4 @@
 
 #include <exception.hfa>
-#include <stdio.h>
 
 VTABLE_DECLARATION(num_error)(
Index: tests/exceptions/except-io.hfa
===================================================================
--- tests/exceptions/except-io.hfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ tests/exceptions/except-io.hfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -1,5 +1,3 @@
 // Common tools for the exception tests.
-
-#include <stdio.h>
 
 // Echo when a destructor is run and an area/block is left.
Index: tests/exceptions/trash.cfa
===================================================================
--- tests/exceptions/trash.cfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ tests/exceptions/trash.cfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -2,5 +2,4 @@
 
 #include <exception.hfa>
-#include <stdio.h>
 
 TRIVIAL_EXCEPTION(yin);
Index: tests/global-monomorph.cfa
===================================================================
--- tests/global-monomorph.cfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ tests/global-monomorph.cfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -1,6 +1,3 @@
-// Crea
-
-#include <stdlib.hfa>
-#include <stdio.h>
+// Create monomorphic instances of polymorphic types at global scope.
 
 forall(dtype T)
Index: tests/meta/.expect/archVast.nast.arm64.txt
===================================================================
--- tests/meta/.expect/archVast.nast.arm64.txt	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
+++ tests/meta/.expect/archVast.nast.arm64.txt	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -0,0 +1,36 @@
+meta/archVast.cfa:28:1 error: Cannot choose between 3 alternatives for expression
+Explicit Cast of:
+  Name: FA64
+... to:
+  char Alternatives are:
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FA64: signed int
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FA64: function
+        accepting unspecified arguments
+      ... returning nothing
+
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FA64: double
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+
Index: tests/meta/.expect/archVast.nast.x64.txt
===================================================================
--- tests/meta/.expect/archVast.nast.x64.txt	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
+++ tests/meta/.expect/archVast.nast.x64.txt	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -0,0 +1,36 @@
+meta/archVast.cfa:28:1 error: Cannot choose between 3 alternatives for expression
+Explicit Cast of:
+  Name: FX64
+... to:
+  char Alternatives are:
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FX64: signed int
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FX64: function
+        accepting unspecified arguments
+      ... returning nothing
+
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FX64: double
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+
Index: tests/meta/.expect/archVast.nast.x86.txt
===================================================================
--- tests/meta/.expect/archVast.nast.x86.txt	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
+++ tests/meta/.expect/archVast.nast.x86.txt	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -0,0 +1,36 @@
+meta/archVast.cfa:28:1 error: Cannot choose between 3 alternatives for expression
+Explicit Cast of:
+  Name: FX86
+... to:
+  char Alternatives are:
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FX86: signed int
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FX86: function
+        accepting unspecified arguments
+      ... returning nothing
+
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FX86: double
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+
Index: tests/meta/.expect/archVast.oast.arm64.txt
===================================================================
--- tests/meta/.expect/archVast.oast.arm64.txt	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
+++ tests/meta/.expect/archVast.oast.arm64.txt	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -0,0 +1,36 @@
+meta/archVast.cfa:28:1 error: Cannot choose between 3 alternatives for expression
+Explicit Cast of:
+  Name: FA64
+... to:
+  char Alternatives are:
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FA64: function
+        accepting unspecified arguments
+      ... returning nothing
+
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FA64: double
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FA64: signed int
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+
Index: tests/meta/.expect/archVast.oast.x64.txt
===================================================================
--- tests/meta/.expect/archVast.oast.x64.txt	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
+++ tests/meta/.expect/archVast.oast.x64.txt	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -0,0 +1,36 @@
+meta/archVast.cfa:28:1 error: Cannot choose between 3 alternatives for expression
+Explicit Cast of:
+  Name: FX64
+... to:
+  char Alternatives are:
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FX64: function
+        accepting unspecified arguments
+      ... returning nothing
+
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FX64: double
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FX64: signed int
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+
Index: tests/meta/.expect/archVast.oast.x86.txt
===================================================================
--- tests/meta/.expect/archVast.oast.x86.txt	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
+++ tests/meta/.expect/archVast.oast.x86.txt	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -0,0 +1,36 @@
+meta/archVast.cfa:28:1 error: Cannot choose between 3 alternatives for expression
+Explicit Cast of:
+  Name: FX86
+... to:
+  char Alternatives are:
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FX86: function
+        accepting unspecified arguments
+      ... returning nothing
+
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FX86: double
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+Cost ( 1, 0, 0, 0, 0, 0, 0 ): Explicit Cast of:
+      Variable Expression: FX86: signed int
+    ... to:
+      char
+  (types:
+    char
+  )
+  Environment:
+
+
Index: tests/meta/archVast.cfa
===================================================================
--- tests/meta/archVast.cfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
+++ tests/meta/archVast.cfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -0,0 +1,32 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// archVast.cfa -- Check that all combinations of ast/arch are properly distinguished
+//
+// Author           : Thierry Delisle
+// Created On       : Tue Nov 03 15:04:53 2020
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#if defined( __i386 )	// NAME expands to an architecture-specific identifier
+#define NAME FX86
+#elif defined( __x86_64 )
+#define NAME FX64
+#elif defined( __aarch64__ )
+#define NAME FA64
+#endif
+
+int NAME;	// NAME is overloaded: a global variable here, a function and two locals below
+void NAME() {
+	int NAME;
+	double NAME;
+	(char)NAME;	// deliberately ambiguous: the int, double and function NAME all cast to char at equal cost; this is the error recorded in the .expect files
+	(int(*)())NAME;	// NOTE(review): presumably resolves to the function NAME without error -- the .expect files record only the error above
+}
+
+int main() {}
Index: tests/poly-d-cycle.cfa
===================================================================
--- tests/poly-d-cycle.cfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ tests/poly-d-cycle.cfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -1,5 +1,3 @@
 // Check that a cycle of polymorphic dtype structures can be instancated.
-
-#include <stdio.h>
 
 forall(dtype T)
Index: tests/poly-o-cycle.cfa
===================================================================
--- tests/poly-o-cycle.cfa	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ tests/poly-o-cycle.cfa	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -1,5 +1,3 @@
 // Check that a cycle of polymorphic otype structures can be instancated.
-
-#include <stdio.h>
 
 forall(otype T)
Index: tests/pybin/settings.py
===================================================================
--- tests/pybin/settings.py	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ tests/pybin/settings.py	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -85,5 +85,4 @@
 	def filter(self, tests):
 		return [test for test in tests if not test.arch or self.target == test.arch]
-		return True if not arch else self.target == arch
 
 	@staticmethod
@@ -98,4 +97,26 @@
 		self.path   = "debug" if value else "nodebug"
 
+class AST:
+	def __init__(self, ast):
+		if ast == "new":
+			self.target = ast
+			self.string = "New AST"
+			self.flags  = """AST_FLAGS=-XCFA,--new-ast"""
+		elif ast == "old":
+			self.target = ast
+			self.string = "Old AST"
+			self.flags  = """AST_FLAGS=-XCFA,--old-ast"""
+		elif ast == None:
+			self.target = "new" if config.NEWAST else "old"
+			self.string = "Default AST (%s)" % self.target
+			self.flags  = """AST_FLAGS="""
+		else:
+			print("""ERROR: Invalid ast configuration, must be "old", "new" or left unspecified, was %s""" % (value), file=sys.stderr)
+			sys.exit(1)
+
+	def filter(self, tests):
+
+		return [test for test in tests if not test.astv or self.target == test.astv]
+
 class Install:
 	def __init__(self, value):
@@ -120,14 +141,17 @@
 
 def init( options ):
+	global all_ast
 	global all_arch
 	global all_debug
 	global all_install
+	global ast
 	global arch
+	global debug
 	global archive
+	global install
+
 	global continue_
-	global debug
 	global dry_run
 	global generating
-	global install
 	global make
 	global output_width
@@ -135,4 +159,5 @@
 	global timeout2gdb
 
+	all_ast      = [AST(o)          for o in list(dict.fromkeys(options.ast    ))] if options.ast  else [AST(None)]
 	all_arch     = [Architecture(o) for o in list(dict.fromkeys(options.arch   ))] if options.arch else [Architecture(None)]
 	all_debug    = [Debug(o)        for o in list(dict.fromkeys(options.debug  ))]
Index: tests/pybin/test_run.py
===================================================================
--- tests/pybin/test_run.py	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ tests/pybin/test_run.py	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -11,7 +11,8 @@
 		self.path = ''
 		self.arch = ''
+		self.astv = ''
 
 	def toString(self):
-		return "{:25s} ({:5s} {:s})".format( self.name, self.arch if self.arch else "Any", self.target() )
+		return "{:25s} ({:5s} arch, {:s} ast: {:s})".format( self.name, self.arch if self.arch else "Any", self.astv if self.astv else "Any", self.target() )	# an unset arch or astv is displayed as "Any"
 
 	def prepare(self):
@@ -20,5 +21,7 @@
 
 	def expect(self):
-		return os.path.normpath( os.path.join(settings.SRCDIR  , self.path, ".expect", "%s%s.txt" % (self.name,'' if not self.arch else ".%s" % self.arch)) )
+		arch = '' if not self.arch else ".%s" % self.arch	# optional ".<arch>" suffix
+		astv = '' if not self.astv else ".nast" if self.astv == "new" else ".oast"	# optional infix: ".nast" for "new", ".oast" for any other non-empty astv
+		return os.path.normpath( os.path.join(settings.SRCDIR  , self.path, ".expect", "%s%s%s.txt" % (self.name,astv,arch)) )	# <name>[.nast|.oast][.<arch>].txt inside the test's .expect directory
 
 	def error_log(self):
@@ -45,9 +48,10 @@
 
 	@staticmethod
-	def new_target(target, arch):
+	def new_target(target, arch, astv):	# arch/astv: objects exposing .target, or a falsy value to leave that dimension unspecified
 		test = Test()
 		test.name = os.path.basename(target)
 		test.path = os.path.relpath (os.path.dirname(target), settings.SRCDIR)
 		test.arch = arch.target if arch else ''
+		test.astv = astv.target if astv else ''	# '' means the test is not tied to a specific AST version
 		return test
 
Index: tests/pybin/tools.py
===================================================================
--- tests/pybin/tools.py	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ tests/pybin/tools.py	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -181,4 +181,5 @@
 		'-s' if silent else None,
 		test_param,
+		settings.ast.flags,
 		settings.arch.flags,
 		settings.debug.flags,
Index: tests/test.py
===================================================================
--- tests/test.py	(revision 4b30e8ccaaf654cc01c74faafd80728628e173c5)
+++ tests/test.py	(revision c28ea4e2bf0768f205a4ffa626565b1cbfedfb95)
@@ -24,10 +24,20 @@
 
 	def match_test(path):
-		match = re.search("^%s\/([\w\/\-_]*).expect\/([\w\-_]+)(\.[\w\-_]+)?\.txt$" % settings.SRCDIR, path)
+		match = re.search("^%s\/([\w\/\-_]*).expect\/([\w\-_]+)(\.nast|\.oast)?(\.[\w\-_]+)?\.txt$" % settings.SRCDIR, path)
 		if match :
 			test = Test()
 			test.name = match.group(2)
 			test.path = match.group(1)
-			test.arch = match.group(3)[1:] if match.group(3) else None
+			test.arch = match.group(4)[1:] if match.group(4) else None
+
+			astv = match.group(3)[1:] if match.group(3) else None
+			if astv == 'oast':
+				test.astv = 'old'
+			elif astv == 'nast':
+				test.astv = 'new'
+			elif astv:
+				print('ERROR: "%s", expect file has astv but it is not "nast" or "oast"' % testname, file=sys.stderr)
+				sys.exit(1)
+
 			expected.append(test)
 
@@ -66,30 +76,34 @@
 	if options.regenerate_expected :
 		for testname in options.tests :
-			testname = canonical_path( testname )
+			testname = os.path.normpath( os.path.join(settings.SRCDIR, testname) )
+
 			# first check if this is a valid name to regenerate
 			if Test.valid_name(testname):
 				# this is a valid name, let's check if it already exists
 				found = [test for test in all_tests if canonical_path( test.target() ) == testname]
+				setup = itertools.product(settings.all_arch if options.arch else [None], settings.all_ast if options.ast else [None])
 				if not found:
-					# it's a new name, create it according to the name and specified architecture
-					if options.arch:
-						# user specified one or multiple architectures, assume the tests will have architecture specific results
-						tests.extend( [Test.new_target(testname, arch) for arch in settings.all_arch] )
-					else:
-						# user didn't specify an architecture, just create a cross platform test
-						tests.append( Test.new_target( testname, None ) )
+					# it's a new name, create it according to the name and specified architecture/ast version
+					tests.extend( [Test.new_target(testname, arch, ast) for arch, ast in setup] )
 				elif len(found) == 1 and not found[0].arch:
 					# we found a single test, the user better be wanting to create a cross platform test
 					if options.arch:
 						print('ERROR: "%s", test has no specified architecture but --arch was specified, ignoring it' % testname, file=sys.stderr)
+					elif options.ast:
+						print('ERROR: "%s", test has no specified ast version but --ast was specified, ignoring it' % testname, file=sys.stderr)
 					else:
 						tests.append( found[0] )
 				else:
 					# this test is already cross platform, just add a test for each platform the user asked
-					tests.extend( [Test.new_target(testname, arch) for arch in settings.all_arch] )
+					tests.extend( [Test.new_target(testname, arch, ast) for arch, ast in setup] )
 
 					# print a warning if it users didn't ask for a specific architecture
 					if not options.arch:
 						print('WARNING: "%s", test has architecture specific expected files but --arch was not specified, regenerating only for current host' % testname, file=sys.stderr)
+
+
+					# print a warning if the user didn't ask for a specific ast version
+					if not options.ast:
+						print('WARNING: "%s", test has ast version specific expected files but --ast was not specified, regenerating only for current ast' % testname, file=sys.stderr)
 
 			else :
@@ -112,7 +126,8 @@
 	# create a parser with the arguments for the tests script
 	parser = argparse.ArgumentParser(description='Script which runs cforall tests')
+	parser.add_argument('--ast', help='Test for specific ast', type=comma_separated(str), default=None)
+	parser.add_argument('--arch', help='Test for specific architecture', type=comma_separated(str), default=None)
 	parser.add_argument('--debug', help='Run all tests in debug or release', type=comma_separated(yes_no), default='yes')
 	parser.add_argument('--install', help='Run all tests based on installed binaries or tree binaries', type=comma_separated(yes_no), default='no')
-	parser.add_argument('--arch', help='Test for specific architecture', type=comma_separated(str), default=None)
 	parser.add_argument('--continue', help='When multiple specifications are passed (debug/install/arch), sets whether or not to continue if the last specification failed', type=yes_no, default='yes', dest='continue_')
 	parser.add_argument('--timeout', help='Maximum duration in seconds after a single test is considered to have timed out', type=int, default=120)
@@ -251,8 +266,8 @@
 	except KeyboardInterrupt:
 		return False, ""
-	except Exception as ex:
-		print("Unexpected error in worker thread running {}: {}".format(t.target(), ex), file=sys.stderr)
-		sys.stderr.flush()
-		return False, ""
+	# except Exception as ex:
+	# 	print("Unexpected error in worker thread running {}: {}".format(t.target(), ex), file=sys.stderr)
+	# 	sys.stderr.flush()
+	# 	return False, ""
 
 
@@ -362,5 +377,6 @@
 		# for each build configurations, run the test
 		with Timed() as total_dur:
-			for arch, debug, install in itertools.product(settings.all_arch, settings.all_debug, settings.all_install):
+			for ast, arch, debug, install in itertools.product(settings.all_ast, settings.all_arch, settings.all_debug, settings.all_install):
+				settings.ast     = ast
 				settings.arch    = arch
 				settings.debug   = debug
@@ -369,5 +385,6 @@
 				# filter out the tests for a different architecture
 				# tests are the same across debug/install
-				local_tests = settings.arch.filter( tests )
+				local_tests = settings.ast.filter( tests )
+				local_tests = settings.arch.filter( local_tests )
 				options.jobs, forceJobs = job_count( options, local_tests )
 				settings.update_make_cmd(forceJobs, options.jobs)
@@ -377,11 +394,15 @@
 
 				# print configuration
-				print('%s %i tests on %i cores (%s:%s)' % (
+				print('%s %i tests on %i cores (%s:%s - %s)' % (
 					'Regenerating' if settings.generating else 'Running',
 					len(local_tests),
 					options.jobs,
+					settings.ast.string,
 					settings.arch.string,
 					settings.debug.string
 				))
+				if not local_tests :
+					print('WARNING: No tests for this configuration')
+					continue
 
 				# otherwise run all tests and make sure to return the correct error code